Bug bash - Fix bugs in multi GPU benchmarks (#98)

* Add `sb deploy` command content.
* Fix inline if-expression syntax in playbook.
* Fix quote escape issue in bash command.
* Add custom env in config.
* Update default config for multi GPU benchmarks.
* Update MANIFEST.in to include jinja2 template.
* Require jinja2 minimum version.
* Fix occasional duplicate output in Ansible runner.
* Fix mixed color from Ansible and Python colorlog.
* Update according to comments.
* Change superbench.env from list to dict in config file.
This commit is contained in:
Yifan Xiong 2021-06-23 18:16:43 +08:00 коммит произвёл GitHub
Родитель 216c5b5c71
Коммит c0c43b8f81
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
12 изменённых файлов: 118 добавлений и 103 удалений

Просмотреть файл

@@ -1,5 +1,3 @@
include LICENSE README.md include LICENSE README.md
recursive-include superbench *.py recursive-include superbench *.py *.j2 *.yaml
recursive-include superbench *.yaml global-exclude *.py[cod] __pycache__
global-exclude *.pyc
global-exclude __pycache__

Просмотреть файл

@@ -134,11 +134,13 @@ setup(
python_requires='>=3.6, <4', python_requires='>=3.6, <4',
install_requires=[ install_requires=[
'ansible_base>=2.10.9;os_name=="posix"', 'ansible_base>=2.10.9;os_name=="posix"',
'ansible_runner>=1.4.7', 'ansible_runner>=2.0.0rc1',
'colorlog>=4.7.2', 'colorlog>=4.7.2',
'jinja2>=2.10.1',
'joblib>=1.0.1', 'joblib>=1.0.1',
'knack>=0.7.2', 'knack>=0.7.2',
'omegaconf==2.0.6', 'omegaconf==2.0.6',
'pyyaml>=5.3',
], ],
extras_require={ extras_require={
'dev': ['pre-commit>=2.10.0'], 'dev': ['pre-commit>=2.10.0'],

Просмотреть файл

@@ -227,8 +227,8 @@ def deploy_command_handler(
private_key=private_key, private_key=private_key,
) )
SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir) runner = SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
raise NotImplementedError runner.deploy()
def run_command_handler( def run_command_handler(

Просмотреть файл

@@ -6,6 +6,7 @@
from pathlib import Path from pathlib import Path
from datetime import datetime from datetime import datetime
import yaml
from omegaconf import OmegaConf from omegaconf import OmegaConf
@@ -38,4 +39,5 @@ def get_sb_config(config_file):
p = Path(config_file) if config_file else default_config_file p = Path(config_file) if config_file else default_config_file
if not p.is_file(): if not p.is_file():
return None return None
return OmegaConf.load(str(p)) with p.open() as fp:
return OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))

Просмотреть файл

@@ -41,6 +41,7 @@ class SuperBenchLogger:
) )
if color: if color:
formatter = colorlog.ColoredFormatter( formatter = colorlog.ColoredFormatter(
'%(reset)s'
'[%(cyan)s%(asctime)s %(hostname)s:%(process)d%(reset)s]' '[%(cyan)s%(asctime)s %(hostname)s:%(process)d%(reset)s]'
'[%(blue)s%(filename)s:%(lineno)s%(reset)s]' '[%(blue)s%(filename)s:%(lineno)s%(reset)s]'
'[%(log_color)s%(levelname)s%(reset)s] %(message)s' '[%(log_color)s%(levelname)s%(reset)s] %(message)s'

Просмотреть файл

@@ -1,115 +1,95 @@
# SuperBench Config # SuperBench Config
superbench: superbench:
enable: null enable: null
benchmarks: var:
kernel-launch: default_local_mode: &default_local_mode
enable: true
gemm-flops:
enable: true
cudnn-function:
enable: true
cublas-function:
enable: true
matmul:
enable: true enable: true
modes: modes:
- name: local - name: local
proc_num: 8 proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank} prefix: CUDA_VISIBLE_DEVICES={proc_rank}
parallel: no parallel: yes
frameworks: default_pytorch_mode: &default_pytorch_mode
- pytorch
gpt_models:
enable: true enable: true
modes: modes:
- name: torch.distributed - name: torch.distributed
proc_num: 8 proc_num: 8
node_num: all node_num: 1
frameworks: frameworks:
- pytorch - pytorch
common_model_config: &common_model_config
duration: 0
num_warmup: 16
num_steps: 128
precision:
- float32
- float16
model_action:
- train
benchmarks:
kernel-launch:
<<: *default_local_mode
gemm-flops:
<<: *default_local_mode
cudnn-function:
<<: *default_local_mode
cublas-function:
<<: *default_local_mode
matmul:
<<: *default_local_mode
frameworks:
- pytorch
sharding-matmul:
<<: *default_pytorch_mode
computation-communication-overlap:
<<: *default_pytorch_mode
gpt_models:
<<: *default_pytorch_mode
models: models:
- gpt2-small - gpt2-small
- gpt2-large - gpt2-large
parameters: parameters:
duration: 0 <<: *common_model_config
num_warmup: 16
num_steps: 128
batch_size: 4 batch_size: 4
precision:
- float32
- float16
model_action:
- train
- inference
bert_models: bert_models:
enable: true <<: *default_pytorch_mode
modes:
- name: torch.distributed
proc_num: 8
node_num: all
frameworks:
- pytorch
models: models:
- bert-base - bert-base
- bert-large - bert-large
parameters: parameters:
duration: 0 <<: *common_model_config
num_warmup: 16 batch_size: 8
num_steps: 128
batch_size: 16
precision:
- float32
- float16
model_action:
- train
- inference
lstm_models: lstm_models:
enable: true <<: *default_pytorch_mode
modes:
- name: torch.distributed
proc_num: 8
node_num: all
frameworks:
- pytorch
models: models:
- lstm - lstm
parameters: parameters:
duration: 0 <<: *common_model_config
num_warmup: 16
num_steps: 128
batch_size: 128 batch_size: 128
precision: resnet_models:
- float32 <<: *default_pytorch_mode
- float16
model_action:
- train
- inference
cnn_models:
enable: true
modes:
- name: torch.distributed
proc_num: 8
node_num: all
frameworks:
- pytorch
models: models:
- resnet50 - resnet50
- resnet101 - resnet101
- resnet152 - resnet152
parameters:
<<: *common_model_config
batch_size: 128
densenet_models:
<<: *default_pytorch_mode
models:
- densenet169 - densenet169
- densenet201 - densenet201
parameters:
<<: *common_model_config
batch_size: 128
vgg_models:
<<: *default_pytorch_mode
models:
- vgg11 - vgg11
- vgg13 - vgg13
- vgg16 - vgg16
- vgg19 - vgg19
parameters: parameters:
duration: 0 <<: *common_model_config
num_warmup: 16
num_steps: 128
batch_size: 128 batch_size: 128
precision:
- float32
- float16
model_action:
- train
- inference

Просмотреть файл

@@ -22,10 +22,14 @@
container: sb-workspace container: sb-workspace
sb_nodes: '{{ hostvars.values() | map(attribute="ansible_hostname") | sort }}' sb_nodes: '{{ hostvars.values() | map(attribute="ansible_hostname") | sort }}'
sb_env: | sb_env: |
# pytorch env
NNODES={{ sb_nodes | length }} NNODES={{ sb_nodes | length }}
NODE_RANK={{ lookup('ansible.utils.index_of', sb_nodes, 'eq', ansible_hostname) }} NODE_RANK={{ lookup('ansible.utils.index_of', sb_nodes, 'eq', ansible_hostname) }}
MASTER_ADDR={{ sb_nodes | first }} MASTER_ADDR={{ sb_nodes | first }}
MASTER_PORT=29500 MASTER_PORT=29500
OMP_NUM_THREADS=1
# config env
{{ env | default('') }}
tasks: tasks:
- name: Updating Config - name: Updating Config
copy: copy:

Просмотреть файл

@@ -65,8 +65,8 @@
docker rm --force {{ container }} ||: && \ docker rm --force {{ container }} ||: && \
docker run -itd --name={{ container }} \ docker run -itd --name={{ container }} \
--privileged --net=host --ipc=host \ --privileged --net=host --ipc=host \
{{ '--gpus=all' if gpu_vendor == 'nvidia' }} \ {{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' }} \ {{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' else '' }} \
-w /root -v {{ workspace }}:/root -v /mnt:/mnt \ -w /root -v {{ workspace }}:/root -v /mnt:/mnt \
{{ docker_image }} bash && \ {{ docker_image }} bash && \
docker exec {{ container }} bash -c \ docker exec {{ container }} bash -c \

Просмотреть файл

@@ -54,6 +54,8 @@ class SuperBenchRunner():
InvalidConfigError: If input config is invalid. InvalidConfigError: If input config is invalid.
""" """
# TODO: add validation and defaulting # TODO: add validation and defaulting
if not self._sb_config.superbench.env:
self._sb_config.superbench.env = {}
for name in self._sb_benchmarks: for name in self._sb_benchmarks:
if not self._sb_benchmarks[name].modes: if not self._sb_benchmarks[name].modes:
self._sb_benchmarks[name].modes = [] self._sb_benchmarks[name].modes = []
@@ -141,7 +143,13 @@ class SuperBenchRunner():
logger.info('Checking SuperBench environment.') logger.info('Checking SuperBench environment.')
OmegaConf.save(config=self._sb_config, f=str(Path(self._output_dir) / 'sb.config.yaml')) OmegaConf.save(config=self._sb_config, f=str(Path(self._output_dir) / 'sb.config.yaml'))
self._ansible_client.run( self._ansible_client.run(
self._ansible_client.get_playbook_config('check_env.yaml', extravars={'output_dir': self._output_dir}) self._ansible_client.get_playbook_config(
'check_env.yaml',
extravars={
'output_dir': self._output_dir,
'env': '\n'.join(f'{k}={v}' for k, v in self._sb_config.superbench.env.items()),
}
)
) )
def _run_proc(self, benchmark_name, mode, vars): def _run_proc(self, benchmark_name, mode, vars):
@@ -161,7 +169,7 @@ class SuperBenchRunner():
self._ansible_client.get_shell_config( self._ansible_client.get_shell_config(
( (
'docker exec sb-workspace bash -c ' 'docker exec sb-workspace bash -c '
'"set -o allexport && source sb.env && set +o allexport && {command}"' "'set -o allexport && source sb.env && set +o allexport && {command}'"
).format(command=self.__get_mode_command(benchmark_name, mode), ) ).format(command=self.__get_mode_command(benchmark_name, mode), )
), ),
sudo=True sudo=True

Просмотреть файл

@@ -53,7 +53,11 @@ class SuperBenchCLIScenarioTest(ScenarioTest):
def test_sb_deploy(self): def test_sb_deploy(self):
"""Test sb deploy.""" """Test sb deploy."""
self.cmd('sb deploy --host-list localhost', expect_failure=True) self.cmd('sb deploy --host-list localhost', checks=[NoneCheck()])
def test_sb_deploy_no_host(self):
"""Test sb deploy, no host_file or host_list provided, should fail."""
self.cmd('sb deploy', expect_failure=True)
def test_sb_exec(self): def test_sb_exec(self):
"""Test sb exec.""" """Test sb exec."""

Просмотреть файл

@@ -10,6 +10,7 @@ import tempfile
from pathlib import Path from pathlib import Path
from unittest import mock from unittest import mock
import yaml
from omegaconf import OmegaConf from omegaconf import OmegaConf
from superbench.executor import SuperBenchExecutor from superbench.executor import SuperBenchExecutor
@@ -24,7 +25,8 @@ class ExecutorTestCase(unittest.TestCase):
def setUp(self): def setUp(self):
"""Hook method for setting up the test fixture before exercising it.""" """Hook method for setting up the test fixture before exercising it."""
default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml' default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
self.default_config = OmegaConf.load(str(default_config_file)) with default_config_file.open() as fp:
self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
self.output_dir = tempfile.mkdtemp() self.output_dir = tempfile.mkdtemp()
self.executor = SuperBenchExecutor(self.default_config, self.output_dir) self.executor = SuperBenchExecutor(self.default_config, self.output_dir)
@@ -61,19 +63,31 @@
def test_get_arguments(self): def test_get_arguments(self):
"""Test benchmarks arguments.""" """Test benchmarks arguments."""
expected_matmul_args = '' test_cases = [
self.assertEqual( {
self.executor._SuperBenchExecutor__get_arguments( 'parameters': None,
self.default_config.superbench.benchmarks.matmul.parameters 'expected_args': '',
), expected_matmul_args },
) {
expected_bert_models_args = \ 'parameters': {
'--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 ' \ 'duration': 0,
'num_warmup': 16,
'num_steps': 128,
'batch_size': 16,
'precision': ['float32', 'float16'],
'model_action': ['train', 'inference'],
},
'expected_args': (
'--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 '
'--precision float32 float16 --model_action train inference' '--precision float32 float16 --model_action train inference'
),
},
]
for test_case in test_cases:
with self.subTest(msg='Testing with case', test_case=test_case):
self.assertEqual( self.assertEqual(
self.executor._SuperBenchExecutor__get_arguments( self.executor._SuperBenchExecutor__get_arguments(test_case['parameters']),
self.default_config.superbench.benchmarks.bert_models.parameters test_case['expected_args']
), expected_bert_models_args
) )
def test_create_benchmark_dir(self): def test_create_benchmark_dir(self):

Просмотреть файл

@@ -9,6 +9,7 @@ import tempfile
from pathlib import Path from pathlib import Path
from unittest import mock from unittest import mock
import yaml
from omegaconf import OmegaConf from omegaconf import OmegaConf
from superbench.runner import SuperBenchRunner from superbench.runner import SuperBenchRunner
@@ -19,7 +20,8 @@ class RunnerTestCase(unittest.TestCase):
def setUp(self): def setUp(self):
"""Hook method for setting up the test fixture before exercising it.""" """Hook method for setting up the test fixture before exercising it."""
default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml' default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
self.default_config = OmegaConf.load(str(default_config_file)) with default_config_file.open() as fp:
self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
self.output_dir = tempfile.mkdtemp() self.output_dir = tempfile.mkdtemp()
self.runner = SuperBenchRunner(self.default_config, None, None, self.output_dir) self.runner = SuperBenchRunner(self.default_config, None, None, self.output_dir)