Bug bash - Fix bugs in multi GPU benchmarks (#98)

* Add `sb deploy` command content.
* Fix inline if-expression syntax in playbook.
* Fix quote escape issue in bash command.
* Add custom env support in config.
* Update default config for multi GPU benchmarks.
* Update MANIFEST.in to include Jinja2 templates.
* Require a minimum jinja2 version.
* Fix occasional duplicate output in Ansible runner.
* Fix mixed color from Ansible and Python colorlog.
* Update according to comments.
* Change superbench.env from list to dict in config file.
Yifan Xiong 2021-06-23 18:16:43 +08:00 committed by GitHub
Parent 216c5b5c71
Commit c0c43b8f81
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 118 additions and 103 deletions

View File

@@ -1,5 +1,3 @@
include LICENSE README.md
recursive-include superbench *.py
recursive-include superbench *.yaml
global-exclude *.pyc
global-exclude __pycache__
recursive-include superbench *.py *.j2 *.yaml
global-exclude *.py[cod] __pycache__

View File

@@ -134,11 +134,13 @@ setup(
python_requires='>=3.6, <4',
install_requires=[
'ansible_base>=2.10.9;os_name=="posix"',
'ansible_runner>=1.4.7',
'ansible_runner>=2.0.0rc1',
'colorlog>=4.7.2',
'jinja2>=2.10.1',
'joblib>=1.0.1',
'knack>=0.7.2',
'omegaconf==2.0.6',
'pyyaml>=5.3',
],
extras_require={
'dev': ['pre-commit>=2.10.0'],

View File

@@ -227,8 +227,8 @@ def deploy_command_handler(
private_key=private_key,
)
SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
raise NotImplementedError
runner = SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
runner.deploy()
def run_command_handler(

View File

@@ -6,6 +6,7 @@
from pathlib import Path
from datetime import datetime
import yaml
from omegaconf import OmegaConf
@@ -38,4 +39,5 @@ def get_sb_config(config_file):
p = Path(config_file) if config_file else default_config_file
if not p.is_file():
return None
return OmegaConf.load(str(p))
with p.open() as fp:
return OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
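Note: the loader now goes through PyYAML's SafeLoader and OmegaConf.create rather than OmegaConf.load, so YAML anchors and '<<' merge keys (which the reworked default.yaml below relies on) are resolved into plain dicts before OmegaConf wraps them. A minimal sketch of the same pattern with a hypothetical inline config instead of a file:

```python
import yaml
from omegaconf import OmegaConf

raw = """
var:
  base: &base
    enable: true
benchmarks:
  kernel-launch:
    <<: *base
"""

# SafeLoader expands the anchor and merge key into plain dicts first,
# then OmegaConf wraps the result just like get_sb_config does.
config = OmegaConf.create(yaml.load(raw, Loader=yaml.SafeLoader))
print(config.benchmarks['kernel-launch'].enable)  # True -- the merge key was resolved by PyYAML
```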

View File

@@ -41,6 +41,7 @@ class SuperBenchLogger:
)
if color:
formatter = colorlog.ColoredFormatter(
'%(reset)s'
'[%(cyan)s%(asctime)s %(hostname)s:%(process)d%(reset)s]'
'[%(blue)s%(filename)s:%(lineno)s%(reset)s]'
'[%(log_color)s%(levelname)s%(reset)s] %(message)s'
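Note: the added leading '%(reset)s' clears any ANSI color that a previous writer (for example Ansible's own colored output) left active, so each Python log record starts from the default terminal color. A minimal, self-contained sketch of the same formatter pattern (logger name and message are made up, and the project's extra hostname field is omitted):

```python
import logging

import colorlog

handler = colorlog.StreamHandler()
handler.setFormatter(
    colorlog.ColoredFormatter(
        '%(reset)s'                                  # reset first, so stale colors cannot bleed in
        '[%(cyan)s%(asctime)s%(reset)s]'
        '[%(log_color)s%(levelname)s%(reset)s] %(message)s'
    )
)

logger = logging.getLogger('color-demo')
logger.addHandler(handler)
logger.setLevel(logging.INFO)
logger.info('this record starts from the default color, whatever was printed before it')
```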

View File

@@ -1,115 +1,95 @@
# SuperBench Config
superbench:
enable: null
benchmarks:
kernel-launch:
enable: true
gemm-flops:
enable: true
cudnn-function:
enable: true
cublas-function:
enable: true
matmul:
var:
default_local_mode: &default_local_mode
enable: true
modes:
- name: local
proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank}
parallel: no
frameworks:
- pytorch
gpt_models:
parallel: yes
default_pytorch_mode: &default_pytorch_mode
enable: true
modes:
- name: torch.distributed
proc_num: 8
node_num: all
node_num: 1
frameworks:
- pytorch
common_model_config: &common_model_config
duration: 0
num_warmup: 16
num_steps: 128
precision:
- float32
- float16
model_action:
- train
benchmarks:
kernel-launch:
<<: *default_local_mode
gemm-flops:
<<: *default_local_mode
cudnn-function:
<<: *default_local_mode
cublas-function:
<<: *default_local_mode
matmul:
<<: *default_local_mode
frameworks:
- pytorch
sharding-matmul:
<<: *default_pytorch_mode
computation-communication-overlap:
<<: *default_pytorch_mode
gpt_models:
<<: *default_pytorch_mode
models:
- gpt2-small
- gpt2-large
parameters:
duration: 0
num_warmup: 16
num_steps: 128
<<: *common_model_config
batch_size: 4
precision:
- float32
- float16
model_action:
- train
- inference
bert_models:
enable: true
modes:
- name: torch.distributed
proc_num: 8
node_num: all
frameworks:
- pytorch
<<: *default_pytorch_mode
models:
- bert-base
- bert-large
parameters:
duration: 0
num_warmup: 16
num_steps: 128
batch_size: 16
precision:
- float32
- float16
model_action:
- train
- inference
<<: *common_model_config
batch_size: 8
lstm_models:
enable: true
modes:
- name: torch.distributed
proc_num: 8
node_num: all
frameworks:
- pytorch
<<: *default_pytorch_mode
models:
- lstm
parameters:
duration: 0
num_warmup: 16
num_steps: 128
<<: *common_model_config
batch_size: 128
precision:
- float32
- float16
model_action:
- train
- inference
cnn_models:
enable: true
modes:
- name: torch.distributed
proc_num: 8
node_num: all
frameworks:
- pytorch
resnet_models:
<<: *default_pytorch_mode
models:
- resnet50
- resnet101
- resnet152
parameters:
<<: *common_model_config
batch_size: 128
densenet_models:
<<: *default_pytorch_mode
models:
- densenet169
- densenet201
parameters:
<<: *common_model_config
batch_size: 128
vgg_models:
<<: *default_pytorch_mode
models:
- vgg11
- vgg13
- vgg16
- vgg19
parameters:
duration: 0
num_warmup: 16
num_steps: 128
<<: *common_model_config
batch_size: 128
precision:
- float32
- float16
model_action:
- train
- inference
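Note: the rewritten config factors the repeated mode and parameter blocks into YAML anchors under var and pulls them back in with '<<' merge keys, so each benchmark section only states its overrides (for example batch_size). A minimal sketch of how a YAML loader resolves this, using an inline snippet rather than the real file:

```python
import yaml

snippet = """
var:
  common_model_config: &common_model_config
    duration: 0
    num_warmup: 16
    num_steps: 128
benchmarks:
  bert_models:
    parameters:
      <<: *common_model_config
      batch_size: 8
"""

config = yaml.safe_load(snippet)
print(config['benchmarks']['bert_models']['parameters'])
# {'duration': 0, 'num_warmup': 16, 'num_steps': 128, 'batch_size': 8}
```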

View File

@@ -22,10 +22,14 @@
container: sb-workspace
sb_nodes: '{{ hostvars.values() | map(attribute="ansible_hostname") | sort }}'
sb_env: |
# pytorch env
NNODES={{ sb_nodes | length }}
NODE_RANK={{ lookup('ansible.utils.index_of', sb_nodes, 'eq', ansible_hostname) }}
MASTER_ADDR={{ sb_nodes | first }}
MASTER_PORT=29500
OMP_NUM_THREADS=1
# config env
{{ env | default('') }}
tasks:
- name: Updating Config
copy:
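Note: sb_env stitches the torch.distributed variables (NNODES, NODE_RANK, MASTER_ADDR, ...) together with whatever the runner passes in through the new env extra-var. A rough sketch of the templating with plain Jinja2, where a Python list and list.index() stand in for Ansible's hostvars and the ansible.utils.index_of lookup (host names and env values are made up):

```python
from jinja2 import Template

sb_nodes = ['node-000', 'node-001']   # stand-in for the sorted ansible_hostname list
ansible_hostname = 'node-001'         # stand-in for the current host
env = 'KEY1=value1\nKEY2=value2'      # stand-in for the rendered superbench.env entries

sb_env = Template(
    'NNODES={{ sb_nodes | length }}\n'
    'NODE_RANK={{ sb_nodes.index(ansible_hostname) }}\n'
    'MASTER_ADDR={{ sb_nodes | first }}\n'
    'MASTER_PORT=29500\n'
    'OMP_NUM_THREADS=1\n'
    '{{ env | default("") }}'
).render(sb_nodes=sb_nodes, ansible_hostname=ansible_hostname, env=env)

print(sb_env)   # NNODES=2, NODE_RANK=1, MASTER_ADDR=node-000, then the custom entries
```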

View File

@@ -65,8 +65,8 @@
docker rm --force {{ container }} ||: && \
docker run -itd --name={{ container }} \
--privileged --net=host --ipc=host \
{{ '--gpus=all' if gpu_vendor == 'nvidia' }} \
{{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' }} \
{{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }} \
{{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' else '' }} \
-w /root -v {{ workspace }}:/root -v /mnt:/mnt \
{{ docker_image }} bash && \
docker exec {{ container }} bash -c \
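Note on the inline-if fix: in the Jinja2 2.x releases this commit targets (jinja2>=2.10.1), an inline if-expression with no else branch evaluates to the environment's undefined type when the condition is false, which trips strict undefined handling such as Ansible's; spelling out `else ''` makes the flag collapse to an empty string instead. A small, hypothetical reproduction with plain Jinja2 and StrictUndefined:

```python
from jinja2 import Environment, StrictUndefined

jenv = Environment(undefined=StrictUndefined)  # roughly mimics Ansible's error-on-undefined behavior

broken = jenv.from_string("{{ '--gpus=all' if gpu_vendor == 'nvidia' }}")
fixed = jenv.from_string("{{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }}")

print(repr(fixed.render(gpu_vendor='amd')))   # '' -- the flag simply disappears
# broken.render(gpu_vendor='amd')             # under Jinja2 2.x this raises UndefinedError:
#                                             # "... evaluated to false and no else section was defined."
```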

View File

@@ -54,6 +54,8 @@ class SuperBenchRunner():
InvalidConfigError: If input config is invalid.
"""
# TODO: add validation and defaulting
if not self._sb_config.superbench.env:
self._sb_config.superbench.env = {}
for name in self._sb_benchmarks:
if not self._sb_benchmarks[name].modes:
self._sb_benchmarks[name].modes = []
@@ -141,7 +143,13 @@ class SuperBenchRunner():
logger.info('Checking SuperBench environment.')
OmegaConf.save(config=self._sb_config, f=str(Path(self._output_dir) / 'sb.config.yaml'))
self._ansible_client.run(
self._ansible_client.get_playbook_config('check_env.yaml', extravars={'output_dir': self._output_dir})
self._ansible_client.get_playbook_config(
'check_env.yaml',
extravars={
'output_dir': self._output_dir,
'env': '\n'.join(f'{k}={v}' for k, v in self._sb_config.superbench.env.items()),
}
)
)
def _run_proc(self, benchmark_name, mode, vars):
@@ -161,7 +169,7 @@ class SuperBenchRunner():
self._ansible_client.get_shell_config(
(
'docker exec sb-workspace bash -c '
'"set -o allexport && source sb.env && set +o allexport && {command}"'
"'set -o allexport && source sb.env && set +o allexport && {command}'"
).format(command=self.__get_mode_command(benchmark_name, mode), )
),
sudo=True
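Note: two of the fixes land here; the superbench.env dict is flattened into KEY=VALUE lines and handed to the check_env playbook as the env extra-var, and the bash -c payload for docker exec is now wrapped in single quotes (presumably so that quoting inside the generated command no longer needs escaping). A small sketch of both pieces, with made-up env entries and a placeholder command:

```python
# Hypothetical superbench.env entries; the real values come from the user's config file.
env = {'KEY1': 'value1', 'KEY2': 'value2'}
env_lines = '\n'.join(f'{k}={v}' for k, v in env.items())   # what the 'env' extra-var carries
print(env_lines)

command = 'echo placeholder-benchmark-command'              # stands in for __get_mode_command(...)
shell = (
    'docker exec sb-workspace bash -c '
    "'set -o allexport && source sb.env && set +o allexport && {command}'"
).format(command=command)
print(shell)
```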

View File

@@ -53,7 +53,11 @@ class SuperBenchCLIScenarioTest(ScenarioTest):
def test_sb_deploy(self):
"""Test sb deploy."""
self.cmd('sb deploy --host-list localhost', expect_failure=True)
self.cmd('sb deploy --host-list localhost', checks=[NoneCheck()])
def test_sb_deploy_no_host(self):
"""Test sb deploy, no host_file or host_list provided, should fail."""
self.cmd('sb deploy', expect_failure=True)
def test_sb_exec(self):
"""Test sb exec."""

View File

@@ -10,6 +10,7 @@ import tempfile
from pathlib import Path
from unittest import mock
import yaml
from omegaconf import OmegaConf
from superbench.executor import SuperBenchExecutor
@@ -24,7 +25,8 @@ class ExecutorTestCase(unittest.TestCase):
def setUp(self):
"""Hook method for setting up the test fixture before exercising it."""
default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
self.default_config = OmegaConf.load(str(default_config_file))
with default_config_file.open() as fp:
self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
self.output_dir = tempfile.mkdtemp()
self.executor = SuperBenchExecutor(self.default_config, self.output_dir)
@@ -61,20 +63,32 @@
def test_get_arguments(self):
"""Test benchmarks arguments."""
expected_matmul_args = ''
self.assertEqual(
self.executor._SuperBenchExecutor__get_arguments(
self.default_config.superbench.benchmarks.matmul.parameters
), expected_matmul_args
)
expected_bert_models_args = \
'--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 ' \
'--precision float32 float16 --model_action train inference'
self.assertEqual(
self.executor._SuperBenchExecutor__get_arguments(
self.default_config.superbench.benchmarks.bert_models.parameters
), expected_bert_models_args
)
test_cases = [
{
'parameters': None,
'expected_args': '',
},
{
'parameters': {
'duration': 0,
'num_warmup': 16,
'num_steps': 128,
'batch_size': 16,
'precision': ['float32', 'float16'],
'model_action': ['train', 'inference'],
},
'expected_args': (
'--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 '
'--precision float32 float16 --model_action train inference'
),
},
]
for test_case in test_cases:
with self.subTest(msg='Testing with case', test_case=test_case):
self.assertEqual(
self.executor._SuperBenchExecutor__get_arguments(test_case['parameters']),
test_case['expected_args']
)
def test_create_benchmark_dir(self):
"""Test __create_benchmark_dir."""

View File

@@ -9,6 +9,7 @@ import tempfile
from pathlib import Path
from unittest import mock
import yaml
from omegaconf import OmegaConf
from superbench.runner import SuperBenchRunner
@@ -19,7 +20,8 @@ class RunnerTestCase(unittest.TestCase):
def setUp(self):
"""Hook method for setting up the test fixture before exercising it."""
default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
self.default_config = OmegaConf.load(str(default_config_file))
with default_config_file.open() as fp:
self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
self.output_dir = tempfile.mkdtemp()
self.runner = SuperBenchRunner(self.default_config, None, None, self.output_dir)