Bug bash - Fix bugs in multi GPU benchmarks (#98)

* Add `sb deploy` command content.
* Fix inline if-expression syntax in playbook.
* Fix quote escape issue in bash command.
* Add custom env in config.
* Update default config for multi GPU benchmarks.
* Update MANIFEST.in to include jinja2 template.
* Require jinja2 minimum version.
* Fix occasional duplicate output in Ansible runner.
* Fix mixed color from Ansible and Python colorlog.
* Update according to comments.
* Change superbench.env from list to dict in config file.
Parent: 216c5b5c71
Commit: c0c43b8f81
MANIFEST.in: package the Jinja2 templates alongside the Python and YAML files:

```diff
@@ -1,5 +1,3 @@
 include LICENSE README.md
-recursive-include superbench *.py
-recursive-include superbench *.yaml
-global-exclude *.pyc
-global-exclude __pycache__
+recursive-include superbench *.py *.j2 *.yaml
+global-exclude *.py[cod] __pycache__
```
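An illustrative sanity check that the new `recursive-include` rule actually ships the templates (assumes `superbench` is installed from the built package):

```python
# List the Jinja2 templates that the new recursive-include rule packages.
from pathlib import Path

import superbench

print(sorted(Path(superbench.__file__).parent.rglob('*.j2')))
```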
setup.py: bump `ansible_runner`, add `ansible_base` (POSIX only) and a minimum `jinja2` version:

```diff
@@ -134,11 +134,13 @@ setup(
     python_requires='>=3.6, <4',
     install_requires=[
-        'ansible_runner>=1.4.7',
+        'ansible_base>=2.10.9;os_name=="posix"',
+        'ansible_runner>=2.0.0rc1',
         'colorlog>=4.7.2',
+        'jinja2>=2.10.1',
         'joblib>=1.0.1',
         'knack>=0.7.2',
         'omegaconf==2.0.6',
         'pyyaml>=5.3',
     ],
     extras_require={
         'dev': ['pre-commit>=2.10.0'],
```
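The `ansible_base` pin uses a PEP 508 environment marker so the dependency is only pulled in on POSIX systems. A minimal sketch of how such markers evaluate, using the `packaging` library (an assumption here; it is not a dependency of this project):

```python
# Parse the requirement string and evaluate its environment marker against
# the current interpreter: True on Linux/macOS, False on Windows, so pip
# skips installing it there.
from packaging.requirements import Requirement

req = Requirement('ansible_base>=2.10.9; os_name == "posix"')
print(req.name, req.specifier, req.marker.evaluate())
```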
CLI `deploy` handler: `sb deploy` now constructs a runner and deploys instead of raising:

```diff
@@ -227,8 +227,8 @@ def deploy_command_handler(
         private_key=private_key,
     )
 
-    SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
-    raise NotImplementedError
+    runner = SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
+    runner.deploy()
 
 
 def run_command_handler(
```
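With this wired up, `sb deploy --host-list localhost` performs an actual deployment rather than raising; the CLI scenario tests near the end of this diff are updated to match.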
Config loading: parse YAML with `yaml.SafeLoader` before handing the result to OmegaConf:

```diff
@@ -6,6 +6,7 @@
 from pathlib import Path
 from datetime import datetime
 
+import yaml
 from omegaconf import OmegaConf
 
 
@@ -38,4 +39,5 @@ def get_sb_config(config_file):
     p = Path(config_file) if config_file else default_config_file
     if not p.is_file():
        return None
-    return OmegaConf.load(str(p))
+    with p.open() as fp:
+        return OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
```
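A minimal sketch of why the loader switch matters: PyYAML's `SafeLoader` expands anchors and merge keys (`<<: *anchor`), which the restructured default config below relies on. The YAML snippet is illustrative, not the real config:

```python
# SafeLoader flattens merge keys before OmegaConf ever sees the data, so
# anchored defaults show up as plain values in the resulting config.
import yaml
from omegaconf import OmegaConf

text = """
var:
  base: &base
    enable: true
benchmarks:
  kernel-launch:
    <<: *base
"""
config = OmegaConf.create(yaml.load(text, Loader=yaml.SafeLoader))
print(config.benchmarks['kernel-launch'].enable)  # True
```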
`SuperBenchLogger`: prepend `%(reset)s` so colors leaked from Ansible output do not bleed into Python colorlog records:

```diff
@@ -41,6 +41,7 @@ class SuperBenchLogger:
             )
         if color:
             formatter = colorlog.ColoredFormatter(
+                '%(reset)s'
                 '[%(cyan)s%(asctime)s %(hostname)s:%(process)d%(reset)s]'
                 '[%(blue)s%(filename)s:%(lineno)s%(reset)s]'
                 '[%(log_color)s%(levelname)s%(reset)s] %(message)s'
```
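A minimal sketch of the idea, assuming nothing beyond `colorlog` itself: the leading `%(reset)s` clears any ANSI color state a previous writer (such as Ansible) left on the terminal before each record prints:

```python
# Each record now starts with an ANSI reset escape, so leftover colors from
# other processes writing to the same terminal cannot bleed into it.
import colorlog

handler = colorlog.StreamHandler()
handler.setFormatter(
    colorlog.ColoredFormatter('%(reset)s[%(log_color)s%(levelname)s%(reset)s] %(message)s')
)
logger = colorlog.getLogger('demo')
logger.addHandler(handler)
logger.warning('starts from a clean color state')
```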
superbench/config/default.yaml: restructure with YAML anchors and merge keys. Shared mode and parameter settings move into a `var:` section; `cnn_models` is split into `resnet_models`, `densenet_models`, and `vgg_models`; `node_num` changes from `all` to `1`; the default `model_action` drops `inference`:

```diff
@@ -1,115 +1,95 @@
 # SuperBench Config
 superbench:
   enable: null
-  benchmarks:
-    kernel-launch:
-      enable: true
-    gemm-flops:
-      enable: true
-    cudnn-function:
-      enable: true
-    cublas-function:
-      enable: true
-    matmul:
-      enable: true
-      frameworks:
-        - pytorch
-    sharding-matmul:
-      enable: true
-    computation-communication-overlap:
-      enable: true
-    gpt_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-          parallel: yes
-      frameworks:
-        - pytorch
-      models:
-        - gpt2-small
-        - gpt2-large
-      parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
-        batch_size: 4
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
-    bert_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
-      models:
-        - bert-base
-        - bert-large
-      parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
-        batch_size: 16
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
-    lstm_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
-      models:
-        - lstm
-      parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
-        batch_size: 128
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
-    cnn_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
-      models:
-        - resnet50
-        - resnet101
-        - resnet152
-        - densenet169
-        - densenet201
-        - vgg11
-        - vgg13
-        - vgg16
-        - vgg19
-      parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
-        batch_size: 128
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
+  var:
+    default_local_mode: &default_local_mode
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
+          parallel: no
+    default_pytorch_mode: &default_pytorch_mode
+      enable: true
+      modes:
+        - name: torch.distributed
+          proc_num: 8
+          node_num: 1
+      frameworks:
+        - pytorch
+    common_model_config: &common_model_config
+      duration: 0
+      num_warmup: 16
+      num_steps: 128
+      precision:
+        - float32
+        - float16
+      model_action:
+        - train
+  benchmarks:
+    kernel-launch:
+      <<: *default_local_mode
+    gemm-flops:
+      <<: *default_local_mode
+    cudnn-function:
+      <<: *default_local_mode
+    cublas-function:
+      <<: *default_local_mode
+    matmul:
+      <<: *default_local_mode
+      frameworks:
+        - pytorch
+    sharding-matmul:
+      <<: *default_pytorch_mode
+    computation-communication-overlap:
+      <<: *default_pytorch_mode
+    gpt_models:
+      <<: *default_pytorch_mode
+      models:
+        - gpt2-small
+        - gpt2-large
+      parameters:
+        <<: *common_model_config
+        batch_size: 4
+    bert_models:
+      <<: *default_pytorch_mode
+      models:
+        - bert-base
+        - bert-large
+      parameters:
+        <<: *common_model_config
+        batch_size: 8
+    lstm_models:
+      <<: *default_pytorch_mode
+      models:
+        - lstm
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    resnet_models:
+      <<: *default_pytorch_mode
+      models:
+        - resnet50
+        - resnet101
+        - resnet152
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    densenet_models:
+      <<: *default_pytorch_mode
+      models:
+        - densenet169
+        - densenet201
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    vgg_models:
+      <<: *default_pytorch_mode
+      models:
+        - vgg11
+        - vgg13
+        - vgg16
+        - vgg19
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
```
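A sketch of the anchor/merge-key semantics the new config leans on: `&` defines an anchor, `<<: *anchor` merges it into a mapping, and keys written out explicitly override merged ones. The values here are illustrative, not from the real config:

```python
# Explicit sibling keys take precedence over keys pulled in via '<<',
# which is what lets each benchmark override only its batch_size.
import yaml

text = """
var:
  common: &common
    num_steps: 128
    batch_size: 128
benchmarks:
  vgg_models:
    <<: *common
    batch_size: 32
"""
data = yaml.load(text, Loader=yaml.SafeLoader)
vgg = data['benchmarks']['vgg_models']
print(vgg['num_steps'], vgg['batch_size'])  # 128 32  (explicit key wins)
```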
Environment-check playbook (`check_env.yaml`, which receives the `env` extravar from the runner change below): define `sb_nodes` from sorted hostnames and append the user-defined env from the config:

```diff
@@ -22,10 +22,14 @@
     container: sb-workspace
+    sb_nodes: '{{ hostvars.values() | map(attribute="ansible_hostname") | sort }}'
     sb_env: |
+      # pytorch env
       NNODES={{ sb_nodes | length }}
       NODE_RANK={{ lookup('ansible.utils.index_of', sb_nodes, 'eq', ansible_hostname) }}
       MASTER_ADDR={{ sb_nodes | first }}
       MASTER_PORT=29500
       OMP_NUM_THREADS=1
+      # config env
+      {{ env | default('') }}
   tasks:
     - name: Updating Config
       copy:
```
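Expressed in plain Python (with illustrative hostnames), this is what the `sb_env` template computes per node; sorting the hostnames makes every node derive the same torch.distributed rendezvous settings independently:

```python
# Each node renders the same sorted node list, so ranks and the master
# address agree across the whole cluster without coordination.
sb_nodes = sorted(['node2', 'node0', 'node1'])

def render_env(me):
    return {
        'NNODES': len(sb_nodes),          # {{ sb_nodes | length }}
        'NODE_RANK': sb_nodes.index(me),  # the ansible.utils.index_of lookup
        'MASTER_ADDR': sb_nodes[0],       # {{ sb_nodes | first }}
        'MASTER_PORT': 29500,
    }

print(render_env('node1'))
# {'NNODES': 3, 'NODE_RANK': 1, 'MASTER_ADDR': 'node0', 'MASTER_PORT': 29500}
```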
Docker deployment tasks in the playbook: Jinja2 inline if-expressions need an explicit `else`:

```diff
@@ -65,8 +65,8 @@
           docker rm --force {{ container }} ||: && \
           docker run -itd --name={{ container }} \
             --privileged --net=host --ipc=host \
-            {{ '--gpus=all' if gpu_vendor == 'nvidia' }} \
-            {{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' }} \
+            {{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }} \
+            {{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' else '' }} \
             -w /root -v {{ workspace }}:/root -v /mnt:/mnt \
             {{ docker_image }} bash && \
           docker exec {{ container }} bash -c \
```
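Why the `else` matters: Ansible renders templates strictly, so an inline if-expression whose condition is false and which has no `else` yields an undefined value. Plain Jinja2 with `StrictUndefined` reproduces the failure in a minimal sketch:

```python
# With StrictUndefined (mirroring Ansible's strict templating), the inline
# if without an else raises when the condition is false; the fixed form
# simply renders an empty string so the flag is omitted.
from jinja2 import Environment, StrictUndefined, UndefinedError

env = Environment(undefined=StrictUndefined)
broken = env.from_string("{{ '--gpus=all' if gpu_vendor == 'nvidia' }}")
fixed = env.from_string("{{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }}")

print(repr(fixed.render(gpu_vendor='amd')))  # ''
try:
    broken.render(gpu_vendor='amd')
except UndefinedError as err:
    print('broken template fails:', err)
```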
`SuperBenchRunner`: default `superbench.env` to an empty dict (it is now a dict, not a list), so the later `.items()` call is always safe:

```diff
@@ -54,6 +54,8 @@ class SuperBenchRunner():
             InvalidConfigError: If input config is invalid.
         """
         # TODO: add validation and defaulting
+        if not self._sb_config.superbench.env:
+            self._sb_config.superbench.env = {}
         for name in self._sb_benchmarks:
             if not self._sb_benchmarks[name].modes:
                 self._sb_benchmarks[name].modes = []
```
Pass the flattened env to the `check_env.yaml` playbook:

```diff
@@ -141,7 +143,13 @@ class SuperBenchRunner():
         logger.info('Checking SuperBench environment.')
         OmegaConf.save(config=self._sb_config, f=str(Path(self._output_dir) / 'sb.config.yaml'))
         self._ansible_client.run(
-            self._ansible_client.get_playbook_config('check_env.yaml', extravars={'output_dir': self._output_dir})
+            self._ansible_client.get_playbook_config(
+                'check_env.yaml',
+                extravars={
+                    'output_dir': self._output_dir,
+                    'env': '\n'.join(f'{k}={v}' for k, v in self._sb_config.superbench.env.items()),
+                }
+            )
         )
 
     def _run_proc(self, benchmark_name, mode, vars):
```
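Since `superbench.env` is now a dict, it is flattened into the `KEY=VALUE` lines that `sb.env` expects; the variable names and values below are illustrative:

```python
# One KEY=VALUE pair per line, ready to be sourced by the shell inside the
# container via 'set -o allexport && source sb.env'.
env = {'NCCL_IB_DISABLE': '0', 'OMP_NUM_THREADS': '1'}
print('\n'.join(f'{k}={v}' for k, v in env.items()))
# NCCL_IB_DISABLE=0
# OMP_NUM_THREADS=1
```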
Quote the `bash -c` payload with single quotes to fix the escape issue:

```diff
@@ -161,7 +169,7 @@ class SuperBenchRunner():
             self._ansible_client.get_shell_config(
                 (
                     'docker exec sb-workspace bash -c '
-                    '"set -o allexport && source sb.env && set +o allexport && {command}"'
+                    "'set -o allexport && source sb.env && set +o allexport && {command}'"
                 ).format(command=self.__get_mode_command(benchmark_name, mode), )
             ),
             sudo=True
```
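As we read the fix, single-quoting keeps the whole payload as one `bash -c` argument once the command passes through the outer shell layer. A small `shlex` sketch with a hypothetical `echo ok` payload:

```python
# shlex tokenizes the command the way a POSIX shell would: the single-quoted
# payload survives as a single argv element for bash -c.
import shlex

payload = 'set -o allexport && source sb.env && set +o allexport && echo ok'
cmd = "docker exec sb-workspace bash -c '{}'".format(payload)
print(shlex.split(cmd)[-1])  # the whole payload, as one argument
```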
CLI scenario tests: `sb deploy` with a host list is now expected to succeed, and a new case covers the missing-host failure:

```diff
@@ -53,7 +53,11 @@ class SuperBenchCLIScenarioTest(ScenarioTest):
 
     def test_sb_deploy(self):
         """Test sb deploy."""
-        self.cmd('sb deploy --host-list localhost', expect_failure=True)
+        self.cmd('sb deploy --host-list localhost', checks=[NoneCheck()])
+
+    def test_sb_deploy_no_host(self):
+        """Test sb deploy, no host_file or host_list provided, should fail."""
+        self.cmd('sb deploy', expect_failure=True)
 
     def test_sb_exec(self):
         """Test sb exec."""
```
Executor tests: load the default config through `yaml.SafeLoader`, matching `get_sb_config`:

```diff
@@ -10,6 +10,7 @@ import tempfile
 from pathlib import Path
 from unittest import mock
 
+import yaml
 from omegaconf import OmegaConf
 
 from superbench.executor import SuperBenchExecutor
@@ -24,7 +25,8 @@ class ExecutorTestCase(unittest.TestCase):
     def setUp(self):
         """Hook method for setting up the test fixture before exercising it."""
         default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
-        self.default_config = OmegaConf.load(str(default_config_file))
+        with default_config_file.open() as fp:
+            self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
         self.output_dir = tempfile.mkdtemp()
 
         self.executor = SuperBenchExecutor(self.default_config, self.output_dir)
```
`test_get_arguments` becomes table-driven sub-tests over explicit parameter dicts instead of reading from the default config:

```diff
@@ -61,19 +63,31 @@ class ExecutorTestCase(unittest.TestCase):
 
     def test_get_arguments(self):
         """Test benchmarks arguments."""
-        expected_matmul_args = ''
-        self.assertEqual(
-            self.executor._SuperBenchExecutor__get_arguments(
-                self.default_config.superbench.benchmarks.matmul.parameters
-            ), expected_matmul_args
-        )
-        expected_bert_models_args = \
-            '--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 ' \
-            '--precision float32 float16 --model_action train inference'
-        self.assertEqual(
-            self.executor._SuperBenchExecutor__get_arguments(
-                self.default_config.superbench.benchmarks.bert_models.parameters
-            ), expected_bert_models_args
-        )
+        test_cases = [
+            {
+                'parameters': None,
+                'expected_args': '',
+            },
+            {
+                'parameters': {
+                    'duration': 0,
+                    'num_warmup': 16,
+                    'num_steps': 128,
+                    'batch_size': 16,
+                    'precision': ['float32', 'float16'],
+                    'model_action': ['train', 'inference'],
+                },
+                'expected_args': (
+                    '--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 '
+                    '--precision float32 float16 --model_action train inference'
+                ),
+            },
+        ]
+        for test_case in test_cases:
+            with self.subTest(msg='Testing with case', test_case=test_case):
+                self.assertEqual(
+                    self.executor._SuperBenchExecutor__get_arguments(test_case['parameters']),
+                    test_case['expected_args']
+                )
 
     def test_create_benchmark_dir(self):
```
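One detail worth noting in these tests: `__get_arguments` has two leading underscores, so Python name-mangles it, hence the `_SuperBenchExecutor__get_arguments` spelling. A toy illustration:

```python
# Double-underscore attributes are rewritten to _ClassName__attr, which is
# the only way to reach them from outside the class, as the tests do.
class Demo:
    def __hidden(self):
        return 'ok'

print(Demo()._Demo__hidden())  # 'ok'
```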
Runner tests: same loader change as the executor tests:

```diff
@@ -9,6 +9,7 @@ import tempfile
 from pathlib import Path
 from unittest import mock
 
+import yaml
 from omegaconf import OmegaConf
 
 from superbench.runner import SuperBenchRunner
@@ -19,7 +20,8 @@ class RunnerTestCase(unittest.TestCase):
     def setUp(self):
         """Hook method for setting up the test fixture before exercising it."""
         default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
-        self.default_config = OmegaConf.load(str(default_config_file))
+        with default_config_file.open() as fp:
+            self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
         self.output_dir = tempfile.mkdtemp()
 
         self.runner = SuperBenchRunner(self.default_config, None, None, self.output_dir)
```