Bug bash - Fix bugs in multi-GPU benchmarks (#98)
* Add `sb deploy` command content.
* Fix inline if-expression syntax in playbook.
* Fix quote escape issue in bash command.
* Add custom env in config.
* Update default config for multi-GPU benchmarks.
* Update MANIFEST.in to include jinja2 template.
* Require jinja2 minimum version.
* Fix occasional duplicate output in Ansible runner.
* Fix mixed color from Ansible and Python colorlog.
* Update according to comments.
* Change superbench.env from list to dict in config file.
Parent: 216c5b5c71
Commit: c0c43b8f81
MANIFEST.in

@@ -1,5 +1,3 @@
 include LICENSE README.md
-recursive-include superbench *.py
-recursive-include superbench *.yaml
-global-exclude *.pyc
-global-exclude __pycache__
+recursive-include superbench *.py *.j2 *.yaml
+global-exclude *.py[cod] __pycache__

setup.py
@@ -134,11 +134,13 @@ setup(
     python_requires='>=3.6, <4',
     install_requires=[
         'ansible_base>=2.10.9;os_name=="posix"',
-        'ansible_runner>=1.4.7',
+        'ansible_runner>=2.0.0rc1',
         'colorlog>=4.7.2',
+        'jinja2>=2.10.1',
         'joblib>=1.0.1',
         'knack>=0.7.2',
         'omegaconf==2.0.6',
+        'pyyaml>=5.3',
     ],
     extras_require={
         'dev': ['pre-commit>=2.10.0'],

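Note on the `ansible_base` line above: the `;os_name=="posix"` suffix is a PEP 508 environment marker, so the requirement is skipped on platforms where Ansible is unavailable (e.g. Windows). A minimal sketch of how such a marker evaluates; the use of the `packaging` library here is illustrative, not part of this commit:

    # Evaluate a PEP 508 environment marker against the current interpreter.
    from packaging.markers import Marker

    marker = Marker('os_name == "posix"')
    print(marker.evaluate())  # True on Linux/macOS, False on Windows
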
@@ -227,8 +227,8 @@ def deploy_command_handler(
         private_key=private_key,
     )
 
-    SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
-    raise NotImplementedError
+    runner = SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
+    runner.deploy()
 
 
 def run_command_handler(

@@ -6,6 +6,7 @@
 from pathlib import Path
 from datetime import datetime
 
+import yaml
 from omegaconf import OmegaConf
 
 

@@ -38,4 +39,5 @@ def get_sb_config(config_file):
     p = Path(config_file) if config_file else default_config_file
     if not p.is_file():
         return None
-    return OmegaConf.load(str(p))
+    with p.open() as fp:
+        return OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))

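Config loading now goes through PyYAML's `SafeLoader` plus `OmegaConf.create` instead of `OmegaConf.load`. A plausible motivation (an assumption on my part, consistent with the anchor-based default config later in this commit): `SafeLoader` resolves YAML anchors and `<<:` merge keys before OmegaConf ever sees the data. A minimal sketch:

    import textwrap
    import yaml
    from omegaconf import OmegaConf

    text = textwrap.dedent('''
        var:
          base: &base
            enable: true
        job:
          <<: *base
          batch_size: 4
    ''')
    data = yaml.load(text, Loader=yaml.SafeLoader)  # "<<:" merge key resolved here
    config = OmegaConf.create(data)                 # node already holds merged values
    assert config.job.enable is True
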
@@ -41,6 +41,7 @@ class SuperBenchLogger:
             )
         if color:
             formatter = colorlog.ColoredFormatter(
+                '%(reset)s'
                 '[%(cyan)s%(asctime)s %(hostname)s:%(process)d%(reset)s]'
                 '[%(blue)s%(filename)s:%(lineno)s%(reset)s]'
                 '[%(log_color)s%(levelname)s%(reset)s] %(message)s'

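The leading `%(reset)s` is the "mixed color" fix from the commit message: colorlog substitutes an ANSI reset escape for it, so each Python log line first clears whatever color state interleaved Ansible output left on the terminal. A minimal standalone sketch; the handler setup below is illustrative:

    import logging
    import colorlog

    handler = logging.StreamHandler()
    handler.setFormatter(
        colorlog.ColoredFormatter(
            '%(reset)s'  # clear any leftover ANSI color state first
            '[%(log_color)s%(levelname)s%(reset)s] %(message)s'
        )
    )
    logger = logging.getLogger('sb-demo')
    logger.addHandler(handler)
    logger.warning('color starts from a clean state')
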
superbench/config/default.yaml

@@ -1,115 +1,95 @@
 # SuperBench Config
 superbench:
   enable: null
-  benchmarks:
-    kernel-launch:
-      enable: true
-    gemm-flops:
-      enable: true
-    cudnn-function:
-      enable: true
-    cublas-function:
-      enable: true
-    matmul:
+  var:
+    default_local_mode: &default_local_mode
       enable: true
       modes:
         - name: local
           proc_num: 8
           prefix: CUDA_VISIBLE_DEVICES={proc_rank}
-          parallel: no
-      frameworks:
-        - pytorch
-    gpt_models:
+          parallel: yes
+    default_pytorch_mode: &default_pytorch_mode
       enable: true
       modes:
         - name: torch.distributed
           proc_num: 8
-          node_num: all
+          node_num: 1
       frameworks:
         - pytorch
+    common_model_config: &common_model_config
+      duration: 0
+      num_warmup: 16
+      num_steps: 128
+      precision:
+        - float32
+        - float16
+      model_action:
+        - train
+  benchmarks:
+    kernel-launch:
+      <<: *default_local_mode
+    gemm-flops:
+      <<: *default_local_mode
+    cudnn-function:
+      <<: *default_local_mode
+    cublas-function:
+      <<: *default_local_mode
+    matmul:
+      <<: *default_local_mode
+      frameworks:
+        - pytorch
+    sharding-matmul:
+      <<: *default_pytorch_mode
+    computation-communication-overlap:
+      <<: *default_pytorch_mode
+    gpt_models:
+      <<: *default_pytorch_mode
       models:
         - gpt2-small
         - gpt2-large
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
+        <<: *common_model_config
         batch_size: 4
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
     bert_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
+      <<: *default_pytorch_mode
       models:
         - bert-base
         - bert-large
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
-        batch_size: 16
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
+        <<: *common_model_config
+        batch_size: 8
     lstm_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
+      <<: *default_pytorch_mode
       models:
         - lstm
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
+        <<: *common_model_config
         batch_size: 128
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
-    cnn_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
+    resnet_models:
+      <<: *default_pytorch_mode
       models:
         - resnet50
         - resnet101
         - resnet152
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    densenet_models:
+      <<: *default_pytorch_mode
+      models:
         - densenet169
         - densenet201
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    vgg_models:
+      <<: *default_pytorch_mode
+      models:
         - vgg11
         - vgg13
         - vgg16
         - vgg19
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
+        <<: *common_model_config
         batch_size: 128
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference

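The rewritten config relies on standard YAML anchors (`&name`), aliases (`*name`), and merge keys (`<<:`): shared mode and model settings are defined once under `superbench.var` and merged into each benchmark, which is how the file shrinks from 115 to 95 lines while adding benchmarks. A small Python check of the merge-key behavior, using names taken from the hunk above:

    import textwrap
    import yaml

    snippet = textwrap.dedent('''
        var:
          default_local_mode: &default_local_mode
            enable: true
        benchmarks:
          kernel-launch:
            <<: *default_local_mode
          matmul:
            <<: *default_local_mode
            frameworks:
              - pytorch
    ''')
    data = yaml.safe_load(snippet)
    # every benchmark inherits the anchored defaults; local keys win on conflict
    assert data['benchmarks']['kernel-launch']['enable'] is True
    assert data['benchmarks']['matmul']['frameworks'] == ['pytorch']
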
@@ -22,10 +22,14 @@
     container: sb-workspace
     sb_nodes: '{{ hostvars.values() | map(attribute="ansible_hostname") | sort }}'
     sb_env: |
+      # pytorch env
       NNODES={{ sb_nodes | length }}
       NODE_RANK={{ lookup('ansible.utils.index_of', sb_nodes, 'eq', ansible_hostname) }}
       MASTER_ADDR={{ sb_nodes | first }}
       MASTER_PORT=29500
+      OMP_NUM_THREADS=1
+      # config env
+      {{ env | default('') }}
   tasks:
     - name: Updating Config
       copy:

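The variables written into `sb_env` (`NNODES`, `NODE_RANK`, `MASTER_ADDR`, `MASTER_PORT`) match the rendezvous options of PyTorch's distributed launcher, and `OMP_NUM_THREADS=1` is the usual guard against CPU oversubscription. A hedged sketch of how a command could consume them once `sb.env` has been sourced; the launch invocation is an assumption, not taken from this commit:

    import os

    # after "set -o allexport && source sb.env", these are real env vars
    launch = (
        'python -m torch.distributed.launch '
        '--nnodes={NNODES} --node_rank={NODE_RANK} '
        '--master_addr={MASTER_ADDR} --master_port={MASTER_PORT}'
    ).format(**os.environ)
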
@@ -65,8 +65,8 @@
       docker rm --force {{ container }} ||: && \
       docker run -itd --name={{ container }} \
         --privileged --net=host --ipc=host \
-        {{ '--gpus=all' if gpu_vendor == 'nvidia' }} \
-        {{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' }} \
+        {{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }} \
+        {{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' else '' }} \
        -w /root -v {{ workspace }}:/root -v /mnt:/mnt \
         {{ docker_image }} bash && \
       docker exec {{ container }} bash -c \

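The `else ''` additions are the "inline if-expression syntax" fix: in Jinja2, a conditional expression without an `else` branch evaluates to undefined when the condition is false, which renders as nothing under the default `Undefined` but errors under strict templating such as Ansible's default behavior. A minimal sketch:

    from jinja2 import Environment, StrictUndefined

    env = Environment(undefined=StrictUndefined)
    fixed = "{{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }}"
    print(env.from_string(fixed).render(gpu_vendor='amd'))  # renders empty string

    broken = "{{ '--gpus=all' if gpu_vendor == 'nvidia' }}"
    # env.from_string(broken).render(gpu_vendor='amd')
    # -> UndefinedError: the inline if-expression evaluated to false
    #    and no else section was defined
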
@@ -54,6 +54,8 @@ class SuperBenchRunner():
             InvalidConfigError: If input config is invalid.
         """
         # TODO: add validation and defaulting
+        if not self._sb_config.superbench.env:
+            self._sb_config.superbench.env = {}
         for name in self._sb_benchmarks:
             if not self._sb_benchmarks[name].modes:
                 self._sb_benchmarks[name].modes = []

@@ -141,7 +143,13 @@ class SuperBenchRunner():
         logger.info('Checking SuperBench environment.')
         OmegaConf.save(config=self._sb_config, f=str(Path(self._output_dir) / 'sb.config.yaml'))
         self._ansible_client.run(
-            self._ansible_client.get_playbook_config('check_env.yaml', extravars={'output_dir': self._output_dir})
+            self._ansible_client.get_playbook_config(
+                'check_env.yaml',
+                extravars={
+                    'output_dir': self._output_dir,
+                    'env': '\n'.join(f'{k}={v}' for k, v in self._sb_config.superbench.env.items()),
+                }
+            )
         )
 
     def _run_proc(self, benchmark_name, mode, vars):

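This pairs with the "superbench.env from list to dict" bullet: the runner flattens the dict into `KEY=value` lines and hands it to the playbook as the `env` extravar, which `sb_env` appends via `{{ env | default('') }}`. A standalone sketch; the sample values are hypothetical:

    # hypothetical contents of superbench.env from a user's config
    env = {'NCCL_IB_DISABLE': '1', 'NCCL_DEBUG': 'INFO'}
    env_text = '\n'.join(f'{k}={v}' for k, v in env.items())
    print(env_text)
    # NCCL_IB_DISABLE=1
    # NCCL_DEBUG=INFO
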
@@ -161,7 +169,7 @@ class SuperBenchRunner():
             self._ansible_client.get_shell_config(
                 (
                     'docker exec sb-workspace bash -c '
-                    '"set -o allexport && source sb.env && set +o allexport && {command}"'
+                    "'set -o allexport && source sb.env && set +o allexport && {command}'"
                 ).format(command=self.__get_mode_command(benchmark_name, mode), )
             ),
             sudo=True

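The "quote escape issue" bullet refers to this line: the full string is executed by a shell on the host, so wrapping the inner command in double quotes would let the host shell expand any `$VARIABLES` before `bash -c` ever runs inside the container; single quotes keep the command literal until the container's shell interprets it, after `sb.env` has been sourced. A small sketch of the string being built; the `{command}` content is hypothetical:

    command = 'echo $NODE_RANK'  # hypothetical mode command using an sb.env variable
    line = (
        'docker exec sb-workspace bash -c '
        "'set -o allexport && source sb.env && set +o allexport && {command}'"
    ).format(command=command)
    # the single quotes survive into the final command line, so $NODE_RANK is
    # expanded by the container shell, not by the host:
    print(line)
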
@@ -53,7 +53,11 @@ class SuperBenchCLIScenarioTest(ScenarioTest):
 
     def test_sb_deploy(self):
         """Test sb deploy."""
-        self.cmd('sb deploy --host-list localhost', expect_failure=True)
+        self.cmd('sb deploy --host-list localhost', checks=[NoneCheck()])
+
+    def test_sb_deploy_no_host(self):
+        """Test sb deploy, no host_file or host_list provided, should fail."""
+        self.cmd('sb deploy', expect_failure=True)
 
     def test_sb_exec(self):
         """Test sb exec."""

@@ -10,6 +10,7 @@ import tempfile
 from pathlib import Path
 from unittest import mock
 
+import yaml
 from omegaconf import OmegaConf
 
 from superbench.executor import SuperBenchExecutor

@@ -24,7 +25,8 @@ class ExecutorTestCase(unittest.TestCase):
     def setUp(self):
         """Hook method for setting up the test fixture before exercising it."""
         default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
-        self.default_config = OmegaConf.load(str(default_config_file))
+        with default_config_file.open() as fp:
+            self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
         self.output_dir = tempfile.mkdtemp()
 
         self.executor = SuperBenchExecutor(self.default_config, self.output_dir)

@@ -61,19 +63,31 @@ class ExecutorTestCase(unittest.TestCase):
 
     def test_get_arguments(self):
         """Test benchmarks arguments."""
-        expected_matmul_args = ''
-        self.assertEqual(
-            self.executor._SuperBenchExecutor__get_arguments(
-                self.default_config.superbench.benchmarks.matmul.parameters
-            ), expected_matmul_args
-        )
-        expected_bert_models_args = \
-            '--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 ' \
-            '--precision float32 float16 --model_action train inference'
-        self.assertEqual(
-            self.executor._SuperBenchExecutor__get_arguments(
-                self.default_config.superbench.benchmarks.bert_models.parameters
-            ), expected_bert_models_args
-        )
+        test_cases = [
+            {
+                'parameters': None,
+                'expected_args': '',
+            },
+            {
+                'parameters': {
+                    'duration': 0,
+                    'num_warmup': 16,
+                    'num_steps': 128,
+                    'batch_size': 16,
+                    'precision': ['float32', 'float16'],
+                    'model_action': ['train', 'inference'],
+                },
+                'expected_args': (
+                    '--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 '
+                    '--precision float32 float16 --model_action train inference'
+                ),
+            },
+        ]
+        for test_case in test_cases:
+            with self.subTest(msg='Testing with case', test_case=test_case):
+                self.assertEqual(
+                    self.executor._SuperBenchExecutor__get_arguments(test_case['parameters']),
+                    test_case['expected_args']
+                )
 
     def test_create_benchmark_dir(self):

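A side note on `_SuperBenchExecutor__get_arguments`: the test reaches a double-underscore ("private") method through its name-mangled form, which is how Python exposes such attributes outside the defining class. A minimal illustration; the class below is a stand-in, not SuperBench code:

    class Executor:
        def __get_arguments(self, parameters):
            # stored on the class as _Executor__get_arguments
            return '' if not parameters else str(parameters)

    assert Executor()._Executor__get_arguments(None) == ''
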
@@ -9,6 +9,7 @@ import tempfile
 from pathlib import Path
 from unittest import mock
 
+import yaml
 from omegaconf import OmegaConf
 
 from superbench.runner import SuperBenchRunner

@@ -19,7 +20,8 @@ class RunnerTestCase(unittest.TestCase):
     def setUp(self):
         """Hook method for setting up the test fixture before exercising it."""
         default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
-        self.default_config = OmegaConf.load(str(default_config_file))
+        with default_config_file.open() as fp:
+            self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
         self.output_dir = tempfile.mkdtemp()
 
         self.runner = SuperBenchRunner(self.default_config, None, None, self.output_dir)