Benchmarks: Add Feature - Add DistributedImpl and DistributedBackend arguments for micro benchmarks. (#100)

guoshzhao 2021-06-21 23:34:05 +08:00 committed by GitHub
Parent 3d72c07807
Commit 216c5b5c71
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
5 changed files: 72 additions and 23 deletions
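For reference, a minimal sketch of how the two new flags could be passed to a micro benchmark through its parameters string. The registry helpers and the benchmark name used here are assumptions for illustration and are not part of this diff:

# Illustrative sketch only; helper names and benchmark name are assumed, not confirmed by this commit.
from superbench.benchmarks import BenchmarkRegistry, Platform

context = BenchmarkRegistry.create_benchmark_context(
    'sharding-matmul',                       # hypothetical benchmark name
    platform=Platform.CUDA,
    parameters='--distributed_impl ddp --distributed_backend nccl',
)
benchmark = BenchmarkRegistry.launch_benchmark(context)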

View file

@@ -6,7 +6,8 @@
 import importlib
 from superbench.benchmarks.return_code import ReturnCode
-from superbench.benchmarks.context import Platform, Framework, Precision, ModelAction, BenchmarkType, BenchmarkContext
+from superbench.benchmarks.context import Platform, Framework, Precision, ModelAction, \
+    DistributedImpl, DistributedBackend, BenchmarkType, BenchmarkContext
 from superbench.common.utils import LazyImport
 BenchmarkRegistry = LazyImport(
@@ -21,6 +22,6 @@ BenchmarkRegistry = LazyImport(
 )
 __all__ = [
-    'ReturnCode', 'Platform', 'Framework', 'BenchmarkType', 'Precision', 'ModelAction', 'BenchmarkContext',
-    'BenchmarkRegistry'
+    'ReturnCode', 'Platform', 'Framework', 'BenchmarkType', 'Precision', 'ModelAction', 'DistributedImpl',
+    'DistributedBackend', 'BenchmarkContext', 'BenchmarkRegistry'
 ]

View file

@@ -61,6 +61,22 @@ class ModelAction(Enum):
     INFERENCE = 'inference'
+class DistributedImpl(Enum):
+    """The Enum class representing different distributed implementations."""
+    DDP = 'ddp'
+    MIRRORED = 'mirrored'
+    MW_MIRRORED = 'multiworkermirrored'
+    PS = 'parameterserver'
+    HOROVOD = 'horovod'
+class DistributedBackend(Enum):
+    """The Enum class representing different distributed backends."""
+    NCCL = 'nccl'
+    MPI = 'mpi'
+    GLOO = 'gloo'
 class BenchmarkContext():
     """Context class of all benchmarks.

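The two new enums follow the value conventions used elsewhere in context.py. A small illustrative sketch of how they round-trip between strings and members, assuming the shared Enum base behaves like the standard library Enum (get_values() is the helper referenced by the new CLI arguments below):

# Illustrative only: string/member round-trips for the new enums.
from superbench.benchmarks.context import DistributedImpl, DistributedBackend

assert DistributedImpl('ddp') is DistributedImpl.DDP        # lookup by value, standard Enum behavior
assert DistributedBackend.NCCL.value == 'nccl'              # the string later handed to torch.distributed
print(' '.join(DistributedBackend.get_values()))            # e.g. 'nccl mpi gloo', as used in the CLI help text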
View file

@@ -21,7 +21,7 @@ import time
 import torch
 from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, ReturnCode
+from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode
 from superbench.benchmarks.micro_benchmarks import MicroBenchmark
 from superbench.benchmarks.context import Enum
@@ -114,6 +114,21 @@ class ComputationCommunicationOverlap(MicroBenchmark):
             required=False,
             help='The number of test step.',
         )
+        self._parser.add_argument(
+            '--distributed_impl',
+            type=DistributedImpl,
+            default=DistributedImpl.DDP,
+            required=False,
+            help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
+        )
+        self._parser.add_argument(
+            '--distributed_backend',
+            type=DistributedBackend,
+            default=DistributedBackend.NCCL,
+            required=False,
+            help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
+        )
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
@@ -124,8 +139,17 @@ class ComputationCommunicationOverlap(MicroBenchmark):
         if not super()._preprocess():
             return False
+        if self._args.distributed_impl != DistributedImpl.DDP:
+            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
+            logger.error(
+                'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
+                    self._name, self._args.distributed_impl
+                )
+            )
+            return False
         try:
-            torch.distributed.init_process_group(backend='nccl')
+            torch.distributed.init_process_group(backend=self._args.distributed_backend.value)
             self.__world_size = int(os.environ['WORLD_SIZE'])
             self.__local_rank = int(os.environ['LOCAL_RANK'])
             # if self.__world_size < 2:
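Note that _preprocess() reads WORLD_SIZE and LOCAL_RANK straight from the environment, so the benchmark still has to run under a launcher that exports them (e.g. torch.distributed.launch with --use_env, or torchrun on newer PyTorch). A minimal standalone sketch of the same initialization, for illustration only:

# Sketch of the init sequence assumed above; the distributed launcher exports these variables.
import os
import torch

world_size = int(os.environ['WORLD_SIZE'])
local_rank = int(os.environ['LOCAL_RANK'])
# In the benchmark the backend string comes from DistributedBackend.value (e.g. 'nccl').
torch.distributed.init_process_group(backend='nccl')
torch.cuda.set_device(local_rank)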

View file

@@ -18,7 +18,7 @@ import time
 import torch
 from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, ReturnCode
+from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode
 from superbench.benchmarks.micro_benchmarks import MicroBenchmark
 from superbench.benchmarks.context import Enum
@@ -91,6 +91,21 @@ class ShardingMatmul(MicroBenchmark):
             required=False,
             help='The number of test step.',
         )
+        self._parser.add_argument(
+            '--distributed_impl',
+            type=DistributedImpl,
+            default=DistributedImpl.DDP,
+            required=False,
+            help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
+        )
+        self._parser.add_argument(
+            '--distributed_backend',
+            type=DistributedBackend,
+            default=DistributedBackend.NCCL,
+            required=False,
+            help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
+        )
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
@@ -101,6 +116,15 @@ class ShardingMatmul(MicroBenchmark):
         if not super()._preprocess():
             return False
+        if self._args.distributed_impl != DistributedImpl.DDP:
+            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
+            logger.error(
+                'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
+                    self._name, self._args.distributed_impl
+                )
+            )
+            return False
         if ShardingMode.ALLGATHER in self._args.mode or ShardingMode.ALLREDUCE in self._args.mode:
             try:
                 torch.distributed.init_process_group(backend='nccl')

View file

@@ -8,7 +8,7 @@ import time
 from abc import abstractmethod
 from superbench.common.utils import logger
-from superbench.benchmarks import Precision, ModelAction, BenchmarkType, ReturnCode
+from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode
 from superbench.benchmarks.base import Benchmark
 from superbench.benchmarks.context import Enum
@@ -20,22 +20,6 @@ class Optimizer(Enum):
     ADAMW = 'adamw'
-class DistributedImpl(Enum):
-    """The Enum class representing different distributed implementations."""
-    DDP = 'ddp'
-    MIRRORED = 'mirrored'
-    MW_MIRRORED = 'multiworkermirrored'
-    PS = 'parameterserver'
-    HOROVOD = 'horovod'
-class DistributedBackend(Enum):
-    """The Enum class representing different distributed backends."""
-    NCCL = 'nccl'
-    MPI = 'mpi'
-    GLOO = 'gloo'
 class ModelBenchmark(Benchmark):
     """The base class of E2E model benchmarks."""
     def __init__(self, name, parameters=''):
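With the removal above, DistributedImpl and DistributedBackend now live in superbench.benchmarks.context and are re-exported at the package level (see the __all__ change in the first file), so code that previously imported them from the model base module switches to the package import. A minimal sketch:

# The enums are importable from the package root after this change.
from superbench.benchmarks import DistributedImpl, DistributedBackend

impl = DistributedImpl.DDP
backend = DistributedBackend.NCCL.value    # 'nccl', ready to pass to torch.distributed.init_process_group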