Benchmarks: Add Feature - Add DistributedImpl and DistributedBackend arguments for micro benchmark. (#100)
Parent: 3d72c07807
Commit: 216c5b5c71
--- a/superbench/benchmarks/__init__.py
+++ b/superbench/benchmarks/__init__.py
@@ -6,7 +6,8 @@
 import importlib
 
 from superbench.benchmarks.return_code import ReturnCode
-from superbench.benchmarks.context import Platform, Framework, Precision, ModelAction, BenchmarkType, BenchmarkContext
+from superbench.benchmarks.context import Platform, Framework, Precision, ModelAction, \
+    DistributedImpl, DistributedBackend, BenchmarkType, BenchmarkContext
 from superbench.common.utils import LazyImport
 
 BenchmarkRegistry = LazyImport(
@@ -21,6 +22,6 @@ BenchmarkRegistry = LazyImport(
 )
 
 __all__ = [
-    'ReturnCode', 'Platform', 'Framework', 'BenchmarkType', 'Precision', 'ModelAction', 'BenchmarkContext',
-    'BenchmarkRegistry'
+    'ReturnCode', 'Platform', 'Framework', 'BenchmarkType', 'Precision', 'ModelAction', 'DistributedImpl',
+    'DistributedBackend', 'BenchmarkContext', 'BenchmarkRegistry'
 ]
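With the re-export and the expanded __all__, callers can import the new enums straight from the package root, exactly as the micro benchmarks below do. A small illustration (values taken from the enum definitions in this commit):

from superbench.benchmarks import DistributedImpl, DistributedBackend

assert DistributedImpl.DDP.value == 'ddp'
assert DistributedBackend.NCCL.value == 'nccl'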
--- a/superbench/benchmarks/context.py
+++ b/superbench/benchmarks/context.py
@@ -61,6 +61,22 @@ class ModelAction(Enum):
     INFERENCE = 'inference'
 
 
+class DistributedImpl(Enum):
+    """The Enum class representing different distributed implementations."""
+    DDP = 'ddp'
+    MIRRORED = 'mirrored'
+    MW_MIRRORED = 'multiworkermirrored'
+    PS = 'parameterserver'
+    HOROVOD = 'horovod'
+
+
+class DistributedBackend(Enum):
+    """The Enum class representing different distributed backends."""
+    NCCL = 'nccl'
+    MPI = 'mpi'
+    GLOO = 'gloo'
+
+
 class BenchmarkContext():
     """Context class of all benchmarks.
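Note: the help strings added in the micro-benchmark hunks below call DistributedImpl.get_values(), a helper that Python's stdlib enum.Enum does not provide; it evidently comes from superbench's own Enum base in superbench.benchmarks.context. A minimal sketch of such a helper, under the assumption that it simply returns the string values of all members:

from enum import Enum as _StdEnum


class Enum(_StdEnum):
    """Hypothetical sketch of superbench's extended Enum base.

    Assumption: get_values() returns the value of every member, which
    matches how the '--distributed_impl' help text below uses it.
    """
    @classmethod
    def get_values(cls):
        # E.g. DistributedImpl.get_values() -> ['ddp', 'mirrored', ...].
        return [member.value for member in cls]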
--- a/superbench/benchmarks/micro_benchmarks/computation_communication_overlap.py
+++ b/superbench/benchmarks/micro_benchmarks/computation_communication_overlap.py
@@ -21,7 +21,7 @@ import time
 import torch
 
 from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, ReturnCode
+from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode
 from superbench.benchmarks.micro_benchmarks import MicroBenchmark
 from superbench.benchmarks.context import Enum
@@ -114,6 +114,21 @@ class ComputationCommunicationOverlap(MicroBenchmark):
             required=False,
             help='The number of test step.',
         )
+        self._parser.add_argument(
+            '--distributed_impl',
+            type=DistributedImpl,
+            default=DistributedImpl.DDP,
+            required=False,
+            help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
+        )
+
+        self._parser.add_argument(
+            '--distributed_backend',
+            type=DistributedBackend,
+            default=DistributedBackend.NCCL,
+            required=False,
+            help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
+        )
 
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
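Because type=DistributedImpl is a callable, argparse applies it to the raw command-line string, so --distributed_impl ddp is coerced to DistributedImpl.DDP through the enum's by-value lookup (an invalid value raises ValueError, which argparse reports as a usage error). A self-contained illustration, using stdlib enum in place of superbench's context.Enum:

import argparse
from enum import Enum


class DistributedImpl(Enum):
    DDP = 'ddp'
    HOROVOD = 'horovod'


parser = argparse.ArgumentParser()
# argparse calls DistributedImpl('ddp'), i.e. by-value lookup -> DistributedImpl.DDP.
parser.add_argument('--distributed_impl', type=DistributedImpl, default=DistributedImpl.DDP)

args = parser.parse_args(['--distributed_impl', 'ddp'])
assert args.distributed_impl is DistributedImpl.DDP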
@@ -124,8 +139,17 @@ class ComputationCommunicationOverlap(MicroBenchmark):
         if not super()._preprocess():
             return False
 
+        if self._args.distributed_impl != DistributedImpl.DDP:
+            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
+            logger.error(
+                'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
+                    self._name, self._args.distributed_impl
+                )
+            )
+            return False
+
         try:
-            torch.distributed.init_process_group(backend='nccl')
+            torch.distributed.init_process_group(backend=self._args.distributed_backend.value)
             self.__world_size = int(os.environ['WORLD_SIZE'])
             self.__local_rank = int(os.environ['LOCAL_RANK'])
             # if self.__world_size < 2:
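init_process_group() now takes its backend from the parsed argument (self._args.distributed_backend.value, e.g. 'nccl') instead of a hard-coded string, and the surrounding code reads WORLD_SIZE and LOCAL_RANK from the environment, so the benchmark expects to run under a distributed launcher that exports those variables. A minimal sketch of the same initialization pattern in isolation (assumes the launcher also sets the usual env:// rendezvous variables):

import os

import torch.distributed as dist

# Launchers such as `python -m torch.distributed.launch --use_env` export
# WORLD_SIZE and LOCAL_RANK (plus RANK, MASTER_ADDR, MASTER_PORT) for each
# worker process.
world_size = int(os.environ['WORLD_SIZE'])
local_rank = int(os.environ['LOCAL_RANK'])

# The same call the benchmark makes with --distributed_backend nccl.
dist.init_process_group(backend='nccl')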
--- a/superbench/benchmarks/micro_benchmarks/sharding_matmul.py
+++ b/superbench/benchmarks/micro_benchmarks/sharding_matmul.py
@@ -18,7 +18,7 @@ import time
 import torch
 
 from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, ReturnCode
+from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode
 from superbench.benchmarks.micro_benchmarks import MicroBenchmark
 from superbench.benchmarks.context import Enum
@@ -91,6 +91,21 @@ class ShardingMatmul(MicroBenchmark):
             required=False,
             help='The number of test step.',
         )
+        self._parser.add_argument(
+            '--distributed_impl',
+            type=DistributedImpl,
+            default=DistributedImpl.DDP,
+            required=False,
+            help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
+        )
+
+        self._parser.add_argument(
+            '--distributed_backend',
+            type=DistributedBackend,
+            default=DistributedBackend.NCCL,
+            required=False,
+            help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
+        )
 
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
@@ -101,6 +116,15 @@
         if not super()._preprocess():
             return False
 
+        if self._args.distributed_impl != DistributedImpl.DDP:
+            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
+            logger.error(
+                'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
+                    self._name, self._args.distributed_impl
+                )
+            )
+            return False
+
         if ShardingMode.ALLGATHER in self._args.mode or ShardingMode.ALLREDUCE in self._args.mode:
             try:
                 torch.distributed.init_process_group(backend='nccl')
--- a/superbench/benchmarks/model_benchmarks/model_base.py
+++ b/superbench/benchmarks/model_benchmarks/model_base.py
@@ -8,7 +8,7 @@ import time
 from abc import abstractmethod
 
 from superbench.common.utils import logger
-from superbench.benchmarks import Precision, ModelAction, BenchmarkType, ReturnCode
+from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode
 from superbench.benchmarks.base import Benchmark
 from superbench.benchmarks.context import Enum
@@ -20,22 +20,6 @@ class Optimizer(Enum):
     ADAMW = 'adamw'
 
 
-class DistributedImpl(Enum):
-    """The Enum class representing different distributed implementations."""
-    DDP = 'ddp'
-    MIRRORED = 'mirrored'
-    MW_MIRRORED = 'multiworkermirrored'
-    PS = 'parameterserver'
-    HOROVOD = 'horovod'
-
-
-class DistributedBackend(Enum):
-    """The Enum class representing different distributed backends."""
-    NCCL = 'nccl'
-    MPI = 'mpi'
-    GLOO = 'gloo'
-
-
 class ModelBenchmark(Benchmark):
     """The base class of E2E model benchmarks."""
     def __init__(self, name, parameters=''):