Mirror of https://github.com/microsoft/DeepSpeed.git

Ability to initialize distributed backend outside deepspeed runtime (#608)

Parent: fd2f970bdf
Commit: 7435b2f10a
@@ -1 +1 @@
-Subproject commit abb270641ca8c33476282bde29916c395a060ae9
+Subproject commit 78d69cb2f89a27b1e9b072df8c3e47d00c024fdc
@@ -14,6 +14,7 @@ from .runtime.config import DeepSpeedConfig
 from .runtime.activation_checkpointing import checkpointing
 from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
 from .utils import log_dist
+from .utils.distributed import init_distributed
 
 from .pipe import PipelineModule
 
@@ -0,0 +1,8 @@
+'''
+Copyright 2020 The Microsoft DeepSpeed Team
+'''
+
+#############################################
+# Torch distributed constants
+#############################################
+TORCH_DISTRIBUTED_DEFAULT_PORT = 29500
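The hunk above introduces a package-level home for the default torch.distributed rendezvous port (29500, PyTorch's own default); the following hunks make the launcher and the runtime import it from there instead of keeping private copies. As a hedged sketch, assuming the `deepspeed.init_distributed` API added later in this diff, a caller could override that port like this:

```python
# Sketch only: override the shared default port when two jobs share a node.
# Assumes the deepspeed.init_distributed(...) signature introduced later in
# this diff; the port value 29501 is illustrative.
import deepspeed

deepspeed.init_distributed(
    dist_backend="nccl",     # collective backend for GPU training
    distributed_port=29501,  # used as MASTER_PORT when MPI discovery fills in rendezvous info
)
```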
@@ -1,10 +1,5 @@
 # Copyright 2020 The Microsoft DeepSpeed Team
 
-#############################################
-# Torch distributed constants
-#############################################
-TORCH_DISTRIBUTED_DEFAULT_PORT = 29500
-
 PDSH_LAUNCHER = 'pdsh'
 PDSH_MAX_FAN_OUT = 1024
 
@@ -16,7 +16,7 @@ import base64
 from collections import defaultdict
 from argparse import ArgumentParser, REMAINDER
 
-from .constants import TORCH_DISTRIBUTED_DEFAULT_PORT
+from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT
 from ..utils import logger
 
 
@@ -19,8 +19,8 @@ from copy import deepcopy
 import torch.cuda
 
 from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner
-from .constants import TORCH_DISTRIBUTED_DEFAULT_PORT, \
-    PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER
+from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER
+from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT
 from ..utils import logger
 
 DLTS_HOSTFILE = "/job/hostfile"
@@ -73,11 +73,6 @@ MAX_GRAD_NORM = 'max_grad_norm'
 ZERO_ALLOW_UNTESTED_OPTIMIZER = "zero_allow_untested_optimizer"
 ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT = False
 
-#############################################
-# Torch distributed constants
-#############################################
-TORCH_DISTRIBUTED_DEFAULT_PORT = "29500"
-
 # Steps
 STEPS_PER_PRINT = "steps_per_print"
 STEPS_PER_PRINT_DEFAULT = 10
@@ -24,12 +24,12 @@ from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \
 from deepspeed.runtime.dataloader import DeepSpeedDataLoader
 from deepspeed.runtime.constants import \
     ROUTE_TRAIN, ROUTE_PREDICT, ROUTE_EVAL, \
-    TORCH_DISTRIBUTED_DEFAULT_PORT, PLD_THETA, PLD_GAMMA
+    PLD_THETA, PLD_GAMMA
 from deepspeed.runtime.zero.constants import \
     ZERO_OPTIMIZATION_OPTIMIZER_STATES, ZERO_OPTIMIZATION_GRADIENTS
 from deepspeed.runtime.csr_tensor import CSRTensor
 import deepspeed.runtime.lr_schedules as lr_schedules
-from deepspeed.utils import logger, log_dist
+from deepspeed.utils import logger, log_dist, init_distributed
 from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer
 from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop
 
@@ -130,29 +130,14 @@ class DeepSpeedEngine(Module):
         if dist_init_required is False:
             assert (dist.is_initialized()==True), "Torch distributed not initialized. Please set dist_init_required to True or initialize before calling deepspeed.initialize()"
 
-        # DeepSpeed will initialize torch distributed only if the user has not already intialized it.
-        if dist_init_required and not dist.is_initialized():
-            # discover using mpi4py if user specifies the flag
-            if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi:
-                # if in Azure ML environment and user specified this flag, notify the user to remove the flag.
-                if self._in_aml():
-                    logger.warning(
-                        "Please remove the --deepspeed_mpi flag if running on AzureML.")
-                self._mpi_check(args, dist_init_required)
-            else:
-                # detect if we are in Azure ML environment
-                if self._in_aml():
-                    self._set_environment_variables_for_nccl_backend(args)
-
-            logger.info("Initializing torch distributed with backend: {}".format(
-                self.dist_backend))
-            dist.init_process_group(backend=self.dist_backend)
+        # Initialize torch distributed if needed
+        init_distributed(dist_backend=self.dist_backend)
 
         self._do_args_sanity_check(args)
         self._configure_with_arguments(args, mpu)
         self._do_sanity_check()
 
-        self._init_distributed(dist_init_required)
+        self._set_distributed_vars()
 
         if self.tensorboard_enabled() and self.global_rank == 0:
             self.summary_writer = self.get_summary_writer()
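After this hunk, `deepspeed.initialize()` simply delegates to `init_distributed`, which is a no-op when a process group already exists, and `dist_init_required=False` only asserts that the user set one up. A minimal sketch of the resulting call pattern; the args namespace, toy model, and `ds_config.json` path are placeholder assumptions, not taken from this commit:

```python
# Sketch of the contract after this change; placeholders are noted inline.
import argparse
import torch
import torch.distributed as dist
import deepspeed

# Bring up torch.distributed before deepspeed.initialize(), for example when a
# model-parallel topology has to be constructed first. init_distributed() is a
# no-op if the process group already exists.
deepspeed.init_distributed(dist_backend="nccl")
assert dist.is_initialized()

# Placeholder arguments; "ds_config.json" is an assumed config path.
args = argparse.Namespace(local_rank=0, deepspeed_config="ds_config.json")
model = torch.nn.Linear(8, 8)

# dist_init_required=False now just verifies the group exists instead of
# re-running the MPI/AzureML discovery logic removed from the engine below.
engine, _, _, _ = deepspeed.initialize(args=args,
                                       model=model,
                                       model_parameters=model.parameters(),
                                       dist_init_required=False)
```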
@@ -209,87 +194,6 @@ class DeepSpeedEngine(Module):
         self.flatten = util_ops.flatten
         self.unflatten = util_ops.unflatten
 
-    def _in_aml(self):
-        # read AzureML environment variable to detect if we are using an Azure ML environment
-        if 'AZUREML_EXPERIMENT_ID' in os.environ:
-            return True
-        else:
-            return False
-
-    def _set_environment_variables_for_nccl_backend(self,
-                                                    args,
-                                                    master_port=6105,
-                                                    verbose=True):
-        """Helper routine to get and set environment variables.
-        This is adapted from Azure ML's documentation available from:
-        https://azure.github.io/azureml-web/docs/cheatsheet/distributed-training/#environment-variables-from-openmpi
-        """
-        os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
-        os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"]
-        single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int(
-            os.environ["WORLD_SIZE"])
-        if not single_node:
-            master_node_params = os.environ["AZ_BATCH_MASTER_NODE"].split(":")
-            os.environ["MASTER_ADDR"] = master_node_params[0]
-            # Do not overwrite master port with that defined in AZ_BATCH_MASTER_NODE
-            if "MASTER_PORT" not in os.environ:
-                os.environ["MASTER_PORT"] = str(master_port)
-        else:
-            os.environ["MASTER_ADDR"] = os.environ["AZ_BATCHAI_MPI_MASTER_NODE"]
-            os.environ["MASTER_PORT"] = "54965"
-        print("NCCL_SOCKET_IFNAME original value = {}".format(
-            os.environ["NCCL_SOCKET_IFNAME"]))
-
-        os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo"
-        args.local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])
-
-        if verbose:
-            logger.info(
-                "Discovered AzureML settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}"
-                .format(os.environ['RANK'],
-                        args.local_rank,
-                        os.environ['WORLD_SIZE'],
-                        os.environ['MASTER_ADDR'],
-                        os.environ['MASTER_PORT']))
-
-    def _mpi_check(self, args, dist_init_required):
-        from mpi4py import MPI
-        import subprocess
-        comm = MPI.COMM_WORLD
-        rank = comm.Get_rank()
-        world_size = comm.Get_size()
-
-        master_addr = None
-        if rank == 0:
-            hostname_cmd = ["hostname -I"]
-            result = subprocess.check_output(hostname_cmd, shell=True)
-            master_addr = result.decode('utf-8').split()[0]
-        master_addr = comm.bcast(master_addr, root=0)
-
-        # Determine local rank by assuming hostnames are unique
-        proc_name = MPI.Get_processor_name()
-        all_procs = comm.allgather(proc_name)
-        local_rank = sum([i == proc_name for i in all_procs[:rank]])
-
-        os.environ['RANK'] = str(rank)
-        os.environ['WORLD_SIZE'] = str(world_size)
-        args.local_rank = local_rank
-        os.environ['MASTER_ADDR'] = master_addr
-        os.environ['MASTER_PORT'] = TORCH_DISTRIBUTED_DEFAULT_PORT
-
-        logger.info(
-            "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}"
-            .format(os.environ['RANK'],
-                    args.local_rank,
-                    os.environ['WORLD_SIZE'],
-                    os.environ['MASTER_ADDR'],
-                    os.environ['MASTER_PORT']))
-
-        if not dist_init_required and dist.is_initialized():
-            assert dist.get_rank() == rank, "MPI rank {} does not match torch rank {}".format(rank, dist.get_rank())
-            assert dist.get_world_size() == world_size, "MPI world size {} does not match torch world size {}".format(
-                world_size, dist.get_world_size())
-
     def pld_enabled(self):
         return self._config.pld_enabled
 
@@ -497,7 +401,7 @@ class DeepSpeedEngine(Module):
         else:
             return None
 
-    def _init_distributed(self, dist_init_required):
+    def _set_distributed_vars(self):
         if self.local_rank >= 0:
             torch.cuda.set_device(self.local_rank)
             self.device = torch.device("cuda", self.local_rank)
@@ -1,2 +1,3 @@
-from deepspeed.utils.logging import logger, log_dist
+from .logging import logger, log_dist
+from .distributed import init_distributed
 from deepspeed.runtime.dataloader import RepeatingLoader
@@ -0,0 +1,129 @@
+'''
+Copyright 2020 The Microsoft DeepSpeed Team
+'''
+import os
+import torch
+
+from .logging import logger
+from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT
+
+
+def init_distributed(dist_backend="nccl",
+                     auto_mpi_discovery=True,
+                     distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT,
+                     verbose=True):
+    """
+    Initialize torch.distributed backend, potentially performing MPI discovery if needed
+    Arguments:
+        dist_backend (str): torch distributed backend, e.g., nccl, mpi, gloo
+        auto_mpi_discovery (bool): if distributed environment variables are not set, attempt to discover them from MPI
+        distributed_port (int, optional): torch distributed backend port
+        verbose (bool, optional): verbose logging
+    """
+
+    required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"]
+    if auto_mpi_discovery and not all(map(lambda v: v in os.environ, required_env)):
+        if verbose:
+            logger.info(
+                "Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment..."
+            )
+        if in_aml() and not in_dlts():
+            patch_aml_env_for_torch_nccl_backend(verbose=verbose)
+        else:
+            mpi_discovery(distributed_port=distributed_port, verbose=verbose)
+
+    if not torch.distributed.is_initialized():
+        if verbose:
+            logger.info(
+                "Initializing torch distributed with backend: {}".format(dist_backend))
+        torch.distributed.init_process_group(backend=dist_backend)
+
+
+def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True):
+    """
+    Discovery MPI environment via mpi4py and map to relevant torch.distributed state
+    """
+    from mpi4py import MPI
+    import subprocess
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    world_size = comm.Get_size()
+
+    master_addr = None
+    if rank == 0:
+        hostname_cmd = ["hostname -I"]
+        result = subprocess.check_output(hostname_cmd, shell=True)
+        master_addr = result.decode('utf-8').split()[0]
+    master_addr = comm.bcast(master_addr, root=0)
+
+    # Determine local rank by assuming hostnames are unique
+    proc_name = MPI.Get_processor_name()
+    all_procs = comm.allgather(proc_name)
+    local_rank = sum([i == proc_name for i in all_procs[:rank]])
+
+    os.environ['RANK'] = str(rank)
+    os.environ['WORLD_SIZE'] = str(world_size)
+    os.environ['LOCAL_RANK'] = str(local_rank)
+    os.environ['MASTER_ADDR'] = master_addr
+    os.environ['MASTER_PORT'] = str(distributed_port)
+
+    if verbose:
+        logger.info(
+            "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}"
+            .format(os.environ['RANK'],
+                    os.environ['LOCAL_RANK'],
+                    os.environ['WORLD_SIZE'],
+                    os.environ['MASTER_ADDR'],
+                    os.environ['MASTER_PORT']))
+
+    if torch.distributed.is_initialized():
+        assert dist.get_rank() == rank, "MPI rank {} does not match torch rank {}".format(rank, dist.get_rank())
+        assert dist.get_world_size() == world_size, "MPI world size {} does not match torch world size {}".format(
+            world_size, dist.get_world_size())
+
+
+def in_aml():
+    # Are we running inside an Azure Machine Learning (AML) environment?
+    return 'AZUREML_EXPERIMENT_ID' in os.environ
+
+
+def in_dlts():
+    # Are we running on a DLTS cluster?
+    return 'DLTS_JOB_ID' in os.environ
+
+
+def patch_aml_env_for_torch_nccl_backend(master_port=6105, verbose=True):
+    """Helper routine to get and set environment variables.
+    This is adapted from Azure ML's documentation available from:
+    https://azure.github.io/azureml-web/docs/cheatsheet/distributed-training/#environment-variables-from-openmpi
+    """
+    os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
+    os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"]
+    single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int(
+        os.environ["WORLD_SIZE"])
+
+    if not single_node:
+        master_node_params = os.environ["AZ_BATCH_MASTER_NODE"].split(":")
+        os.environ["MASTER_ADDR"] = master_node_params[0]
+        # Do not overwrite master port with that defined in AZ_BATCH_MASTER_NODE
+        if "MASTER_PORT" not in os.environ:
+            os.environ["MASTER_PORT"] = str(master_port)
+    else:
+        os.environ["MASTER_ADDR"] = os.environ["AZ_BATCHAI_MPI_MASTER_NODE"]
+        os.environ["MASTER_PORT"] = "54965"
+
+    if verbose:
+        logger.info("NCCL_SOCKET_IFNAME original value = {}".format(
+            os.environ["NCCL_SOCKET_IFNAME"]))
+
+    os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo"
+    os.environ['LOCAL_RANK'] = os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]
+
+    if verbose:
+        logger.info(
+            "Discovered AzureML settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}"
+            .format(os.environ['RANK'],
+                    os.environ['LOCAL_RANK'],
+                    os.environ['WORLD_SIZE'],
+                    os.environ['MASTER_ADDR'],
+                    os.environ['MASTER_PORT']))
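The helper above resolves state in a fixed order: if any of RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT, or LOCAL_RANK is missing and `auto_mpi_discovery` is enabled, it patches the environment from AzureML's OpenMPI variables or via mpi4py, then calls `torch.distributed.init_process_group` only if no group exists yet. A small single-process sketch of driving it directly; the gloo backend and loopback values are illustrative assumptions so the snippet runs without a launcher:

```python
# Sketch: initialize the backend outside the DeepSpeed launcher. When these
# variables are already exported (torch.distributed.launch, AzureML, mpirun),
# init_distributed() uses them as-is and skips discovery entirely.
import os
import torch.distributed as dist
import deepspeed

os.environ.setdefault("RANK", "0")
os.environ.setdefault("LOCAL_RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")

deepspeed.init_distributed(dist_backend="gloo")  # "nccl" is the default on GPU clusters
print(dist.get_rank(), dist.get_world_size())
```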
@@ -216,25 +216,27 @@ DeepSpeed will then make sure that these environment variables are set when
 launching each process on every node across their training job.
 
 
-### MPI Compatibility
+### MPI and AzureML Compatibility
 As described above, DeepSpeed provides its own parallel launcher to help launch
 multi-node/multi-gpu training jobs. If you prefer to launch your training job
 using MPI (e.g., mpirun), we provide support for this. It should be noted that
 DeepSpeed will still use the torch distributed NCCL backend and *not* the MPI
-backend. To launch your training job with mpirun + DeepSpeed you simply pass us
-an additional flag `--deepspeed_mpi`. DeepSpeed will then use
-[mpi4py](https://pypi.org/project/mpi4py/) to discover the MPI environment (e.g.,
-rank, world size) and properly initialize torch distributed for training. In this
-case you will explicitly invoke `python` to launch your model script instead of using
-the `deepspeed` launcher, here is an example:
-```bash
-mpirun <mpi-args> python \
-    <client_entry.py> <client args> \
-    --deepspeed_mpi --deepspeed --deepspeed_config ds_config.json
-```
+backend.
 
-If you want to use this feature of DeepSpeed, please ensure that mpi4py is
-installed via `pip install mpi4py`.
+To launch your training job with mpirun + DeepSpeed or with AzureML (which uses
+mpirun as a launcher backend) you simply need to install the
+[mpi4py](https://pypi.org/project/mpi4py/) python package. DeepSpeed will use
+this to discover the MPI environment and pass the necessary state (e.g., world
+size, rank) to the torch distributed backend.
+
+If you are using model parallelism, pipeline parallelism, or otherwise require
+torch.distributed calls before calling `deepspeed.initialize(..)` we provide
+the same MPI support with an additional DeepSpeed API call. Replace your initial
+`torch.distributed.init_process_group(..)` call with:
+
+```python
+deepspeed.init_distributed()
+```
 
 ## Resource Configuration (single-node)
 In the case that we are only running on a single node (with one or more GPUs)
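The new tutorial text above covers the case where collective calls are needed before the engine is constructed. A hedged sketch of that pattern; the subgroup carved out here is a hypothetical example of pre-initialize setup, not DeepSpeed API and not part of this commit:

```python
# Sketch of the "torch.distributed calls before deepspeed.initialize()" case
# described in the tutorial hunk above.
import torch.distributed as dist
import deepspeed

deepspeed.init_distributed()  # replaces torch.distributed.init_process_group()

# Any group or topology setup can now run before the engine is built,
# e.g. a two-rank subgroup for a toy model-parallel layout:
world_size = dist.get_world_size()
mp_group = dist.new_group(ranks=list(range(min(2, world_size))))

# ...construct the model / mpu around mp_group, then call deepspeed.initialize(...).
```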
@@ -171,5 +171,5 @@ else
     pdcp -w $hosts dist/deepspeed*.whl $tmp_wheel_path/
     pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/deepspeed*.whl"
     pdsh -w $hosts "ds_report"
-    pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; rmdir $tmp_wheel_path; fi"
+    pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; rm $tmp_wheel_path/*.txt; rmdir $tmp_wheel_path; fi"
 fi
@@ -5,6 +5,8 @@ import torch
 import torch.distributed as dist
 from torch.multiprocessing import Process
 
+import deepspeed
+
 import pytest
 
 # Worker timeout *after* the first worker has completed.
@@ -33,10 +35,12 @@ def distributed_test(world_size=2, backend='nccl'):
             """Initialize torch.distributed and execute the user function. """
             os.environ['MASTER_ADDR'] = '127.0.0.1'
             os.environ['MASTER_PORT'] = '29503'
-            dist.init_process_group(backend=backend,
-                                    init_method='env://',
-                                    rank=local_rank,
-                                    world_size=num_procs)
+            os.environ['LOCAL_RANK'] = str(local_rank)
+            # NOTE: unit tests don't support multi-node so local_rank == global rank
+            os.environ['RANK'] = str(local_rank)
+            os.environ['WORLD_SIZE'] = str(num_procs)
+
+            deepspeed.init_distributed(dist_backend=backend)
 
             if torch.cuda.is_available():
                 torch.cuda.set_device(local_rank)
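The unit-test harness above now feeds `deepspeed.init_distributed` through the same environment variables the launchers would set, rather than calling `dist.init_process_group` with explicit rank arguments. A condensed sketch of that pattern outside pytest, assuming a single-node two-process spawn and the gloo backend so it runs without GPUs:

```python
# Sketch: spawn two local workers and let deepspeed.init_distributed() read the
# rendezvous info from the environment, mirroring the test-harness change above.
import os
import torch.distributed as dist
from torch.multiprocessing import Process
import deepspeed


def worker(local_rank, world_size):
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29503'
    os.environ['LOCAL_RANK'] = str(local_rank)
    os.environ['RANK'] = str(local_rank)      # single node: local rank == global rank
    os.environ['WORLD_SIZE'] = str(world_size)
    deepspeed.init_distributed(dist_backend='gloo')  # the harness defaults to 'nccl'
    assert dist.get_world_size() == world_size


if __name__ == '__main__':
    procs = [Process(target=worker, args=(rank, 2)) for rank in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
```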