2023-03-31 03:14:38 +03:00
|
|
|
# Copyright (c) Microsoft Corporation.
|
|
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
|
|
|
# DeepSpeed Team
|
2021-11-02 00:22:09 +03:00
|
|
|
|
2020-09-02 04:06:15 +03:00
|
|
|
import sys
|
|
|
|
import types
|
2022-11-23 00:06:36 +03:00
|
|
|
import json
|
2021-08-25 23:01:07 +03:00
|
|
|
from typing import Optional, Union
|
|
|
|
import torch
|
|
|
|
from torch.optim import Optimizer
|
|
|
|
from torch.optim.lr_scheduler import _LRScheduler
|
2021-05-24 11:10:39 +03:00
|
|
|
from packaging import version as pkg_version
|
2020-02-01 03:16:04 +03:00
|
|
|
|
2020-09-10 03:14:12 +03:00
|
|
|
from . import ops
|
2021-05-24 11:10:39 +03:00
|
|
|
from . import module_inject
|
2020-09-10 03:14:12 +03:00
|
|
|
|
2023-04-21 02:55:12 +03:00
|
|
|
from .accelerator import get_accelerator
|
2021-08-25 23:01:07 +03:00
|
|
|
from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpeedSchedulerCallable
|
2020-10-30 19:01:04 +03:00
|
|
|
from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER
|
2023-04-11 21:53:38 +03:00
|
|
|
from .runtime.hybrid_engine import DeepSpeedHybridEngine
|
2020-09-10 09:14:55 +03:00
|
|
|
from .runtime.pipe.engine import PipelineEngine
|
2021-05-24 11:10:39 +03:00
|
|
|
from .inference.engine import InferenceEngine
|
2022-11-15 03:45:43 +03:00
|
|
|
from .inference.config import DeepSpeedInferenceConfig
|
2020-09-10 03:14:12 +03:00
|
|
|
from .runtime.lr_schedules import add_tuning_arguments
|
2021-01-15 00:38:46 +03:00
|
|
|
from .runtime.config import DeepSpeedConfig, DeepSpeedConfigError
|
2020-09-10 03:14:12 +03:00
|
|
|
from .runtime.activation_checkpointing import checkpointing
|
|
|
|
from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
|
2021-05-24 11:10:39 +03:00
|
|
|
from .module_inject import replace_transformer_layer, revert_transformer_layer
|
|
|
|
|
2023-04-11 21:53:38 +03:00
|
|
|
from .utils import log_dist, OnDevice, logger
|
2022-06-11 02:47:33 +03:00
|
|
|
from .comm.comm import init_distributed
|
2020-09-10 09:14:55 +03:00
|
|
|
|
2021-03-08 23:54:54 +03:00
|
|
|
from .runtime import zero
|
2022-05-11 20:09:06 +03:00
|
|
|
from .runtime import DeepSpeedOptimizer, ZeROOptimizer
|
2021-03-08 23:54:54 +03:00
|
|
|
|
2020-09-10 09:14:55 +03:00
|
|
|
from .pipe import PipelineModule
|
2020-05-19 11:00:53 +03:00
|
|
|
|
2020-11-12 22:51:38 +03:00
|
|
|
from .git_version_info import version, git_hash, git_branch
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_version(version_str):
    """Split a version string into its (major, minor, patch) integer components."""
    parsed = pkg_version.parse(version_str)
    return parsed.major, parsed.minor, parsed.micro
|
2020-11-12 22:51:38 +03:00
|
|
|
|
2020-02-01 03:16:04 +03:00
|
|
|
|
2020-02-10 07:03:35 +03:00
|
|
|
# Export version information
# `version`, `git_hash`, and `git_branch` come from .git_version_info (generated at build time).
__version__ = version
# Numeric components of __version__, split out for convenient comparison.
__version_major__, __version_minor__, __version_patch__ = _parse_version(__version__)
__git_hash__ = git_hash
__git_branch__ = git_branch

# Set to torch's distributed package or deepspeed.comm based inside DeepSpeedEngine init
# (rebound via `global dist` in initialize()).
dist = None
|
|
|
|
|
2020-02-01 03:16:04 +03:00
|
|
|
|
2021-03-16 22:38:08 +03:00
|
|
|
def initialize(args=None,
               model: torch.nn.Module = None,
               optimizer: Optional[Union[Optimizer, DeepSpeedOptimizerCallable]] = None,
               model_parameters: Optional[torch.nn.Module] = None,
               training_data: Optional[torch.utils.data.Dataset] = None,
               lr_scheduler: Optional[Union[_LRScheduler, DeepSpeedSchedulerCallable]] = None,
               mpu=None,
               dist_init_required: Optional[bool] = None,
               collate_fn=None,
               config=None,
               config_params=None):
    """Initialize the DeepSpeed Engine.

    Arguments:
        args: an object containing local_rank and deepspeed_config fields.
            This is optional if `config` is passed.

        model: Required: nn.module class before apply any wrappers

        optimizer: Optional: a user defined Optimizer or Callable that returns an Optimizer object.
            This overrides any optimizer definition in the DeepSpeed json config.

        model_parameters: Optional: An iterable of torch.Tensors or dicts.
            Specifies what Tensors should be optimized.

        training_data: Optional: Dataset of type torch.utils.data.Dataset

        lr_scheduler: Optional: Learning Rate Scheduler Object or a Callable that takes an Optimizer and returns a Scheduler object.
            The scheduler object should define a get_lr(), step(), state_dict(), and load_state_dict() methods

        mpu: Optional: A model parallelism unit object that implements
            get_{model,data}_parallel_{rank,group,world_size}()

        dist_init_required: Optional: None will auto-initialize torch distributed if needed,
            otherwise the user can force it to be initialized or not via boolean.

        collate_fn: Optional: Merges a list of samples to form a
            mini-batch of Tensor(s). Used when using batched loading from a
            map-style dataset.

        config: Optional: Instead of requiring args.deepspeed_config you can pass your deepspeed config
            as an argument instead, as a path or a dictionary.

        config_params: Optional: Same as `config`, kept for backwards compatibility.

    Returns:
        A tuple of ``engine``, ``optimizer``, ``training_dataloader``, ``lr_scheduler``

        * ``engine``: DeepSpeed runtime engine which wraps the client model for distributed training.

        * ``optimizer``: Wrapped optimizer if a user defined ``optimizer`` is supplied, or if
          optimizer is specified in json config else ``None``.

        * ``training_dataloader``: DeepSpeed dataloader if ``training_data`` was supplied,
          otherwise ``None``.

        * ``lr_scheduler``: Wrapped lr scheduler if user ``lr_scheduler`` is passed, or
          if ``lr_scheduler`` specified in JSON configuration. Otherwise ``None``.
    """
    log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(__version__, __git_hash__,
                                                                             __git_branch__),
             ranks=[0])

    # Disable zero.Init context if it's currently enabled
    zero.partition_parameters.shutdown_init_context()

    assert model is not None, "deepspeed.initialize requires a model"

    # Rebind the module-level `dist` handle to deepspeed.comm and initialize the
    # distributed backend appropriate for the current accelerator.
    global dist
    from deepspeed import comm as dist
    dist_backend = get_accelerator().communication_backend_name()
    dist.init_distributed(dist_backend=dist_backend, dist_init_required=dist_init_required)

    # Set config using config_params for backwards compat
    if config is None and config_params is not None:
        config = config_params

    # Check for deepscale_config for backwards compat
    if hasattr(args, "deepscale_config") and args.deepscale_config is not None:
        logger.warning("************ --deepscale_config is deprecated, please use --deepspeed_config ************")
        if hasattr(args, "deepspeed_config"):
            assert (args.deepspeed_config is
                    None), "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config"
        args.deepspeed_config = args.deepscale_config
        args.deepscale_config = None

    # Check that we have only one config passed
    if hasattr(args, "deepspeed_config") and args.deepspeed_config is not None:
        assert config is None, "Not sure how to proceed, we were given deepspeed configs in the deepspeed arguments and deepspeed.initialize() function call"
        config = args.deepspeed_config
    # NOTE: was `config != None`; `is not None` is the correct identity test (PEP 8 E711).
    assert config is not None, "DeepSpeed requires --deepspeed_config to specify configuration file"

    # All three engine flavors accept the identical constructor keyword set, so
    # resolve the class first and construct once instead of duplicating the call.
    if not isinstance(model, PipelineModule):
        config_class = DeepSpeedConfig(config, mpu)
        engine_cls = DeepSpeedHybridEngine if config_class.hybrid_engine.enabled else DeepSpeedEngine
    else:
        assert mpu is None, "mpu must be None with pipeline parallelism"
        mpu = model.mpu()
        config_class = DeepSpeedConfig(config, mpu)
        engine_cls = PipelineEngine

    engine = engine_cls(args=args,
                        model=model,
                        optimizer=optimizer,
                        model_parameters=model_parameters,
                        training_data=training_data,
                        lr_scheduler=lr_scheduler,
                        mpu=mpu,
                        dist_init_required=dist_init_required,
                        collate_fn=collate_fn,
                        config=config,
                        config_class=config_class)

    return_items = [engine, engine.optimizer, engine.training_dataloader, engine.lr_scheduler]
    return tuple(return_items)
|
|
|
|
|
|
|
|
|
2020-02-07 00:14:22 +03:00
|
|
|
def _add_core_arguments(parser):
|
|
|
|
r"""Helper (internal) function to update an argument parser with an argument group of the core DeepSpeed arguments.
|
|
|
|
The core set of DeepSpeed arguments include the following:
|
|
|
|
1) --deepspeed: boolean flag to enable DeepSpeed
|
|
|
|
2) --deepspeed_config <json file path>: path of a json configuration file to configure DeepSpeed runtime.
|
|
|
|
|
|
|
|
This is a helper function to the public add_config_arguments()
|
2020-02-01 03:16:04 +03:00
|
|
|
|
|
|
|
Arguments:
|
|
|
|
parser: argument parser
|
|
|
|
Return:
|
|
|
|
parser: Updated Parser
|
|
|
|
"""
|
|
|
|
group = parser.add_argument_group('DeepSpeed', 'DeepSpeed configurations')
|
|
|
|
|
2023-03-27 14:55:19 +03:00
|
|
|
group.add_argument('--deepspeed',
|
|
|
|
default=False,
|
|
|
|
action='store_true',
|
|
|
|
help='Enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)')
|
2020-02-01 03:16:04 +03:00
|
|
|
|
2023-03-27 14:55:19 +03:00
|
|
|
group.add_argument('--deepspeed_config', default=None, type=str, help='DeepSpeed json configuration file.')
|
2020-02-01 03:16:04 +03:00
|
|
|
|
2023-03-27 14:55:19 +03:00
|
|
|
group.add_argument('--deepscale',
|
|
|
|
default=False,
|
|
|
|
action='store_true',
|
|
|
|
help='Deprecated enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)')
|
2020-02-24 23:47:17 +03:00
|
|
|
|
2020-02-21 01:16:41 +03:00
|
|
|
group.add_argument('--deepscale_config',
|
|
|
|
default=None,
|
|
|
|
type=str,
|
|
|
|
help='Deprecated DeepSpeed json configuration file.')
|
2020-02-27 18:22:57 +03:00
|
|
|
|
2023-03-27 14:55:19 +03:00
|
|
|
group.add_argument('--deepspeed_mpi',
|
|
|
|
default=False,
|
|
|
|
action='store_true',
|
|
|
|
help="Run via MPI, this will attempt to discover the necessary variables to initialize torch "
|
|
|
|
"distributed from the MPI environment")
|
2020-02-27 18:22:57 +03:00
|
|
|
|
2020-02-01 03:16:04 +03:00
|
|
|
return parser
|
|
|
|
|
|
|
|
|
|
|
|
def add_config_arguments(parser):
    r"""Update the argument parser to enabling parsing of DeepSpeed command line arguments.

    The set of DeepSpeed arguments include the following:
    1) --deepspeed: boolean flag to enable DeepSpeed
    2) --deepspeed_config <json file path>: path of a json configuration file to configure DeepSpeed runtime.

    Arguments:
        parser: argument parser
    Return:
        parser: Updated Parser
    """
    # Public entry point; the actual argument group is attached by the internal helper.
    return _add_core_arguments(parser)
|
2021-05-24 11:10:39 +03:00
|
|
|
|
|
|
|
|
2022-11-15 03:45:43 +03:00
|
|
|
def default_inference_config():
    """
    Return a default DeepSpeed inference configuration dictionary.
    """
    default_config = DeepSpeedInferenceConfig()
    return default_config.dict()
|
|
|
|
|
|
|
|
|
2022-11-23 05:00:11 +03:00
|
|
|
def init_inference(model, config=None, **kwargs):
    """Initialize the DeepSpeed InferenceEngine.

    Description: all four cases are valid and supported in DS init_inference() API.

    # Case 1: user provides no config and no kwargs. Default config will be used.

    .. code-block:: python

        generator.model = deepspeed.init_inference(generator.model)
        string = generator("DeepSpeed is")
        print(string)

    # Case 2: user provides a config and no kwargs. User supplied config will be used.

    .. code-block:: python

        generator.model = deepspeed.init_inference(generator.model, config=config)
        string = generator("DeepSpeed is")
        print(string)

    # Case 3: user provides no config and uses keyword arguments (kwargs) only.

    .. code-block:: python

        generator.model = deepspeed.init_inference(generator.model,
                                                    mp_size=world_size,
                                                    dtype=torch.half,
                                                    replace_with_kernel_inject=True)
        string = generator("DeepSpeed is")
        print(string)

    # Case 4: user provides config and keyword arguments (kwargs). Both config and kwargs are merged and kwargs take precedence.

    .. code-block:: python

        generator.model = deepspeed.init_inference(generator.model, config={"dtype": torch.half}, replace_with_kernel_inject=True)
        string = generator("DeepSpeed is")
        print(string)

    Arguments:
        model: Required: original nn.module object without any wrappers

        config: Optional: instead of arguments, you can pass in a DS inference config dict or path to JSON file

    Returns:
        A deepspeed.InferenceEngine wrapped model.
    """
    log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(__version__, __git_hash__,
                                                                             __git_branch__),
             ranks=[0])

    # Normalize `config` into a plain dictionary: accept None (empty), a JSON
    # file path, or an already-built dict.
    if config is None:
        config = {}
    if isinstance(config, str):
        with open(config, "r") as f:
            config_dict = json.load(f)
    elif isinstance(config, dict):
        config_dict = config
    else:
        raise ValueError(f"'config' argument expected string or dictionary, got {type(config)}")

    # Merge kwargs into the config. A key may appear in both only when the two
    # values agree; otherwise the call is ambiguous and we refuse it.
    for key in set(config_dict).intersection(kwargs):
        if config_dict[key] != kwargs[key]:
            raise ValueError(f"Conflicting argument '{key}' in 'config':{config_dict[key]} and kwargs:{kwargs[key]}")
    config_dict.update(kwargs)

    ds_inference_config = DeepSpeedInferenceConfig(**config_dict)

    engine = InferenceEngine(model, config=ds_inference_config)

    return engine
|