DeepSpeed/deepspeed/__init__.py

'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import sys
import types

from . import ops

from .runtime.engine import DeepSpeedEngine
from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER
from .runtime.pipe.engine import PipelineEngine
from .runtime.lr_schedules import add_tuning_arguments
from .runtime.config import DeepSpeedConfig
from .runtime.activation_checkpointing import checkpointing
from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
from .utils import log_dist
from .utils.distributed import init_distributed

from .pipe import PipelineModule

from .git_version_info import version, git_hash, git_branch


def _parse_version(version_str):
    '''Parse a version string and extract the major, minor, and patch versions.'''
    import re
    matched = re.search('^(\d+)\.(\d+)\.(\d+)', version_str)
    return int(matched.group(1)), int(matched.group(2)), int(matched.group(3))


# Export version information: the full version string imported from
# git_version_info, its leading numeric major/minor/patch components, and the
# git hash and branch names imported alongside it.
__version__ = version
__version_major__, __version_minor__, __version_patch__ = _parse_version(__version__)
__git_hash__ = git_hash
__git_branch__ = git_branch

# Backwards compatibility with the old deepspeed.pt module structure (should
# hopefully not be used). An empty placeholder module is exposed as deepspeed.pt,
# and each legacy submodule name is aliased to its current module under
# deepspeed.runtime, both as an attribute of the placeholder and as a
# sys.modules entry so the dotted legacy names resolve on import.
pt = types.ModuleType('pt', 'dummy pt module for backwards compatability')
deepspeed = sys.modules[__name__]
deepspeed.pt = pt

# deepspeed.pt.deepspeed_utils -> deepspeed.runtime.utils
pt.deepspeed_utils = deepspeed.runtime.utils
sys.modules['deepspeed.pt'] = pt
sys.modules['deepspeed.pt.deepspeed_utils'] = deepspeed.runtime.utils

# deepspeed.pt.deepspeed_config -> deepspeed.runtime.config
pt.deepspeed_config = deepspeed.runtime.config
sys.modules['deepspeed.pt.deepspeed_config'] = deepspeed.runtime.config

# deepspeed.pt.loss_scaler -> deepspeed.runtime.fp16.loss_scaler
pt.loss_scaler = deepspeed.runtime.fp16.loss_scaler
sys.modules['deepspeed.pt.loss_scaler'] = deepspeed.runtime.fp16.loss_scaler


def initialize(args,
               model,
               optimizer=None,
               model_parameters=None,
               training_data=None,
               lr_scheduler=None,
               mpu=None,
               dist_init_required=None,
               collate_fn=None,
               config_params=None):
    """Initialize the DeepSpeed Engine.

    The version, git hash and git branch are logged on rank 0, then an engine
    is constructed around ``model``: a ``PipelineEngine`` when ``model`` is a
    ``PipelineModule`` (using the module's own ``mpu()``), otherwise a
    ``DeepSpeedEngine``.

    Arguments:
        args: a dictionary containing local_rank and deepspeed_config
            file location

        model: Required: nn.module class before apply any wrappers

        optimizer: Optional: a user defined optimizer, this is typically used instead of defining
            an optimizer in the DeepSpeed json config.

        model_parameters: Optional: An iterable of torch.Tensors or dicts.
            Specifies what Tensors should be optimized.

        training_data: Optional: Dataset of type torch.utils.data.Dataset

        lr_scheduler: Optional: Learning Rate Scheduler Object. It should define a get_lr(),
            step(), state_dict(), and load_state_dict() methods

        mpu: Optional: A model parallelism unit object that implements
            get_{model,data}_parallel_{rank,group,world_size}(). Must be ``None``
            when ``model`` is a ``PipelineModule``.

        dist_init_required: Optional: None will auto-initialize torch.distributed if needed,
            otherwise the user can force it to be initialized or not via boolean.

        collate_fn: Optional: Merges a list of samples to form a
            mini-batch of Tensor(s).  Used when using batched loading from a
            map-style dataset.

        config_params: Optional: forwarded unchanged to the engine constructor.
            NOTE(review): presumably an in-memory alternative to the json
            config file -- confirm against the engine implementation.

    Returns:
        A tuple of ``engine``, ``optimizer``, ``training_dataloader``, ``lr_scheduler``

        * ``engine``: DeepSpeed runtime engine which wraps the client model for distributed training.

        * ``optimizer``: Wrapped optimizer if a user defined ``optimizer`` is supplied, or if
          optimizer is specified in json config else ``None``.

        * ``training_dataloader``: DeepSpeed dataloader if ``training_data`` was supplied,
          otherwise ``None``.

        * ``lr_scheduler``: Wrapped lr scheduler if user ``lr_scheduler`` is passed, or
          if ``lr_scheduler`` specified in JSON configuration. Otherwise ``None``.
    """
    version_banner = "DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
        __version__,
        __git_hash__,
        __git_branch__)
    log_dist(version_banner, ranks=[0])

    # Every argument except ``mpu`` is passed identically to either engine type.
    shared_kwargs = dict(args=args,
                         model=model,
                         optimizer=optimizer,
                         model_parameters=model_parameters,
                         training_data=training_data,
                         lr_scheduler=lr_scheduler,
                         dist_init_required=dist_init_required,
                         collate_fn=collate_fn,
                         config_params=config_params)

    if isinstance(model, PipelineModule):
        # The pipeline module provides its own mpu, so a caller-supplied one is rejected.
        assert mpu is None, "mpu must be None with pipeline parallelism"
        engine = PipelineEngine(mpu=model.mpu(), **shared_kwargs)
    else:
        engine = DeepSpeedEngine(mpu=mpu, **shared_kwargs)

    return (engine,
            engine.optimizer,
            engine.training_dataloader,
            engine.lr_scheduler)


def _add_core_arguments(parser):
    r"""Helper (internal) function to update an argument parser with an argument group of the core DeepSpeed arguments.
        The core set of DeepSpeed arguments include the following:
        1) --deepspeed: boolean flag to enable DeepSpeed
        2) --deepspeed_config <json file path>: path of a json configuration file to configure DeepSpeed runtime.

        This is a helper function to the public add_config_arguments()

    Arguments:
        parser: argument parser
    Return:
        parser: Updated Parser
    """
    group = parser.add_argument_group('DeepSpeed', 'DeepSpeed configurations')

    group.add_argument(
        '--deepspeed',
        default=False,
        action='store_true',
        help=
        'Enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)')

    group.add_argument('--deepspeed_config',
                       default=None,
                       type=str,
                       help='DeepSpeed json configuration file.')

    group.add_argument(
        '--deepscale',
        default=False,
        action='store_true',
        help=
        'Deprecated enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)'
    )

    group.add_argument('--deepscale_config',
                       default=None,
                       type=str,
                       help='Deprecated DeepSpeed json configuration file.')

    group.add_argument(
        '--deepspeed_mpi',
        default=False,
        action='store_true',
        help=
        "Run via MPI, this will attempt to discover the necessary variables to initialize torch "
        "distributed from the MPI environment")

    return parser


def add_config_arguments(parser):
    r"""Update the argument parser to enable parsing of DeepSpeed command line arguments.

        All of the work is delegated to _add_core_arguments(); the DeepSpeed
        arguments it registers include the following:
        1) --deepspeed: boolean flag to enable DeepSpeed
        2) --deepspeed_config <json file path>: path of a json configuration file to configure DeepSpeed runtime.

    Arguments:
        parser: argument parser
    Return:
        parser: Updated Parser
    """
    return _add_core_arguments(parser)
add deepspeed init 2020-02-01 03:16:04 +03:00			`'''`
			`Copyright 2020 The Microsoft DeepSpeed Team`
			`'''`
Sparse attn + ops/runtime refactor + v0.3.0 (#343) * Sparse attn + ops/runtime refactor + v0.3.0 Co-authored-by: Arash Ashari <arashari@microsoft.com> Co-authored-by: Arash Ashari <arashari@microsoft.com> 2020-09-02 04:06:15 +03:00			`import sys`
			`import types`
add deepspeed init 2020-02-01 03:16:04 +03:00
ZeRO-Offload release (#391) * ZeRO-Offload (squash) (#381) Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com> Co-authored-by: Reza Yazdani <reyazda@microsoft.com> Co-authored-by: Jeff Rasley <jerasley@microsoft.com> Co-authored-by: Jie <37380896+jren73@users.noreply.github.com> Co-authored-by: Arash Ashari <arashari@microsoft.com> Co-authored-by: Reza Yazdani <reyazda@microsoft.com> Co-authored-by: Samyam Rajbhandari <samyamr@microsoft.com> Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com> Co-authored-by: arashashari <arashashari@ArashMSLaptop.redmond.corp.microsoft.com> Co-authored-by: RezaYazdaniAminabadi <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Co-authored-by: Reza Yazdani <reyazda@microsoft.com> Co-authored-by: Samyam Rajbhandari <samyamr@microsoft.com> Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com> 2020-09-10 03:14:12 +03:00			`from . import ops`

			`from .runtime.engine import DeepSpeedEngine`
Add CPUAdam optimizer for zero-offload in deepspeed engine (#484) * add adamW to CPU-ADAM implementation * supporting cpu-adam optimizer for zero-offload on deepspeed side * bump DSE to match cpu-adam updates Co-authored-by: Jeff Rasley <jerasley@microsoft.com> 2020-10-30 19:01:04 +03:00			`from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER`
Pipeline parallel training engine. (#392) Co-authored-by: Jeff Rasley <jerasley@microsoft.com> 2020-09-10 09:14:55 +03:00			`from .runtime.pipe.engine import PipelineEngine`
ZeRO-Offload release (#391) * ZeRO-Offload (squash) (#381) Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com> Co-authored-by: Reza Yazdani <reyazda@microsoft.com> Co-authored-by: Jeff Rasley <jerasley@microsoft.com> Co-authored-by: Jie <37380896+jren73@users.noreply.github.com> Co-authored-by: Arash Ashari <arashari@microsoft.com> Co-authored-by: Reza Yazdani <reyazda@microsoft.com> Co-authored-by: Samyam Rajbhandari <samyamr@microsoft.com> Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com> Co-authored-by: arashashari <arashashari@ArashMSLaptop.redmond.corp.microsoft.com> Co-authored-by: RezaYazdaniAminabadi <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Co-authored-by: Reza Yazdani <reyazda@microsoft.com> Co-authored-by: Samyam Rajbhandari <samyamr@microsoft.com> Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com> 2020-09-10 03:14:12 +03:00			`from .runtime.lr_schedules import add_tuning_arguments`
			`from .runtime.config import DeepSpeedConfig`
			`from .runtime.activation_checkpointing import checkpointing`
			`from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig`
Pipeline parallel training engine. (#392) Co-authored-by: Jeff Rasley <jerasley@microsoft.com> 2020-09-10 09:14:55 +03:00			`from .utils import log_dist`
Ability to initialize distributed backend outside deepspeed runtime (#608) 2020-12-18 10:17:19 +03:00			`from .utils.distributed import init_distributed`
Pipeline parallel training engine. (#392) Co-authored-by: Jeff Rasley <jerasley@microsoft.com> 2020-09-10 09:14:55 +03:00
			`from .pipe import PipelineModule`
ZeRO-2 (#217) Updates for ZeRO stage 2 + ZeRO stage 1 w. RS Co-authored-by: Tunji Ruwase <olruwase@microsoft.com> Co-authored-by: Samyam Rajbhandari <samyamr@microsoft.com> Co-authored-by: Shaden Smith <ShadenTSmith@gmail.com> Co-authored-by: Elton Zheng <eltonz@microsoft.com> Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com> Co-authored-by: yuxionghe <yuxhe@microsoft.com> Co-authored-by: Arash Ashari <arashari@microsoft.com> 2020-05-19 11:00:53 +03:00
DeepSpeed JIT op + PyPI support (#496) Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com> Co-authored-by: Reza Yazdani <reyazda@microsoft.com> 2020-11-12 22:51:38 +03:00			`from .git_version_info import version, git_hash, git_branch`


			`def _parse_version(version_str):`
			`'''Parse a version string and extract the major, minor, and patch versions.'''`
			`import re`
			`matched = re.search('^(\d+)\.(\d+)\.(\d+)', version_str)`
			`return int(matched.group(1)), int(matched.group(2)), int(matched.group(3))`

add deepspeed init 2020-02-01 03:16:04 +03:00
Moving to major/minor/patch versioning. (#51) 2020-02-10 07:03:35 +03:00			`# Export version information`
DeepSpeed JIT op + PyPI support (#496) Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com> Co-authored-by: Reza Yazdani <reyazda@microsoft.com> 2020-11-12 22:51:38 +03:00			`__version__ = version`
			`__version_major__, __version_minor__, __version_patch__ = _parse_version(__version__)`
add deepspeed init 2020-02-01 03:16:04 +03:00			`__git_hash__ = git_hash`
			`__git_branch__ = git_branch`

Sparse attn + ops/runtime refactor + v0.3.0 (#343) * Sparse attn + ops/runtime refactor + v0.3.0 Co-authored-by: Arash Ashari <arashari@microsoft.com> Co-authored-by: Arash Ashari <arashari@microsoft.com> 2020-09-02 04:06:15 +03:00			`# Provide backwards compatability with old deepspeed.pt module structure, should hopefully not be used`
			`pt = types.ModuleType('pt', 'dummy pt module for backwards compatability')`
			`deepspeed = sys.modules[__name__]`
			`setattr(deepspeed, 'pt', pt)`
			`setattr(deepspeed.pt, 'deepspeed_utils', deepspeed.runtime.utils)`
			`sys.modules['deepspeed.pt'] = deepspeed.pt`
			`sys.modules['deepspeed.pt.deepspeed_utils'] = deepspeed.runtime.utils`
			`setattr(deepspeed.pt, 'deepspeed_config', deepspeed.runtime.config)`
			`sys.modules['deepspeed.pt.deepspeed_config'] = deepspeed.runtime.config`
backwards compatability w. v020 ckpts, fix issue with zero-1 ckpts (#543) 2020-11-20 00:48:40 +03:00			`setattr(deepspeed.pt, 'loss_scaler', deepspeed.runtime.fp16.loss_scaler)`
			`sys.modules['deepspeed.pt.loss_scaler'] = deepspeed.runtime.fp16.loss_scaler`
Sparse attn + ops/runtime refactor + v0.3.0 (#343) * Sparse attn + ops/runtime refactor + v0.3.0 Co-authored-by: Arash Ashari <arashari@microsoft.com> Co-authored-by: Arash Ashari <arashari@microsoft.com> 2020-09-02 04:06:15 +03:00
add deepspeed init 2020-02-01 03:16:04 +03:00
			`def initialize(args,`
			`model,`
			`optimizer=None,`
			`model_parameters=None,`
			`training_data=None,`
			`lr_scheduler=None,`
			`mpu=None,`
Init distributed torch only if needed (#108) * add auto-detect to torch dist init * update tests to infer distributed init status * prevent crash if dist_init_required is True but already initiliazed * only init if safe to do so (forgot to add this file in prev commit) 2020-02-27 02:07:49 +03:00			`dist_init_required=None,`
ZeRO-2 (#217) Updates for ZeRO stage 2 + ZeRO stage 1 w. RS Co-authored-by: Tunji Ruwase <olruwase@microsoft.com> Co-authored-by: Samyam Rajbhandari <samyamr@microsoft.com> Co-authored-by: Shaden Smith <ShadenTSmith@gmail.com> Co-authored-by: Elton Zheng <eltonz@microsoft.com> Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com> Co-authored-by: yuxionghe <yuxhe@microsoft.com> Co-authored-by: Arash Ashari <arashari@microsoft.com> 2020-05-19 11:00:53 +03:00			`collate_fn=None,`
			`config_params=None):`
README and RTD improvements. (#198) 2020-04-22 08:18:47 +03:00			`"""Initialize the DeepSpeed Engine.`
add deepspeed init 2020-02-01 03:16:04 +03:00
			`Arguments:`
			`args: a dictionary containing local_rank and deepspeed_config`
			`file location`

			`model: Required: nn.module class before apply any wrappers`

			`optimizer: Optional: a user defined optimizer, this is typically used instead of defining`
			`an optimizer in the DeepSpeed json config.`

Improve doc string for add_XXX_arguments (#32) Unit tests for add_XXX_arguments 2020-02-07 00:14:22 +03:00			`model_parameters: Optional: An iterable of torch.Tensors or dicts.`
add deepspeed init 2020-02-01 03:16:04 +03:00			`Specifies what Tensors should be optimized.`

			`training_data: Optional: Dataset of type torch.utils.data.Dataset`

			`lr_scheduler: Optional: Learning Rate Scheduler Object. It should define a get_lr(),`
			`step(), state_dict(), and load_state_dict() methods`

			`mpu: Optional: A model parallelism unit object that implements`
Updating MPU docs (#92) 2020-02-20 08:41:57 +03:00			`get_{model,data}_parallel_{rank,group,world_size}()`
add deepspeed init 2020-02-01 03:16:04 +03:00
Init distributed torch only if needed (#108) * add auto-detect to torch dist init * update tests to infer distributed init status * prevent crash if dist_init_required is True but already initiliazed * only init if safe to do so (forgot to add this file in prev commit) 2020-02-27 02:07:49 +03:00			`dist_init_required: Optional: None will auto-initialize torch.distributed if needed,`
			`otherwise the user can force it to be initialized or not via boolean.`
add deepspeed init 2020-02-01 03:16:04 +03:00
			`collate_fn: Optional: Merges a list of samples to form a`
			`mini-batch of Tensor(s). Used when using batched loading from a`
			`map-style dataset.`

README and RTD improvements. (#198) 2020-04-22 08:18:47 +03:00			`Returns:`
			A tuple of ``engine``, ``optimizer``, ``training_dataloader``, ``lr_scheduler``
add deepspeed init 2020-02-01 03:16:04 +03:00
README and RTD improvements. (#198) 2020-04-22 08:18:47 +03:00			* ``engine``: DeepSpeed runtime engine which wraps the client model for distributed training.
Improve doc string for add_XXX_arguments (#32) Unit tests for add_XXX_arguments 2020-02-07 00:14:22 +03:00
README and RTD improvements. (#198) 2020-04-22 08:18:47 +03:00			* ``optimizer``: Wrapped optimizer if a user defined ``optimizer`` is supplied, or if
			optimizer is specified in json config else ``None``.
Improve doc string for add_XXX_arguments (#32) Unit tests for add_XXX_arguments 2020-02-07 00:14:22 +03:00
README and RTD improvements. (#198) 2020-04-22 08:18:47 +03:00			* ``training_dataloader``: DeepSpeed dataloader if ``training_data`` was supplied,
			otherwise ``None``.
Improve doc string for add_XXX_arguments (#32) Unit tests for add_XXX_arguments 2020-02-07 00:14:22 +03:00
README and RTD improvements. (#198) 2020-04-22 08:18:47 +03:00			* ``lr_scheduler``: Wrapped lr scheduler if user ``lr_scheduler`` is passed, or
			if ``lr_scheduler`` specified in JSON configuration. Otherwise ``None``.
add deepspeed init 2020-02-01 03:16:04 +03:00			`"""`
Pipeline parallel training engine. (#392) Co-authored-by: Jeff Rasley <jerasley@microsoft.com> 2020-09-10 09:14:55 +03:00			`log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(`
			`__version__,`
			`__git_hash__,`
			`__git_branch__),`
			`ranks=[0])`

			`if not isinstance(model, PipelineModule):`
			`engine = DeepSpeedEngine(args=args,`
			`model=model,`
			`optimizer=optimizer,`
			`model_parameters=model_parameters,`
			`training_data=training_data,`
			`lr_scheduler=lr_scheduler,`
			`mpu=mpu,`
			`dist_init_required=dist_init_required,`
			`collate_fn=collate_fn,`
			`config_params=config_params)`
			`else:`
			`assert mpu is None, "mpu must be None with pipeline parallelism"`
			`engine = PipelineEngine(args=args,`
			`model=model,`
			`optimizer=optimizer,`
			`model_parameters=model_parameters,`
			`training_data=training_data,`
			`lr_scheduler=lr_scheduler,`
			`mpu=model.mpu(),`
			`dist_init_required=dist_init_required,`
			`collate_fn=collate_fn,`
			`config_params=config_params)`
add deepspeed init 2020-02-01 03:16:04 +03:00
			`return_items = [`
			`engine,`
			`engine.optimizer,`
			`engine.training_dataloader,`
			`engine.lr_scheduler`
			`]`
			`return tuple(return_items)`


Improve doc string for add_XXX_arguments (#32) Unit tests for add_XXX_arguments 2020-02-07 00:14:22 +03:00			`def _add_core_arguments(parser):`
			`r"""Helper (internal) function to update an argument parser with an argument group of the core DeepSpeed arguments.`
			`The core set of DeepSpeed arguments include the following:`
			`1) --deepspeed: boolean flag to enable DeepSpeed`
			`2) --deepspeed_config <json file path>: path of a json configuration file to configure DeepSpeed runtime.`

			`This is a helper function to the public add_config_arguments()`
add deepspeed init 2020-02-01 03:16:04 +03:00
			`Arguments:`
			`parser: argument parser`
			`Return:`
			`parser: Updated Parser`
			`"""`
			`group = parser.add_argument_group('DeepSpeed', 'DeepSpeed configurations')`

add deprecated deepspeed flag for legacy code (#104) 2020-02-24 23:47:17 +03:00			`group.add_argument(`
			`'--deepspeed',`
			`default=False,`
			`action='store_true',`
			`help=`
			`'Enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)')`
add deepspeed init 2020-02-01 03:16:04 +03:00
			`group.add_argument('--deepspeed_config',`
			`default=None,`
			`type=str,`
			`help='DeepSpeed json configuration file.')`

add deprecated deepspeed flag for legacy code (#104) 2020-02-24 23:47:17 +03:00			`group.add_argument(`
			`'--deepscale',`
			`default=False,`
			`action='store_true',`
			`help=`
			`'Deprecated enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)'`
			`)`

Refactor simple model test, fix pythonpath issue (#96) Also a fix for #94 2020-02-21 01:16:41 +03:00			`group.add_argument('--deepscale_config',`
			`default=None,`
			`type=str,`
			`help='Deprecated DeepSpeed json configuration file.')`
MPI 3.x support via mpi4py (#107) * add mpirun support for openmpi 4.0 * add master addr support from args * switch mpi detection to use mpi4py * set constant for default distributed port * Make sure deepspeed_mpi exits in args 2020-02-27 18:22:57 +03:00
			`group.add_argument(`
			`'--deepspeed_mpi',`
			`default=False,`
			`action='store_true',`
			`help=`
			`"Run via MPI, this will attempt to discover the necessary variables to initialize torch "`
			`"distributed from the MPI environment")`

add deepspeed init 2020-02-01 03:16:04 +03:00			`return parser`


			`def add_config_arguments(parser):`
Improve doc string for add_XXX_arguments (#32) Unit tests for add_XXX_arguments 2020-02-07 00:14:22 +03:00			`r"""Update the argument parser to enabling parsing of DeepSpeed command line arguments.`
			`The set of DeepSpeed arguments include the following:`
			`1) --deepspeed: boolean flag to enable DeepSpeed`
			`2) --deepspeed_config <json file path>: path of a json configuration file to configure DeepSpeed runtime.`
add deepspeed init 2020-02-01 03:16:04 +03:00
			`Arguments:`
			`parser: argument parser`
			`Return:`
			`parser: Updated Parser`
			`"""`
Improve doc string for add_XXX_arguments (#32) Unit tests for add_XXX_arguments 2020-02-07 00:14:22 +03:00			`parser = _add_core_arguments(parser)`
add deepspeed init 2020-02-01 03:16:04 +03:00
			`return parser`