DeepSpeed/deepspeed/utils/bwc.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team

def bwc_tensor_model_parallel_rank(mpu=None):
    """Backwards-compatible way of querying the tensor model parallel rank from
    an ``mpu`` object.

    *Tensor* model parallelism means that tensors are physically split across
    processes. This contrasts with *pipeline* model parallelism, in which the
    layers are partitioned but tensors are left intact.

    The API for tensor model parallelism has changed across versions and this
    helper provides a best-effort implementation across versions of ``mpu``
    objects. The preferred mechanism is
    ``mpu.get_tensor_model_parallel_rank()``.

    This should "just work" with both Megatron-LM and DeepSpeed's pipeline
    parallelism.

    Args:
        mpu (model parallel unit, optional): The model parallel unit to query.
            If ``mpu=None``, returns 0. Defaults to ``None``.

    Returns:
        int: the tensor model parallel rank
    """
    if mpu is None:
        # No model parallelism, so the rank is trivially 0 :)
        return 0

    if hasattr(mpu, 'get_tensor_model_parallel_rank'):
        # New Megatron and DeepSpeed convention (post pipeline-parallelism release)
        return mpu.get_tensor_model_parallel_rank()
    elif hasattr(mpu, 'get_slice_parallel_rank'):
        # Some DeepSpeed + pipeline parallelism versions
        return mpu.get_slice_parallel_rank()
    else:
        # Deprecated Megatron and DeepSpeed convention
        return mpu.get_model_parallel_rank()
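
# Illustrative sketch (not part of the original file): the helper above dispatches
# on whichever accessor the given ``mpu`` exposes, preferring the current
# ``get_tensor_model_parallel_rank()`` name and falling back to the deprecated
# ``get_model_parallel_rank()``. ``_LegacyMPU`` below is a hypothetical stand-in.
#
#     class _LegacyMPU:
#         def get_model_parallel_rank(self):
#             return 3
#
#     bwc_tensor_model_parallel_rank(None)          # -> 0  (no model parallelism)
#     bwc_tensor_model_parallel_rank(_LegacyMPU())  # -> 3  (deprecated accessor)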


def bwc_tensor_model_parallel_world_size(mpu=None):
    """Backwards-compatible way of querying the tensor model parallel world size.

    Similar to ``bwc_tensor_model_parallel_rank``.
    """
    if mpu is None:
        return 1

    if hasattr(mpu, 'get_tensor_model_parallel_world_size'):
        # New Megatron and DeepSpeed convention (post pipeline-parallelism release)
        return mpu.get_tensor_model_parallel_world_size()
    elif hasattr(mpu, 'get_slice_parallel_world_size'):
        # Some DeepSpeed + pipeline parallelism versions
        return mpu.get_slice_parallel_world_size()
    else:
        # Deprecated Megatron and DeepSpeed convention
        return mpu.get_model_parallel_world_size()


def bwc_tensor_model_parallel_group(mpu=None):
    """Backwards-compatible way of querying the tensor model parallel group.

    Similar to ``bwc_tensor_model_parallel_rank``.
    """
    if mpu is None:
        return None

    if hasattr(mpu, 'get_tensor_model_parallel_group'):
        # New Megatron and DeepSpeed convention (post pipeline-parallelism release)
        return mpu.get_tensor_model_parallel_group()
    elif hasattr(mpu, 'get_slice_parallel_group'):
        # Some DeepSpeed + pipeline parallelism versions
        return mpu.get_slice_parallel_group()
    else:
        # Deprecated Megatron and DeepSpeed convention
        return mpu.get_model_parallel_group()


def bwc_pipeline_parallel_world_size(mpu=None):
    """Backwards-compatible way of querying the pipeline parallel world size."""
    world_size = 1
    if mpu is not None:
        if hasattr(mpu, 'get_pipeline_model_parallel_world_size'):
            # New Megatron and DeepSpeed convention (post pipeline-parallelism release)
            world_size = mpu.get_pipeline_model_parallel_world_size()
        elif hasattr(mpu, 'get_pipe_parallel_world_size'):
            # DeepSpeed Topology
            world_size = mpu.get_pipe_parallel_world_size()
    return world_size


def bwc_pipeline_parallel_group(mpu=None):
    """Backwards-compatible way of querying the pipeline parallel group."""
    if mpu is None:
        return None
    if hasattr(mpu, 'get_pipeline_model_parallel_group'):
        # Megatron
        return mpu.get_pipeline_model_parallel_group()
    elif hasattr(mpu, 'get_pipe_parallel_group'):
        # DeepSpeed Topology
        return mpu.get_pipe_parallel_group()
    assert False, 'mpu does not support pipeline parallel group'
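

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): exercises the
# backwards-compatibility dispatch above with hypothetical mock ``mpu`` objects
# that expose either the current or the deprecated accessor names. Guarded by
# ``__main__`` so importing this module is unaffected.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class _NewStyleMPU:
        """Mock mpu exposing the current Megatron/DeepSpeed accessor names."""

        def get_tensor_model_parallel_rank(self):
            return 1

        def get_tensor_model_parallel_world_size(self):
            return 4

        def get_pipeline_model_parallel_world_size(self):
            return 2

    class _LegacyMPU:
        """Mock mpu exposing only the deprecated accessor names."""

        def get_model_parallel_rank(self):
            return 3

        def get_model_parallel_world_size(self):
            return 8

    # No mpu: tensor rank defaults to 0, world sizes default to 1.
    assert bwc_tensor_model_parallel_rank(None) == 0
    assert bwc_tensor_model_parallel_world_size(None) == 1
    assert bwc_pipeline_parallel_world_size(None) == 1

    # New-style accessors are preferred when present.
    assert bwc_tensor_model_parallel_rank(_NewStyleMPU()) == 1
    assert bwc_tensor_model_parallel_world_size(_NewStyleMPU()) == 4
    assert bwc_pipeline_parallel_world_size(_NewStyleMPU()) == 2

    # Deprecated accessors are used as a last resort.
    assert bwc_tensor_model_parallel_rank(_LegacyMPU()) == 3
    assert bwc_tensor_model_parallel_world_size(_LegacyMPU()) == 8

    print("bwc dispatch sketch: all checks passed")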