Mirror of https://github.com/microsoft/DeepSpeed.git
fix typo in comments with deepspeed/ (#3537)
* fix spelling error with deepspeed/runtime/
* fix typo docs/
* fix typo in comments with deepspeed/

---------

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Parent: 9685eb92ab
Commit: c8d3f5eb19
@@ -673,7 +673,7 @@ def _split(input_):
 
 
 def _gather(input_):
-    """Gather tensors and concatinate along the last dimension."""
+    """Gather tensors and concatenate along the last dimension."""
     group = g_mpu.get_model_parallel_group()
 
     # Bypass the function if we are using only 1 GPU.
@@ -708,7 +708,7 @@ class _CopyToModelParallelRegion(torch.autograd.Function):
 
 
 class _ReduceFromModelParallelRegion(torch.autograd.Function):
-    """All-redcue the input from the model parallel region."""
+    """All-reduce the input from the model parallel region."""
 
     @staticmethod
     def forward(ctx, input_):
@@ -732,7 +732,7 @@ class _ScatterToModelParallelRegion(torch.autograd.Function):
 
 
 class _GatherFromModelParallelRegion(torch.autograd.Function):
-    """Gather the input from model parallel region and concatinate."""
+    """Gather the input from model parallel region and concatenate."""
 
     @staticmethod
     def forward(ctx, input_):
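
Note: the fixed docstrings describe an all-gather followed by concatenation along the last dimension. A minimal sketch of that pattern (illustrative only, not DeepSpeed's actual _gather; the group argument and single-GPU bypass are assumptions based on the context above):

import torch
import torch.distributed as dist

def gather_last_dim(input_, group=None):
    world_size = dist.get_world_size(group=group)
    if world_size == 1:  # bypass when only one GPU participates
        return input_
    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
    dist.all_gather(tensor_list, input_.contiguous(), group=group)
    return torch.cat(tensor_list, dim=-1)  # concatenate along the last dimension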
@@ -212,7 +212,7 @@ def student_initialization(student_model, teacher_model, deepspeed_config):
             The prefix name before the layer #.
             Example 1: bert.encoder.layer, for BERT_base model's prefix name
             Example 2: transformer.h, for GPT-2 hugging face prefix name
-        teacher_layer (`list of intergers`)
+        teacher_layer (`list of integers`)
             The layer of teacher will be used for student's reinitializedion
             Example 1: [1,3,5,7,9], means we want to matches the 2nd/4th/6th/8th/10th layer of teacher to the first 5 layers of student
         student_layer (`list` or None)
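
Note: the teacher_layer example above maps teacher layers [1,3,5,7,9] onto the first five student layers. A minimal sketch of that mapping, using hypothetical module lists in place of real encoder stacks:

import copy
import torch.nn as nn

teacher_layers = nn.ModuleList(nn.Linear(8, 8) for _ in range(12))
student_layers = nn.ModuleList(nn.Linear(8, 8) for _ in range(5))
teacher_layer = [1, 3, 5, 7, 9]  # 0-based: the 2nd/4th/6th/8th/10th teacher layers

for student_idx, teacher_idx in enumerate(teacher_layer):
    student_layers[student_idx] = copy.deepcopy(teacher_layers[teacher_idx])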
@@ -29,7 +29,7 @@ NEBULA_ENABLED_DEFAULT = False
 # There is a case where customer want to load the checkpoint saved
 # by raw torch. Because nebula cannot load torch checkpoint directly
 # as they have different folder structures to bring the gap for
-# loading(the data are totaly same in bytes for torch and enbula s
+# loading(the data are totally same in bytes for torch and nebula s
 # aving).
 # In this case, we must disable nebula load to use raw torch load.
 # Customer can just set NEBULA_ENABLE_NEBULA_LOAD to False. Then use
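
Note: the comment describes falling back to raw torch loading by disabling nebula load. A hypothetical config sketch; the key names are inferred from the constants in this file and should be treated as assumptions:

ds_config = {
    "nebula": {
        "enabled": True,
        "enable_nebula_load": False,  # fall back to raw torch load for torch-saved checkpoints
        "num_of_version_in_retention": 2,
    }
}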
@@ -60,7 +60,7 @@ NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT = 100
 NEBULA_NUM_OF_VERSION_IN_RETENTION = "num_of_version_in_retention"
 NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT = 2
 
-# Neubla envs
+# Nebula envs
 NEBULA_EXPORT_ENVS = [
     'DLTS_JOB_ID', 'DLTS_NUM_WORKER', 'NEBULA_PERSISTENT_STORAGE_PATH', 'NEBULA_PERSISTENT_TIME_INTERVAL',
     'AML_RUN_ID', 'AZUREML_RUN_TOKEN', 'AZUREML_WORKSPACE_SCOPE', 'AZUREML_EXPERIMENT_SCOPE',
@@ -63,7 +63,7 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer):
             algorithm from the paper `On the Convergence of Adam and Beyond`_
             (default: False) NOT SUPPORTED in DeepSpeed CPUAdam!
         adamw_mode: select between Adam and AdamW implementations (default: AdamW)
-        full_precision_optimizer_states: creates momementum and variance in full precision regardless of
+        full_precision_optimizer_states: creates momentum and variance in full precision regardless of
             the precision of the parameters (default: True)
     """
 
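
Note: a minimal usage sketch for the documented options; the import path matches DeepSpeed's ops package, the model is a stand-in, and only options confirmed by the docstring above are passed:

import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam

model = torch.nn.Linear(16, 16)
# adamw_mode selects the AdamW variant; per the docstring, optimizer states
# default to full precision regardless of parameter precision.
optimizer = DeepSpeedCPUAdam(model.parameters(), lr=1e-3, adamw_mode=True)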
@@ -148,7 +148,7 @@ class FusedAdam(torch.optim.Optimizer):
                # State initialization
                if len(state) == 0:
                    # DeepSpeed ZeRO 3 processes each subgroup a time, so we need to keep tracking step count for each tensor separately.
-                    # While this is not an issue for ZeRO 1 & 2, since they apply a single optimizatin step to the whole param group at the same time.
+                    # While this is not an issue for ZeRO 1 & 2, since they apply a single optimization step to the whole param group at the same time.
                    # In order to keep backward compatibility for the existing checkpoints, we use group['state'] to initialize state['step'] if it exists.
                    state['step'] = group.get('step', 0)
                    # Exponential moving average of gradient values
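
Note: a self-contained illustration of the backward-compatible step seeding the comment describes; group and state here are plain dicts standing in for the optimizer internals:

group = {'step': 10}  # hypothetical param group restored from an older checkpoint
state = {}            # fresh per-tensor state
if len(state) == 0:
    state['step'] = group.get('step', 0)  # falls back to 0 when no group-level step exists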
@@ -32,7 +32,7 @@ def gpt_sample_tokens(reserved_length: int,
    sampled_indices = random_ltd_module.token_sort_(sampled_indices, seq_length)
 
    # Not certain the optimized kernel is actually better here, cause it kind of screws
-    # with alignment right if the sequence length is not divisble by like 16
+    # with alignment right if the sequence length is not divisible by like 16
    # new_mask = random_ltd_module.mask_gather_gpt(attn_mask, reserved_length)
    if attn_mask is not None:
        new_mask = attn_mask[:, :, :reserved_length, :reserved_length]
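
Note: the slicing above trims the attention mask to the reserved tokens instead of calling the kernel. A self-contained illustration with assumed shapes:

import torch

attn_mask = torch.ones(2, 12, 128, 128)  # [batch, heads, seq, seq], hypothetical sizes
reserved_length = 64
new_mask = attn_mask[:, :, :reserved_length, :reserved_length]
assert new_mask.shape == (2, 12, 64, 64)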
@@ -78,7 +78,7 @@ class DenseSparsityConfig(SparsityConfig):
        super().__init__(num_heads, block, different_layout_per_head)
 
    def make_layout(self, seq_len):
-        """Set 1 to all blocks of the layout meanins the pattern is dense; not sparse.
+        """Set 1 to all blocks of the layout meaning the pattern is dense; not sparse.
 
        Arguments:
             seq_len: required: an integer determining the underling sequence length; must be <= max sequence length
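
Note: a minimal sketch of the dense block layout make_layout describes, with assumed head count and block size (the real config owns these values):

import torch

num_heads, block, seq_len = 4, 16, 256
num_blocks = seq_len // block
layout = torch.ones(num_heads, num_blocks, num_blocks, dtype=torch.int64)  # 1 everywhere = dense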
@@ -702,7 +702,7 @@ class LocalSlidingWindowSparsityConfig(SparsityConfig):
        num_blocks = layout.shape[1]
        if (num_blocks < self.num_sliding_window_blocks):
            raise ValueError(
-                f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!'
+                f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!'
            )
 
        w = self.num_sliding_window_blocks // 2
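
Note: a minimal sketch of the sliding-window layout the half-window w sets up; shapes and sizes are assumptions, not the class's actual code:

import torch

num_blocks, num_sliding_window_blocks = 8, 3
layout = torch.zeros(1, num_blocks, num_blocks, dtype=torch.int64)
w = num_sliding_window_blocks // 2
for i in range(num_blocks):
    # each block row attends to blocks within w of itself
    layout[0, i, max(0, i - w):min(num_blocks, i + w + 1)] = 1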
@@ -115,7 +115,7 @@ def get_header_c(fp):
            # multiline comment not closed on same line
            in_multiline = True
        elif l.endswith(C_ML_CLOSE):
-            # Ended a multline comment
+            # Ended a multiline comment
            in_multiline = False
        elif not in_multiline or l.startswith(C_SL_COMMENT) or l.isspace():
            # Not in a comment
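
Note: a stand-alone sketch of the comment-scanning state machine above; C_ML_OPEN, C_ML_CLOSE, and C_SL_COMMENT are assumed to be '/*', '*/', and '//', and the loop structure is a reconstruction, not the file's exact code:

C_ML_OPEN, C_ML_CLOSE, C_SL_COMMENT = '/*', '*/', '//'

def header_lines(lines):
    in_multiline = False
    header = []
    for l in (line.strip() for line in lines):
        if l.startswith(C_ML_OPEN) and not l.endswith(C_ML_CLOSE):
            in_multiline = True   # multiline comment not closed on same line
        elif l.endswith(C_ML_CLOSE):
            in_multiline = False  # ended a multiline comment
        elif not (in_multiline or l.startswith(C_SL_COMMENT) or l == ''):
            break                 # first non-comment line ends the header
        header.append(l)
    return header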
@@ -54,7 +54,7 @@ _gpt_models = [
 ]
 _opt_models = [
     "facebook/opt-125m",  # 125m, 1.7B, ..., 175B variants have the same model architecture.
-    "facebook/opt-350m",  # 350m applies layer norm after attnention layer which is different than other variants.
+    "facebook/opt-350m",  # 350m applies layer norm after attention layer which is different than other variants.
 ]
 _all_models = HfApi().list_models()
 