Mirror of https://github.com/microsoft/DeepSpeed.git
Add codespell to pre-commit checks (#1717)
Parent: 09c065b4c3
Commit: 4cf970e6bb
@@ -32,3 +32,16 @@ repos:
   hooks:
     - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
       args: []
+
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.1.0
+  hooks:
+    - id: codespell
+      args: [
+          # Do not check files that are automatically generated
+          '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json',
+          '--ignore-regex=\\n', # Do not count the 'n' in an escaped newline as part of a word
+          '--ignore-words-list=unsupport', # Word used in error messages that need rewording
+          --check-filenames,
+          --check-hidden
+      ]
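For anyone who wants to run the same check outside of pre-commit, the sketch below shells out to codespell with the flags the new hook passes. The flag values come from the hunk above; scanning the repository root and using `subprocess` are assumptions made for the example, and codespell itself must already be installed.

```python
import subprocess

# Mirror the hook's arguments from the .pre-commit-config.yaml hunk above.
# Scanning the repository root (".") and the subprocess invocation are
# assumptions for this example; codespell must be installed separately.
cmd = [
    "codespell",
    "--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json",
    r"--ignore-regex=\\n",           # same regex the hook passes: a literal backslash-n
    "--ignore-words-list=unsupport",
    "--check-filenames",
    "--check-hidden",
    ".",
]
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)                 # lines look like: path:line: misspelling ==> suggestion
raise SystemExit(result.returncode)  # non-zero when misspellings were found
```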
@@ -55,7 +55,7 @@ confidence=
 # can either give multiple identifiers separated by comma (,) or put this
 # option multiple times (only on the command line, not in the configuration
 # file where it should appear only once). You can also use "--disable=all" to
-# disable everything first and then reenable specific checks. For example, if
+# disable everything first and then re-enable specific checks. For example, if
 # you want to run only the similarities checker, you can use "--disable=all
 # --enable=similarities". If you want to run only the classes checker, but have
 # no Warning level messages displayed, use "--disable=all --enable=classes
@@ -11,9 +11,9 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
-    m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchornous I/O Read");
+    m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchronous I/O Read");

-    m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchornous I/O Write");
+    m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchronous I/O Write");

     m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy");
@@ -61,7 +61,7 @@ at::Tensor lamb(at::Tensor& p,

     // intermediate for weight L2 reduction
     // make sure that the threads per block is at least 512 during the kernel launch otherwise the
-    // behavious is unexpected
+    // behaviour is unexpected
     at::Tensor w_l2_i = at::empty(
         {512},
         p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float

@@ -69,7 +69,7 @@ at::Tensor lamb(at::Tensor& p,

     // intermediate for update L2 reduction
     // make sure that the threads per block is at least 512 during the kernel launch otherwise the
-    // behavious is unexpected
+    // behaviour is unexpected
     at::Tensor u_l2_i = at::empty(
         {512},
         p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float
@@ -167,7 +167,7 @@ For example, the following section in the DeepSpeed configuration file limits th
 }
 ```

-The entry bellow asks the Autotuner to use `4` as the micro-batch size per GPU in tuning (micro-batch size per GPU is fixed as 4). Note that it's different from using ` "train_micro_batch_size_per_gpu": [4]` which asks the Autotuner to tune micro-batch size per GPU starting from `4`.
+The entry below asks the Autotuner to use `4` as the micro-batch size per GPU in tuning (micro-batch size per GPU is fixed as 4). Note that it's different from using ` "train_micro_batch_size_per_gpu": [4]` which asks the Autotuner to tune micro-batch size per GPU starting from `4`.
 ```json
 {
   "train_micro_batch_size_per_gpu": [4],
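To make the distinction drawn in the documentation hunk above concrete, here is a hedged sketch of the two configuration entries written as Python dicts (all surrounding configuration keys omitted); in practice they would appear in the DeepSpeed JSON config file.

```python
# Fixed value: the Autotuner uses exactly this micro-batch size per GPU.
fixed_mbs_config = {
    "train_micro_batch_size_per_gpu": 4,
}

# List form: the Autotuner treats 4 as a starting point and tunes the
# micro-batch size per GPU from there.
tuned_mbs_config = {
    "train_micro_batch_size_per_gpu": [4],
}
```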
@@ -341,7 +341,7 @@ class Autotuner:
         else:
             return exps

-        # replace the corresponding parameter values if the user specfies them in the DeepSpeed configuration file
+        # replace the corresponding parameter values if the user specifies them in the DeepSpeed configuration file
         replace_dict(tuning_space,
                      self.user_config,
                      [ZERO_OPTIMIZATION,

@@ -511,7 +511,7 @@ class Autotuner:
         max_train_batch_size_per_gpu = 0
         tuning_micro_batch_sizes_overwritten = False

-        # calcuate max micro batch size using gpu memory, model instatiation memory and activation memory
+        # calculate max micro batch size using gpu memory, model instantiation memory and activation memory
         # calculated_max_micro_batch_size = (memory_per_gpu - instantiation_memory) // activation_memory_micro_batch_size_1
         calculated_max_micro_batch_size = int(
             self.gpu_mem -
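The second comment in the hunk above states the formula the autotuner applies. A quick arithmetic sanity check with made-up numbers follows; every value below is an assumption for illustration, not a measurement.

```python
# Hypothetical numbers, purely to illustrate the formula
# calculated_max_micro_batch_size = (memory_per_gpu - instantiation_memory) // activation_memory_micro_batch_size_1
gpu_mem = 40 * 2**30                          # 40 GiB of device memory (assumed)
instantiation_memory = 14 * 2**30             # memory for model weights / optimizer state (assumed)
activation_mem_per_micro_batch_1 = 2 * 2**30  # activation memory at micro-batch size 1 (assumed)

calculated_max_micro_batch_size = int(
    (gpu_mem - instantiation_memory) // activation_mem_per_micro_batch_1)
print(calculated_max_micro_batch_size)        # -> 13
```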
@@ -584,11 +584,11 @@ class Autotuner:
             logger.info(f"End tuning for space: {tuning_space_name}")
             return max_micro_batch_size, fast_best_mbs, fast_best_metric_val

-        # if the best metric or the micro batch size for that best metric in the current Zero stage after tuning micro batch size is less than the corrresponding value in the prevous Zero stage, return, do not tune other Zero configuration paramerts
+        # if the best metric or the micro batch size for that best metric in the current Zero stage after tuning micro batch size is less than the corresponding value in the previous Zero stage, return, do not tune other Zero configuration parameters
         if stage > 0:
             if fast_best_mbs <= prev_best_mbs or fast_best_metric_val < prev_best_metric_val:
                 logger.info(
-                    f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration paramerts."
+                    f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration parameters."
                 )
                 return max_micro_batch_size, fast_best_mbs, fast_best_metric_val

@@ -665,7 +665,7 @@ class Autotuner:
         """Does a model information profling experiment that collects the number of model parameters and activation memory.\
         The experiment produces a "profile_model_info" folder under self.results_dir.
         Returns:
-            [dict]: a model inforation dictionary, e.g., {"num_params": 335144976, "trainable_num_params": 335144976, "activation_mem_per_gpu": 324358144, "rank": 0}
+            [dict]: a model information dictionary, e.g., {"num_params": 335144976, "trainable_num_params": 335144976, "activation_mem_per_gpu": 324358144, "rank": 0}
         """
         logger.info("Starting model info profile run.")
         model_info = self.autotuning_config.model_info
@@ -137,7 +137,7 @@ MODEL_INFO_KEY_DEFAULT_DICT = {
 }

 #########################################
-# autotunner serach space constants
+# autotunner search space constants
 #########################################

 DEFAULT_HF_CONFIG = {
@@ -241,7 +241,7 @@ class ResourceManager:
         for exp_id, (exp, err) in self.finished_experiments.items():
             if err:
                 logger.info(
-                    f"The experiment exp_id = {exp_id}, exp_name = {exp['name']}, did not run succesfully with error = {err}, thus a metrics.txt does not exist for it. Check the stderr.log in {exp['result_dir']}"
+                    f"The experiment exp_id = {exp_id}, exp_name = {exp['name']}, did not run successfully with error = {err}, thus a metrics.txt does not exist for it. Check the stderr.log in {exp['result_dir']}"
                 )
                 continue

@@ -1,7 +1,7 @@
 # Tuner

-`exps` is a list of experiment descriptions (dictionarys).
+`exps` is a list of experiment descriptions (dictionaries).
 An experimentation description has a `ds_config` field that stores the DeepSpeed configuration to be used in the experiment.

 A tuner is based on BaseTuner and at least implements the `next_batch` method. It can implement a different `tune` method from the BaseTuner's.
@@ -120,7 +120,7 @@ class ModelBasedTuner(BaseTuner):
             feature_val = []
             if err:
                 logger.info(
-                    f"Skipping exp_id = {exp_id}, exp_name = {exp['name']}, the experiment did not run succesfully with error = {err}, thus a metrics.txt does not exist for it. Please check the stderr.log in {exp['result_dir']}"
+                    f"Skipping exp_id = {exp_id}, exp_name = {exp['name']}, the experiment did not run successfully with error = {err}, thus a metrics.txt does not exist for it. Please check the stderr.log in {exp['result_dir']}"
                 )
             ds_config = exp["ds_config"]
             flattened_ds_config = flatten(ds_config)
@@ -317,7 +317,7 @@ def canonical_name(config: dict, tuning_keys=None, prefix="", omit_val=False):
     Args:
         config (dict): the config dict used to generate the name
         tuning_keys (list, optional): the tuning keys used to generate the name. Defaults to None.
-        prefix (str, optional): a string added to the begining of the name. Defaults to None.
+        prefix (str, optional): a string added to the beginning of the name. Defaults to None.
     """
     if TRAIN_MICRO_BATCH_SIZE_PER_GPU not in tuning_keys:
         tuning_keys.append(TRAIN_MICRO_BATCH_SIZE_PER_GPU)
@@ -107,7 +107,7 @@ def parse_arguments():
         '--hide_operator_status',
         action='store_true',
         help=
-        'Suppress display of installation and compatiblity statuses of DeepSpeed operators. '
+        'Suppress display of installation and compatibility statuses of DeepSpeed operators. '
     )
     parser.add_argument('--hide_errors_and_warnings',
                         action='store_true',
@@ -16,7 +16,7 @@ Below is an example output for BERT-Large(NVIDIA) on an A100 GPU with batch size
 -------------------------- DeepSpeed Flops Profiler --------------------------
 Profile Summary at step 10:
 Notations:
-data parallel size (dp_size), model paralel size(mp_size),
+data parallel size (dp_size), model parallel size(mp_size),
 number of parameters (params), number of multiply-accumulate operations(MACs),
 number of floating-point operations (flops), floating-point operations per second (FLOPS),
 fwd latency (forward propagation latency), bwd latency (backward propagation latency),

@@ -24,7 +24,7 @@ step (weights update latency), iter latency (sum of fwd, bwd and step latency)

 world size: 1
 data parallel size: 1
-model paralel size: 1
+model parallel size: 1
 batch size per GPU: 80
 params per gpu: 336.23 M
 params of model = params per GPU * mp_size: 336.23 M
@@ -160,7 +160,7 @@ The DeepSpeed Flops Profiler can be used with the DeepSpeed runtime or as a stan
 - [Example Training Workflow](#example-training-workflow)
 ### Usage With the DeepSpeed Runtime

-When using DeepSpeed for model training, the profiler can be configured in the deepspeed configuration file. No explict API calls are needed to use the profiler. The profiler can be enabled by adding the following field to the `deepspeed_config` json file. Refer to [flops profiler](https://www.deepspeed.ai/docs/config-json/#flops-profiler) for details.
+When using DeepSpeed for model training, the profiler can be configured in the deepspeed configuration file. No explicit API calls are needed to use the profiler. The profiler can be enabled by adding the following field to the `deepspeed_config` json file. Refer to [flops profiler](https://www.deepspeed.ai/docs/config-json/#flops-profiler) for details.

 ```json
 {

@@ -185,7 +185,7 @@ An example output of 12-layer Megatron-LM model (`hidden_size = 8192, num_attent
 -------------------------- DeepSpeed Flops Profiler --------------------------
 Profile Summary at step 10:
 Notations:
-data parallel size (dp_size), model paralel size(mp_size),
+data parallel size (dp_size), model parallel size(mp_size),
 number of parameters (params), number of multiply-accumulate operations(MACs),
 number of floating-point operations (flops), floating-point operations per second (FLOPS),
 fwd latency (forward propagation latency), bwd latency (backward propagation latency),

@@ -193,7 +193,7 @@ step (weights update latency), iter latency (sum of fwd, bwd and step latency)

 world size: 1
 data parallel size: 1
-model paralel size: 1
+model parallel size: 1
 batch size per GPU: 1024
 params per gpu: 1.29 M
 params of model = params per GPU * mp_size: 1.29 M
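The usage hunk above enables the profiler through the DeepSpeed configuration file, but the hunk is cut off before the JSON body. As a hedged illustration, the relevant section could look like the dict below; the field names follow the flops profiler documentation linked above, while the values are arbitrary example choices.

```python
# Example "flops_profiler" section of a DeepSpeed config, written as a Python dict.
# Field names follow the flops profiler docs; values are illustrative only.
ds_config = {
    "flops_profiler": {
        "enabled": True,      # turn the profiler on
        "profile_step": 10,   # profile at step 10, matching the sample output above
        "module_depth": -1,   # aggregate down to the deepest module
        "top_modules": 1,     # number of top modules shown in the aggregated profile
        "detailed": True,     # include the per-module breakdown
    }
}
```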
@@ -47,7 +47,7 @@ def reduce_scatter_coalesced(
     """simultaneously reduce-scatter a list of tensors - this can be done more
     efficiently than individual reduce scatter calls

-    TODO. see if PyTorch team wants a c++ verson of this for ProcessGroupNCCL
+    TODO. see if PyTorch team wants a c++ version of this for ProcessGroupNCCL
     """
     this_rank = torch.distributed.get_rank(group)
     world_sz = torch.distributed.get_world_size(group)
@@ -861,7 +861,7 @@ class DeepSpeedEngine(Module):
     def _configure_with_arguments(self, args, mpu):
         # After the distributed backend is initialized we are guaranteed the LOCAL_RANK
         # environment variable is set. We must align args.local_rank to this value for
-        # backwards compatability with scripts relying on [args|self].local_rank containing
+        # backwards compatibility with scripts relying on [args|self].local_rank containing
         # the correct local rank info. _do_args_sanity_check will ensure this is the case.

         if "OMPI_COMM_WORLD_LOCAL_RANK" in os.environ:

@@ -2005,7 +2005,7 @@ class DeepSpeedEngine(Module):
                 msg["latency"]
             print_json_dist(msg, [0], path=self.autotuning_metric_path())
             import atexit
-            atexit.register(print, "Autotuning: done with runing current ds config.")
+            atexit.register(print, "Autotuning: done with running current ds config.")
             exit()

     def _write_tensorboard(self):

@@ -2290,7 +2290,7 @@ class DeepSpeedEngine(Module):
                 global_expert_id = expp_rank * num_local_experts + local_expert_id
                 expert_state_dict = torch.load(self._get_expert_ckpt_name(
                     checkpoint_path,
-                    -1, # -1 means ingore layer_id
+                    -1, # -1 means ignore layer_id
                     global_expert_id,
                     tag),
                                                map_location=torch.device('cpu'))
@@ -55,7 +55,7 @@ class FP16_UnfusedOptimizer(object):
             #copied to fp16 weights
             fp32_group = [p.clone().float().detach() for p in param_group['params']]

-            #incase the internal optimizer needs it
+            #in case the internal optimizer needs it
            for p in fp32_group:
                p.requires_grad = True

@@ -314,7 +314,7 @@ class LRRangeTest(object):
     the paper `A disciplined approach to neural network hyper-parameters: Part1`_.

     LRRT policy is used for finding maximum LR that trains a model without divergence, and can be used to
-    configure the LR boundaries for Cylic LR schedules.
+    configure the LR boundaries for Cyclic LR schedules.

     LRRT changes the learning rate after every batch.
     `step` should be called after a batch has been used for training.

@@ -325,7 +325,7 @@ class LRRangeTest(object):
             lower boundary in the range test for each parameter group.
         lr_range_test_step_size (int): Interval of training steps to increase learning rate. Default: 2000
         lr_range_test_step_rate (float): Scaling rate for range test. Default: 1.0
-        lr_range_test_staircase (bool): Scale in staircase fashion, rather than continous. Default: False.
+        lr_range_test_staircase (bool): Scale in staircase fashion, rather than continuous. Default: False.
         last_batch_iteration (int): The index of the last batch. This parameter is used when
             resuming a training job. Since `step()` should be invoked after each
             batch instead of after each epoch, this number represents the total
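The argument list above maps onto a DeepSpeed scheduler section. A hedged sketch follows, written as a Python dict; the parameter names come from the docstring above (the minimum-LR key name is assumed from the scheduler docs) and the numeric values are arbitrary examples.

```python
# Scheduler section of a DeepSpeed config using the LRRangeTest policy.
ds_config_scheduler = {
    "scheduler": {
        "type": "LRRangeTest",
        "params": {
            "lr_range_test_min_lr": 1e-5,      # lower boundary of the range test (name assumed, value is an example)
            "lr_range_test_step_size": 2000,   # documented default
            "lr_range_test_step_rate": 1.0,    # documented default
            "lr_range_test_staircase": False,  # documented default
        },
    },
}
```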
@@ -156,7 +156,7 @@ class PipelineEngine(DeepSpeedEngine):
                 f'TOTAL_PARAMS={total_params} ({total_params/1e6:0.3f}M) '
                 f'UNIQUE_PARAMS={unique_params} ({unique_params/1e6:0.3f}M)')

-        #intialize peer-2-peer communication and allreduce groups
+        #initialize peer-2-peer communication and allreduce groups
         if self.is_pipe_parallel:
             p2p.init_process_groups(self.grid)

@@ -47,7 +47,7 @@ def _is_valid_send_recv(src_stage, dest_stage):

 def send(tensor, dest_stage, async_op=False):
     global _groups
-    assert async_op == False, "Doesnt support async_op true"
+    assert async_op == False, "Doesn't support async_op true"
     src_stage = _grid.get_stage_id()
     _is_valid_send_recv(src_stage, dest_stage)

@@ -68,7 +68,7 @@ def send(tensor, dest_stage, async_op=False):

 def recv(tensor, src_stage, async_op=False):
     global _groups
-    assert async_op == False, "Doesnt support async_op true"
+    assert async_op == False, "Doesn't support async_op true"
     dest_stage = _grid.get_stage_id()
     _is_valid_send_recv(src_stage, dest_stage)

@@ -191,7 +191,7 @@ class ProcessTopology:
             return True

         coords = filter(_filter_helper, self.mapping.keys())
-        return [self.mapping[coo] for coo in coords]
+        return [self.mapping[coord] for coord in coords]

     def get_axis_list(self, axis, idx):
         """Returns the list of global ranks whose coordinate in an axis is idx.
@@ -125,7 +125,7 @@ ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE_DEFAULT = False
 ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS = 'ignore_unused_parameters'
 ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS_DEFAULT = True

-# Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatability reasons
+# Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatibility reasons
 ZERO_OPTIMIZATION_LEGACY_STAGE1 = "legacy_stage1"
 ZERO_OPTIMIZATION_LEGACY_STAGE1_DEFAULT = False

@@ -96,7 +96,7 @@ class ContiguousMemoryAllocator(object):
         print_rank_0(
             f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}."
         )
-        assert self.total_free - tensor_size == free_before, "Release bookeeping error"
+        assert self.total_free - tensor_size == free_before, "Release bookkeeping error"

     def release_tensor_with_id(self, tensor_id):
         free_before = self.total_free

@@ -109,7 +109,7 @@ class ContiguousMemoryAllocator(object):
         print_rank_0(
             f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}."
         )
-        assert self.total_free - tensor_size == free_before, "Release bookeeping error"
+        assert self.total_free - tensor_size == free_before, "Release bookkeeping error"

     #shows the current memory allocation at specified resolution
     def print_allocation(self, resolution=200):
@@ -691,7 +691,7 @@ class Init(InsertPostInitMethodToModuleSubClasses):

         self._validate_remote_device(remote_device, _ds_config)

-        # Remote device is the device where parameter partiitons are stored
+        # Remote device is the device where parameter partitions are stored
         # It can be same as local_device or it could be CPU or NVMe.
         self.remote_device = self.local_device if remote_device is None else remote_device
         self.pin_memory = pin_memory if (self.remote_device
@@ -232,7 +232,7 @@ class PartitionedParameterCoordinator:
         # this is a much less elegant way of fixing this vs something like using
         # cudaMallocAsync/cudaFreeAsync. Choosing to not expose this to the user now
         # because ideally in the future its replaced by an async allocation
-        # mechanism which doesnt require any configuration by the user.
+        # mechanism which doesn't require any configuration by the user.
         self.__ongoing_fetch_events: Deque[Event] = collections.deque()
         # TODO. make this configurable via JSON
         self.__max_ongoing_fetch_events: int = 2

@@ -250,7 +250,7 @@ class PartitionedParameterCoordinator:
         """adds sub module to trace"""
         if self.trace_complete:
             raise RuntimeError(
-                "attemted to record trace when trace was already complete")
+                "attempted to record trace when trace was already complete")

         self.__submodule_order.append(sub_module)
         for param in sorted(set(iter_params(sub_module)), key=lambda p: p.ds_id):
@@ -597,7 +597,7 @@ class DeepSpeedZeroOptimizer_Stage3(object):
         # - assume all model params in fp16
         # - assume all params requires grad
         # - flat by groups, not keeping state. TODO: remove state explicitly?
-        # - master gard and unflat master weight never exist. TODO: a way to save out unflat master?
+        # - master grad and unflat master weight never exist. TODO: a way to save out unflat master?
         if not torch.cuda.is_available:
             raise SystemError("Cannot use fp16 without CUDA.")
         self.optimizer = init_optimizer

@@ -867,7 +867,7 @@ class DeepSpeedZeroOptimizer_Stage3(object):
         # stores if a grad in a partition has been computed or not
         self.is_grad_computed = {}

-        # will store the averaged gradients required by this paritition
+        # will store the averaged gradients required by this partition
         self.averaged_gradients = {}

         #creates backward hooks for gradient partitioning

@@ -1011,8 +1011,8 @@ class DeepSpeedZeroOptimizer_Stage3(object):

     def _move_to_flat_buffer(self, param_list, flat_buffer, avoid_copy=False):
         '''If flat buffer is None then the parameters in the param_list are
-        not copied to the flat buffer. This is because they excede the number of max_params_in_cpu
-        Some of these parameters may aready be in CPU in unflattened buffers
+        not copied to the flat buffer. This is because they exceed the number of max_params_in_cpu
+        Some of these parameters may already be in CPU in unflattened buffers
         or they maybe in GPU, or they maybe in NVME. If they are in NVME, then
         they will be marked as NOT_AVAILABLE, and will be moved to CPU when they are
         needed during training.'''
@@ -132,7 +132,7 @@ class DeepSpeedZeroOptimizer(object):
         # - assume all model params in fp16
         # - assume all params requires grad
         # - flat by groups, not keeping state. TODO: remove state explicitly?
-        # - master gard and unflat master weight never exist. TODO: a way to save out unflat master?
+        # - master grad and unflat master weight never exist. TODO: a way to save out unflat master?
         if not torch.cuda.is_available:
             raise SystemError("Cannot use fp16 without CUDA.")
         self.optimizer = init_optimizer

@@ -391,7 +391,7 @@ class DeepSpeedZeroOptimizer(object):
         # simplified param id
         self.param_id = {}

-        #interesting code: unique ids being assigned to individual paramters
+        #interesting code: unique ids being assigned to individual parameters
         largest_param_numel = 0
         count = 0
         for i, params_group in enumerate(self.bit16_groups):
@@ -653,9 +653,9 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s

 <i>**overwrite**</i>: [boolean]

-| Description | Default |
-| ----------------------------------------------------------------------------------------- | ------- |
-| Whether to run autotuing experiments whose results alreay exsit. Setting it to true would overwrite the existing result. | `false` |
+| Description | Default |
+|---------------------------------------------------------------------------------------------------------------------------| ------- |
+| Whether to run autotuing experiments whose results already exist. Setting it to true would overwrite the existing result. | `false` |


 <i>**metric**</i>: [string]
@@ -28,7 +28,7 @@ There are three stages in ZeRO corresponding to three model states, as shown in
 </a>
 Figure 1. Overview of ZeRO memory savings

-In addition to these three stages, ZeRO family of technology also consists of ZeRO-2 Offload. ZeRO-2 Offload is a heterogenous DL training technology that works in conjunction with ZeRO-2 to offload partitioned optimizer states and gradients to CPU memory. ZeRO-2 Offload offers the full memory advantage of ZeRO-2 even on a single GPU, while at the same time offering great scalability of ZeRO-2 on multi-GPU setup. DeepSpeed library has been offering ZeRO-2 Offload since Sept 2020. For details, please see below:
+In addition to these three stages, ZeRO family of technology also consists of ZeRO-2 Offload. ZeRO-2 Offload is a heterogeneous DL training technology that works in conjunction with ZeRO-2 to offload partitioned optimizer states and gradients to CPU memory. ZeRO-2 Offload offers the full memory advantage of ZeRO-2 even on a single GPU, while at the same time offering great scalability of ZeRO-2 on multi-GPU setup. DeepSpeed library has been offering ZeRO-2 Offload since Sept 2020. For details, please see below:

 * ZeRO: [Stage 1 blog](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/), [Stage 2 blog](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/), [Tutorial](/tutorials/zero)
 * ZeRO-Offload: [Blog](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/#toc-heading-3), [Tutorials](/tutorials/zero-offload), [Paper link](https://arxiv.org/abs/2101.06840)
@@ -27,20 +27,20 @@ MoQ quantization schedule is defined by a number of parameters which allow users

 `quantize_groups`: Quantization groups, which shows the number of scales used to quantize a model, default is 1.

-`quantize_bits`, The numer of bits to control the data-precision transition from a start-bit to the final target-bits (e.g. starting from 16-bit down to 8-bit).
+`quantize_bits`, The number of bits to control the data-precision transition from a start-bit to the final target-bits (e.g. starting from 16-bit down to 8-bit).

 `start_bits`: The start bits in quantization training. Default is set to 16.
 `target_bits`: The target bits in quantization training. Default is set to 16.

 `quantize_schedule`, This determines how to schedule the training steps at each precision level.

-`quantize_period`: indicates the period by which we reduce down the precison (number of bits) for quantization. By default, we use a period of 100 training steps, that will be doubled every time the precision reduces by 1 bit.
+`quantize_period`: indicates the period by which we reduce down the precision (number of bits) for quantization. By default, we use a period of 100 training steps, that will be doubled every time the precision reduces by 1 bit.
 `schedule_offset`: indicates when the quantization starts to happen (before this offset, we just use the normal training precision which can be either FP32/FP16). Default is set to 100 steps.

 `quantize_algo`, The algorithm used to quantize the model.

 `q_type`: we currently support symmetric and asymmetric quantization that result in signed and unsigned integer values, respectively. Default is set to symmetric
-`rounding`: for the rounding of the quantized values, we can either round to the nearest value or use stocahstic rounding. Default is set to nearest.
+`rounding`: for the rounding of the quantized values, we can either round to the nearest value or use stochastic rounding. Default is set to nearest.

 ### Eigenvalue Parameters

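For orientation, the MoQ parameters above can be grouped as sketched below. The nesting and key spellings here are an assumption made for readability only; the exact schema DeepSpeed expects is defined in the MoQ tutorial and may differ.

```python
# Hypothetical grouping of the MoQ parameters described above; the exact
# nesting and key spellings expected by DeepSpeed may differ (see the MoQ tutorial).
moq_settings = {
    "quantize_groups": 1,        # number of scales used to quantize the model
    "quantize_bits": {
        "start_bits": 16,        # precision at the start of training
        "target_bits": 8,        # final precision (e.g. 16-bit down to 8-bit)
    },
    "quantize_schedule": {
        "quantize_period": 100,  # steps per precision level, doubled after each 1-bit drop
        "schedule_offset": 100,  # step at which quantization begins
    },
    "quantize_algo": {
        "q_type": "symmetric",   # or "asymmetric" (unsigned integer values)
        "rounding": "nearest",   # or stochastic rounding
    },
}
```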
@@ -20,7 +20,7 @@ Below is an example output for BERT-Large(NVIDIA) on an A100 GPU with batch size
 -------------------------- DeepSpeed Flops Profiler --------------------------
 Profile Summary at step 10:
 Notations:
-data parallel size (dp_size), model paralel size(mp_size),
+data parallel size (dp_size), model parallel size(mp_size),
 number of parameters (params), number of multiply-accumulate operations(MACs),
 number of floating-point operations (flops), floating-point operations per second (FLOPS),
 fwd latency (forward propagation latency), bwd latency (backward propagation latency),

@@ -28,7 +28,7 @@ step (weights update latency), iter latency (sum of fwd, bwd and step latency)

 world size: 1
 data parallel size: 1
-model paralel size: 1
+model parallel size: 1
 batch size per GPU: 80
 params per gpu: 336.23 M
 params of model = params per GPU * mp_size: 336.23 M
@@ -166,7 +166,7 @@ The DeepSpeed Flops Profiler can be used with the DeepSpeed runtime or as a stan

 ### Usage With the DeepSpeed Runtime

-When using DeepSpeed for model training, the profiler can be configured in the deepspeed [configuration file](/docs/config-json/#flops-profiler). No explict API calls are needed to use the profiler. The profiler can be enabled by adding the following field to deepspeed's configuration json file. Refer to [flops profiler](/docs/config-json/#flops-profiler) for details.
+When using DeepSpeed for model training, the profiler can be configured in the deepspeed [configuration file](/docs/config-json/#flops-profiler). No explicit API calls are needed to use the profiler. The profiler can be enabled by adding the following field to deepspeed's configuration json file. Refer to [flops profiler](/docs/config-json/#flops-profiler) for details.

 ```json
 {

@@ -191,7 +191,7 @@ An example output of 12-layer Megatron-LM model (`hidden_size = 8192, num_attent
 -------------------------- DeepSpeed Flops Profiler --------------------------
 Profile Summary at step 10:
 Notations:
-data parallel size (dp_size), model paralel size(mp_size),
+data parallel size (dp_size), model parallel size(mp_size),
 number of parameters (params), number of multiply-accumulate operations(MACs),
 number of floating-point operations (flops), floating-point operations per second (FLOPS),
 fwd latency (forward propagation latency), bwd latency (backward propagation latency),

@@ -199,7 +199,7 @@ step (weights update latency), iter latency (sum of fwd, bwd and step latency)

 world size: 1
 data parallel size: 1
-model paralel size: 1
+model parallel size: 1
 batch size per GPU: 1024
 params per gpu: 1.29 M
 params of model = params per GPU * mp_size: 1.29 M
@@ -80,7 +80,7 @@ self.experts = deepspeed.moe.layer.MoE(hidden_size=input_dim, expert=ExpertModul
 ```
 With the above two commands, the DeepSpeed runtime will be set to train an MoE model with a total of 8 experts on 4 GPUs in 4 experts/GPU mode. We call this the E + D mode as described earlier in the table.

-For more advanced use case of the groups API including the inter-operability with Megatron style mpu object, watch this space!
+For more advanced use case of the groups API including the interoperability with Megatron style mpu object, watch this space!


 ```python
@@ -24,7 +24,7 @@ In this part, we elaborate the usage of MoE inference support in the DeepSpeed l

 First step to use DeepSpeed-MoE inferenece is to initialize the expert-parallel groups. To do so, one can use the group utility from DeepSpeed to initialize the group (`deepspeed.utils.groups.initialize`). This function creates the groups based on minimum of the world\_size (total number of GPUs) and expert size. By using this group, we can partition the experts among the expert-parallel GPUs. If number of experts is lower than total number of GPUs, DeepSpeed-MoE leverages expert-slicing for partitioning the expert parameters between the expert-parallel GPUs.

-For inference with DeepSpeed-MoE, use `init_inference` API to load the MoE model for inference. Here, you can specify the Model-parallelism/tensor-slicing (MP) degree, number of experts, and if the model has not been loaded with the appropriate checkpoint, you can also provide the checkpoint description using a `json` file or simply pass the `'checkpoint'` path to load the moddel. To inject the high-performance inference kernels, you can pass int the `replace_method` as `'auto'` and set the `replace_with_kernel_inject` to True.
+For inference with DeepSpeed-MoE, use `init_inference` API to load the MoE model for inference. Here, you can specify the Model-parallelism/tensor-slicing (MP) degree, number of experts, and if the model has not been loaded with the appropriate checkpoint, you can also provide the checkpoint description using a `json` file or simply pass the `'checkpoint'` path to load the model. To inject the high-performance inference kernels, you can pass int the `replace_method` as `'auto'` and set the `replace_with_kernel_inject` to True.

 ```python

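The paragraph above describes the `init_inference` call in words; a hedged sketch of such a call follows. The keyword names mirror the MoE inference documentation of this era but may differ between DeepSpeed versions, and `model`, `inputs`, the expert count, and the checkpoint path are placeholders, not values from this commit.

```python
import torch
import deepspeed

# `model` is assumed to be an MoE model already built with deepspeed.moe.layer.MoE,
# with the expert-parallel groups initialized via deepspeed.utils.groups.initialize.
# All concrete values below are placeholders for illustration only.
engine = deepspeed.init_inference(
    model,
    mp_size=1,                         # model-parallelism / tensor-slicing degree
    dtype=torch.half,
    moe_experts=8,                     # number of experts in the checkpoint (placeholder)
    checkpoint="/path/to/checkpoint",  # or a json checkpoint description (placeholder)
    replace_method="auto",             # let DeepSpeed locate the modules to replace
    replace_with_kernel_inject=True,   # inject the high-performance inference kernels
)
output = engine(inputs)                # `inputs` is a placeholder batch
```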
@@ -184,7 +184,7 @@ This structure also combines the idea of local, global and random attention. Fur
 * `global_block_end_indices`: a list of integers determining end indices of global window blocks. By default this is not used. But if it is set, it must have the same size as `global_block_indices` parameter, and combining this two parameters, for each index `i`, blocks from `global_block_indices[i]` to `global_block_end_indices[i]` (exclusive) are considered as global attention block.
 * `attention`: a string determining attention type. Attention can be `unidirectional`, such as autoregressive models, in which tokens attend only to tokens appear before them in the context. Considering that, the upper triangular of attention matrix is empty as above figure. Or it can be `bidirectional`, such as BERT, in which tokens can attend to any other tokens before or after them. Then, the upper triangular part of the attention matrix is mirror of the lower triangular in the above figure.
 * `horizontal_global_attention`: a boolean determining if blocks that are global representative of a local window, also attend to all other blocks. This is valid only if attention type is `bidirectional`. Looking at the attention matrix, that means global attention not only includes the vertical blocks, but also horizontal blocks
-Figure bellow illustrates an example of `variable` sparsity, in which blue, orange and green blocks illustrate local, global, and random attention blocks respectively.
+Figure below illustrates an example of `variable` sparsity, in which blue, orange and green blocks illustrate local, global, and random attention blocks respectively.

 ![Variable sparsity structure](/assets/images/sa_variable_sparsity_structure.png)

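Tying the `variable` sparsity parameters above together, a hedged construction sketch follows; the config class lives in `deepspeed.ops.sparse_attention`, but treat the exact argument list as an assumption to verify against your installed version.

```python
from deepspeed.ops.sparse_attention import VariableSparsityConfig

# Illustrative values only; argument names follow the parameter descriptions above,
# but verify the constructor signature against your DeepSpeed version.
sparsity_config = VariableSparsityConfig(
    num_heads=16,
    block=16,                           # sparse block size
    num_random_blocks=1,                # random attention blocks (green in the figure)
    local_window_blocks=[4],            # sizes of the local windows (blue in the figure)
    global_block_indices=[0],           # start indices of global windows (orange in the figure)
    global_block_end_indices=None,      # optional end indices, paired with the start indices
    attention="bidirectional",          # use "unidirectional" for autoregressive models
    horizontal_global_attention=False,  # only meaningful for bidirectional attention
)
```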
@@ -265,7 +265,7 @@ def test_onebitadam_checkpointing(tmpdir):
                                      load_optimizer_states=True,
                                      load_lr_scheduler_states=True)
        assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Momentum mask should not change after loading checkpoint"
-       # Test whether worker&server error is resetted
+       # Test whether worker&server error is reset
        for v in optimizer_2.state.values():
            assert 'worker_error' not in v, f"Incorrect worker error"
            assert 'server_error' not in v, f"Incorrect server error"

@@ -291,7 +291,7 @@ def test_onebitadam_checkpointing(tmpdir):
                                      load_optimizer_states=True,
                                      load_lr_scheduler_states=True)
        assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Momentum mask should not change after loading checkpoint"
-       # Test whether worker&server error is resetted
+       # Test whether worker&server error is reset
        for v in optimizer_3.state.values():
            assert 'worker_error' not in v, f"Incorrect worker error"
            assert 'server_error' not in v, f"Incorrect server error"
@@ -682,7 +682,7 @@ def test_onebitlamb_checkpointing(tmpdir):
                                      load_optimizer_states=True,
                                      load_lr_scheduler_states=True)
        assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Momentum mask should not change after loading checkpoint"
-       # Test whether worker&server error is resetted
+       # Test whether worker&server error is reset
        assert len(optimizer_2.optimizer.worker_errors) == 0, f"Incorrect worker error"
        assert len(optimizer_2.optimizer.server_errors) == 0, f"Incorrect server error"
        # Test whether scaling_coeffs is loaded correctly

@@ -713,10 +713,10 @@ def test_onebitlamb_checkpointing(tmpdir):
                                      load_optimizer_states=True,
                                      load_lr_scheduler_states=True)
        assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Momentum mask should not change after loading checkpoint"
-       # Test whether worker&server error is resetted
+       # Test whether worker&server error is reset
        assert len(optimizer_3.optimizer.worker_errors) == 0, f"Incorrect worker error"
        assert len(optimizer_3.optimizer.server_errors) == 0, f"Incorrect server error"
-       # Test whether scaling_coeffs, lamb_coeff_freeze, last_factor are resetted
+       # Test whether scaling_coeffs, lamb_coeff_freeze, last_factor are reset
        for v in optimizer_3.state.values():
            assert v['lamb_coeff_freeze'] == 0.0, f"Incorrect lamb_coeff_freeze"
            assert v['last_factor'] == 1.0, f"Incorrect last_factor"
@@ -13,7 +13,7 @@ def test_parser_mutual_exclusive():

 def test_parser_local():
     ''' Test cases with only one node. '''
-    # First try no incude/exclude
+    # First try no include/exclude
     hosts = {'worker-0': [0, 1, 2, 3]}
     ret = dsrun.parse_resource_filter(hosts)
     assert (ret == hosts)

@@ -49,7 +49,7 @@ def test_parser_local():


 def test_parser_multinode():
-    # First try no incude/exclude
+    # First try no include/exclude
     hosts = {'worker-0': [0, 1, 2, 3], 'worker-1': [0, 1, 2, 3]}
     ret = dsrun.parse_resource_filter(hosts)
     assert (ret == hosts)
@@ -736,7 +736,7 @@ def test_zero3_param_partitioning_base(
             ds_engine.optimizer.zero_grad()

             # TODO. add testing for this - for now we just call it to make sure it
-            # doesnt throw
+            # doesn't throw
             ds_engine.optimizer.step()
             # taking an optimizer step invalidates all parameters, make sure everything
             # has been partitioned afterwards

@@ -978,7 +978,7 @@ def test_zero3_init_for_parent_weight_initialization(world_sz):


 @pytest.mark.skip(
-    reason="depends on upgraded pytorch and nccl that isnt always available")
+    reason="depends on upgraded pytorch and nccl that isn't always available")
 @pytest.mark.parametrize("param_persistence_threshold", [0, 10])
 @pytest.mark.parametrize("contiguous_gradients", [True, False])
 @pytest.mark.parametrize("offload_optimizer", [True, False])

@@ -1174,7 +1174,7 @@ def test_zero3_param_partitioning_base_bf16(
             ds_engine.optimizer.zero_grad()

             # TODO. add testing for this - for now we just call it to make sure it
-            # doesnt throw
+            # doesn't throw
             ds_engine.optimizer.step()
             _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE})
