Add codespell to pre-commit checks (#1717)

Alex Hedges 2022-01-22 17:45:58 -05:00 committed by GitHub
Parent 09c065b4c3
Commit 4cf970e6bb
No key matching this signature was found
GPG key ID: 4AEE18F83AFDEB23
35 changed files: 83 additions and 70 deletions

View File

@ -32,3 +32,16 @@ repos:
hooks:
- id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
args: []
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
hooks:
- id: codespell
args: [
# Do not check files that are automatically generated
'--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json',
'--ignore-regex=\\n', # Do not count the 'n' in an escaped newline as part of a word
'--ignore-words-list=unsupport', # Word used in error messages that need rewording
--check-filenames,
--check-hidden
]
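For reference, the hook configured above can also be invoked directly. The minimal sketch below (assuming `codespell` is installed and the script is run from the repository root) passes the same arguments to the codespell CLI via Python's standard library.

```python
# Minimal sketch: run the same check as the codespell pre-commit hook above.
import subprocess

result = subprocess.run(
    [
        "codespell",
        "--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json",
        r"--ignore-regex=\\n",            # do not count the 'n' of an escaped newline as a word
        "--ignore-words-list=unsupport",  # word used in error messages that need rewording
        "--check-filenames",
        "--check-hidden",
    ],
    capture_output=True,
    text=True,
)
print(result.stdout or "No spelling issues found.")
```

The same check also runs through the hook itself with `pre-commit run codespell --all-files`.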

View File

@ -55,7 +55,7 @@ confidence=
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# disable everything first and then re-enable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes

View File

@ -11,9 +11,9 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchornous I/O Read");
m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchronous I/O Read");
m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchornous I/O Write");
m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchronous I/O Write");
m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy");

View File

@ -61,7 +61,7 @@ at::Tensor lamb(at::Tensor& p,
// intermediate for weight L2 reduction
// make sure that the threads per block is at least 512 during the kernel launch otherwise the
// behavious is unexpected
// behaviour is unexpected
at::Tensor w_l2_i = at::empty(
{512},
p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float
@ -69,7 +69,7 @@ at::Tensor lamb(at::Tensor& p,
// intermediate for update L2 reduction
// make sure that the threads per block is at least 512 during the kernel launch otherwise the
// behavious is unexpected
// behaviour is unexpected
at::Tensor u_l2_i = at::empty(
{512},
p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float

View File

@ -167,7 +167,7 @@ For example, the following section in the DeepSpeed configuration file limits th
}
```
The entry bellow asks the Autotuner to use `4` as the micro-batch size per GPU in tuning (micro-batch size per GPU is fixed as 4). Note that it's different from using ` "train_micro_batch_size_per_gpu": [4]` which asks the Autotuner to tune micro-batch size per GPU starting from `4`.
The entry below asks the Autotuner to use `4` as the micro-batch size per GPU in tuning (micro-batch size per GPU is fixed as 4). Note that it's different from using ` "train_micro_batch_size_per_gpu": [4]` which asks the Autotuner to tune micro-batch size per GPU starting from `4`.
```json
{
"train_micro_batch_size_per_gpu": [4],

View File

@ -341,7 +341,7 @@ class Autotuner:
else:
return exps
# replace the corresponding parameter values if the user specfies them in the DeepSpeed configuration file
# replace the corresponding parameter values if the user specifies them in the DeepSpeed configuration file
replace_dict(tuning_space,
self.user_config,
[ZERO_OPTIMIZATION,
@ -511,7 +511,7 @@ class Autotuner:
max_train_batch_size_per_gpu = 0
tuning_micro_batch_sizes_overwritten = False
# calcuate max micro batch size using gpu memory, model instatiation memory and activation memory
# calculate max micro batch size using gpu memory, model instantiation memory and activation memory
# calculated_max_micro_batch_size = (memory_per_gpu - instantiation_memory) // activation_memory_micro_batch_size_1
calculated_max_micro_batch_size = int(
self.gpu_mem -
@ -584,11 +584,11 @@ class Autotuner:
logger.info(f"End tuning for space: {tuning_space_name}")
return max_micro_batch_size, fast_best_mbs, fast_best_metric_val
# if the best metric or the micro batch size for that best metric in the current Zero stage after tuning micro batch size is less than the corrresponding value in the prevous Zero stage, return, do not tune other Zero configuration paramerts
# if the best metric or the micro batch size for that best metric in the current Zero stage after tuning micro batch size is less than the corresponding value in the previous Zero stage, return, do not tune other Zero configuration parameters
if stage > 0:
if fast_best_mbs <= prev_best_mbs or fast_best_metric_val < prev_best_metric_val:
logger.info(
f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration paramerts."
f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration parameters."
)
return max_micro_batch_size, fast_best_mbs, fast_best_metric_val
@ -665,7 +665,7 @@ class Autotuner:
"""Does a model information profling experiment that collects the number of model parameters and activation memory.\
The experiment produces a "profile_model_info" folder under self.results_dir.
Returns:
[dict]: a model inforation dictionary, e.g., {"num_params": 335144976, "trainable_num_params": 335144976, "activation_mem_per_gpu": 324358144, "rank": 0}
[dict]: a model information dictionary, e.g., {"num_params": 335144976, "trainable_num_params": 335144976, "activation_mem_per_gpu": 324358144, "rank": 0}
"""
logger.info("Starting model info profile run.")
model_info = self.autotuning_config.model_info

View File

@ -137,7 +137,7 @@ MODEL_INFO_KEY_DEFAULT_DICT = {
}
#########################################
# autotunner serach space constants
# autotunner search space constants
#########################################
DEFAULT_HF_CONFIG = {

View File

@ -241,7 +241,7 @@ class ResourceManager:
for exp_id, (exp, err) in self.finished_experiments.items():
if err:
logger.info(
f"The experiment exp_id = {exp_id}, exp_name = {exp['name']}, did not run succesfully with error = {err}, thus a metrics.txt does not exist for it. Check the stderr.log in {exp['result_dir']}"
f"The experiment exp_id = {exp_id}, exp_name = {exp['name']}, did not run successfully with error = {err}, thus a metrics.txt does not exist for it. Check the stderr.log in {exp['result_dir']}"
)
continue

View File

@ -1,7 +1,7 @@
# Tuner
`exps` is a list of experiment descriptions (dictionarys).
`exps` is a list of experiment descriptions (dictionaries).
An experimentation description has a `ds_config` field that stores the DeepSpeed configuration to be used in the experiment.
A tuner is based on BaseTuner and at least implements the `next_batch` method. It can implement a different `tune` method from the BaseTuner's.

View File

@ -120,7 +120,7 @@ class ModelBasedTuner(BaseTuner):
feature_val = []
if err:
logger.info(
f"Skipping exp_id = {exp_id}, exp_name = {exp['name']}, the experiment did not run succesfully with error = {err}, thus a metrics.txt does not exist for it. Please check the stderr.log in {exp['result_dir']}"
f"Skipping exp_id = {exp_id}, exp_name = {exp['name']}, the experiment did not run successfully with error = {err}, thus a metrics.txt does not exist for it. Please check the stderr.log in {exp['result_dir']}"
)
ds_config = exp["ds_config"]
flattened_ds_config = flatten(ds_config)

View File

@ -317,7 +317,7 @@ def canonical_name(config: dict, tuning_keys=None, prefix="", omit_val=False):
Args:
config (dict): the config dict used to generate the name
tuning_keys (list, optional): the tuning keys used to generate the name. Defaults to None.
prefix (str, optional): a string added to the begining of the name. Defaults to None.
prefix (str, optional): a string added to the beginning of the name. Defaults to None.
"""
if TRAIN_MICRO_BATCH_SIZE_PER_GPU not in tuning_keys:
tuning_keys.append(TRAIN_MICRO_BATCH_SIZE_PER_GPU)

View File

@ -107,7 +107,7 @@ def parse_arguments():
'--hide_operator_status',
action='store_true',
help=
'Suppress display of installation and compatiblity statuses of DeepSpeed operators. '
'Suppress display of installation and compatibility statuses of DeepSpeed operators. '
)
parser.add_argument('--hide_errors_and_warnings',
action='store_true',

View File

@ -16,7 +16,7 @@ Below is an example output for BERT-Large(NVIDIA) on an A100 GPU with batch size
-------------------------- DeepSpeed Flops Profiler --------------------------
Profile Summary at step 10:
Notations:
data parallel size (dp_size), model paralel size(mp_size),
data parallel size (dp_size), model parallel size(mp_size),
number of parameters (params), number of multiply-accumulate operations(MACs),
number of floating-point operations (flops), floating-point operations per second (FLOPS),
fwd latency (forward propagation latency), bwd latency (backward propagation latency),
@ -24,7 +24,7 @@ step (weights update latency), iter latency (sum of fwd, bwd and step latency)
world size: 1
data parallel size: 1
model paralel size: 1
model parallel size: 1
batch size per GPU: 80
params per gpu: 336.23 M
params of model = params per GPU * mp_size: 336.23 M
@ -160,7 +160,7 @@ The DeepSpeed Flops Profiler can be used with the DeepSpeed runtime or as a stan
- [Example Training Workflow](#example-training-workflow)
### Usage With the DeepSpeed Runtime
When using DeepSpeed for model training, the profiler can be configured in the deepspeed configuration file. No explict API calls are needed to use the profiler. The profiler can be enabled by adding the following field to the `deepspeed_config` json file. Refer to [flops profiler](https://www.deepspeed.ai/docs/config-json/#flops-profiler) for details.
When using DeepSpeed for model training, the profiler can be configured in the deepspeed configuration file. No explicit API calls are needed to use the profiler. The profiler can be enabled by adding the following field to the `deepspeed_config` json file. Refer to [flops profiler](https://www.deepspeed.ai/docs/config-json/#flops-profiler) for details.
```json
{
@ -185,7 +185,7 @@ An example output of 12-layer Megatron-LM model (`hidden_size = 8192, num_attent
-------------------------- DeepSpeed Flops Profiler --------------------------
Profile Summary at step 10:
Notations:
data parallel size (dp_size), model paralel size(mp_size),
data parallel size (dp_size), model parallel size(mp_size),
number of parameters (params), number of multiply-accumulate operations(MACs),
number of floating-point operations (flops), floating-point operations per second (FLOPS),
fwd latency (forward propagation latency), bwd latency (backward propagation latency),
@ -193,7 +193,7 @@ step (weights update latency), iter latency (sum of fwd, bwd and step latency)
world size: 1
data parallel size: 1
model paralel size: 1
model parallel size: 1
batch size per GPU: 1024
params per gpu: 1.29 M
params of model = params per GPU * mp_size: 1.29 M

View File

@ -47,7 +47,7 @@ def reduce_scatter_coalesced(
"""simultaneously reduce-scatter a list of tensors - this can be done more
efficiently than individual reduce scatter calls
TODO. see if PyTorch team wants a c++ verson of this for ProcessGroupNCCL
TODO. see if PyTorch team wants a c++ version of this for ProcessGroupNCCL
"""
this_rank = torch.distributed.get_rank(group)
world_sz = torch.distributed.get_world_size(group)

View File

@ -861,7 +861,7 @@ class DeepSpeedEngine(Module):
def _configure_with_arguments(self, args, mpu):
# After the distributed backend is initialized we are guaranteed the LOCAL_RANK
# environment variable is set. We must align args.local_rank to this value for
# backwards compatability with scripts relying on [args|self].local_rank containing
# backwards compatibility with scripts relying on [args|self].local_rank containing
# the correct local rank info. _do_args_sanity_check will ensure this is the case.
if "OMPI_COMM_WORLD_LOCAL_RANK" in os.environ:
@ -2005,7 +2005,7 @@ class DeepSpeedEngine(Module):
msg["latency"]
print_json_dist(msg, [0], path=self.autotuning_metric_path())
import atexit
atexit.register(print, "Autotuning: done with runing current ds config.")
atexit.register(print, "Autotuning: done with running current ds config.")
exit()
def _write_tensorboard(self):
@ -2290,7 +2290,7 @@ class DeepSpeedEngine(Module):
global_expert_id = expp_rank * num_local_experts + local_expert_id
expert_state_dict = torch.load(self._get_expert_ckpt_name(
checkpoint_path,
-1, # -1 means ingore layer_id
-1, # -1 means ignore layer_id
global_expert_id,
tag),
map_location=torch.device('cpu'))

View File

@ -55,7 +55,7 @@ class FP16_UnfusedOptimizer(object):
#copied to fp16 weights
fp32_group = [p.clone().float().detach() for p in param_group['params']]
#incase the internal optimizer needs it
#in case the internal optimizer needs it
for p in fp32_group:
p.requires_grad = True

View File

@ -314,7 +314,7 @@ class LRRangeTest(object):
the paper `A disciplined approach to neural network hyper-parameters: Part1`_.
LRRT policy is used for finding maximum LR that trains a model without divergence, and can be used to
configure the LR boundaries for Cylic LR schedules.
configure the LR boundaries for Cyclic LR schedules.
LRRT changes the learning rate after every batch.
`step` should be called after a batch has been used for training.
@ -325,7 +325,7 @@ class LRRangeTest(object):
lower boundary in the range test for each parameter group.
lr_range_test_step_size (int): Interval of training steps to increase learning rate. Default: 2000
lr_range_test_step_rate (float): Scaling rate for range test. Default: 1.0
lr_range_test_staircase (bool): Scale in staircase fashion, rather than continous. Default: False.
lr_range_test_staircase (bool): Scale in staircase fashion, rather than continuous. Default: False.
last_batch_iteration (int): The index of the last batch. This parameter is used when
resuming a training job. Since `step()` should be invoked after each
batch instead of after each epoch, this number represents the total
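As an illustration of the range test described in this docstring, the sketch below calls `step()` once per batch, which is how LRRT is meant to be driven. The import path `deepspeed.runtime.lr_schedules` and the exact constructor signature are assumptions; the keyword names mirror the parameters documented above.

```python
# Hedged sketch of an LR range test loop; module path and constructor signature
# are assumptions based on the parameter names documented above.
import torch
from deepspeed.runtime.lr_schedules import LRRangeTest  # assumed import path

model = torch.nn.Linear(16, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-5)
scheduler = LRRangeTest(optimizer,
                        lr_range_test_min_lr=1e-5,      # lower LR boundary per parameter group
                        lr_range_test_step_size=2000,   # training steps between LR increases
                        lr_range_test_step_rate=1.0,    # scaling rate for the range test
                        lr_range_test_staircase=False)  # continuous rather than staircase scaling

for batch in range(100):
    optimizer.step()   # normally preceded by a forward/backward pass on one batch
    scheduler.step()   # LRRT changes the learning rate after every batch
```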

View File

@ -156,7 +156,7 @@ class PipelineEngine(DeepSpeedEngine):
f'TOTAL_PARAMS={total_params} ({total_params/1e6:0.3f}M) '
f'UNIQUE_PARAMS={unique_params} ({unique_params/1e6:0.3f}M)')
#intialize peer-2-peer communication and allreduce groups
#initialize peer-2-peer communication and allreduce groups
if self.is_pipe_parallel:
p2p.init_process_groups(self.grid)

View File

@ -47,7 +47,7 @@ def _is_valid_send_recv(src_stage, dest_stage):
def send(tensor, dest_stage, async_op=False):
global _groups
assert async_op == False, "Doesnt support async_op true"
assert async_op == False, "Doesn't support async_op true"
src_stage = _grid.get_stage_id()
_is_valid_send_recv(src_stage, dest_stage)
@ -68,7 +68,7 @@ def send(tensor, dest_stage, async_op=False):
def recv(tensor, src_stage, async_op=False):
global _groups
assert async_op == False, "Doesnt support async_op true"
assert async_op == False, "Doesn't support async_op true"
dest_stage = _grid.get_stage_id()
_is_valid_send_recv(src_stage, dest_stage)

View File

@ -191,7 +191,7 @@ class ProcessTopology:
return True
coords = filter(_filter_helper, self.mapping.keys())
return [self.mapping[coo] for coo in coords]
return [self.mapping[coord] for coord in coords]
def get_axis_list(self, axis, idx):
"""Returns the list of global ranks whose coordinate in an axis is idx.

View File

@ -125,7 +125,7 @@ ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE_DEFAULT = False
ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS = 'ignore_unused_parameters'
ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS_DEFAULT = True
# Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatability reasons
# Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatibility reasons
ZERO_OPTIMIZATION_LEGACY_STAGE1 = "legacy_stage1"
ZERO_OPTIMIZATION_LEGACY_STAGE1_DEFAULT = False

View File

@ -96,7 +96,7 @@ class ContiguousMemoryAllocator(object):
print_rank_0(
f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}."
)
assert self.total_free - tensor_size == free_before, "Release bookeeping error"
assert self.total_free - tensor_size == free_before, "Release bookkeeping error"
def release_tensor_with_id(self, tensor_id):
free_before = self.total_free
@ -109,7 +109,7 @@ class ContiguousMemoryAllocator(object):
print_rank_0(
f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}."
)
assert self.total_free - tensor_size == free_before, "Release bookeeping error"
assert self.total_free - tensor_size == free_before, "Release bookkeeping error"
#shows the current memory allocation at specified resolution
def print_allocation(self, resolution=200):

View File

@ -691,7 +691,7 @@ class Init(InsertPostInitMethodToModuleSubClasses):
self._validate_remote_device(remote_device, _ds_config)
# Remote device is the device where parameter partiitons are stored
# Remote device is the device where parameter partitions are stored
# It can be same as local_device or it could be CPU or NVMe.
self.remote_device = self.local_device if remote_device is None else remote_device
self.pin_memory = pin_memory if (self.remote_device

View File

@ -232,7 +232,7 @@ class PartitionedParameterCoordinator:
# this is a much less elegant way of fixing this vs something like using
# cudaMallocAsync/cudaFreeAsync. Choosing to not expose this to the user now
# because ideally in the future its replaced by an async allocation
# mechanism which doesnt require any configuration by the user.
# mechanism which doesn't require any configuration by the user.
self.__ongoing_fetch_events: Deque[Event] = collections.deque()
# TODO. make this configurable via JSON
self.__max_ongoing_fetch_events: int = 2
@ -250,7 +250,7 @@ class PartitionedParameterCoordinator:
"""adds sub module to trace"""
if self.trace_complete:
raise RuntimeError(
"attemted to record trace when trace was already complete")
"attempted to record trace when trace was already complete")
self.__submodule_order.append(sub_module)
for param in sorted(set(iter_params(sub_module)), key=lambda p: p.ds_id):
@ -597,7 +597,7 @@ class DeepSpeedZeroOptimizer_Stage3(object):
# - assume all model params in fp16
# - assume all params requires grad
# - flat by groups, not keeping state. TODO: remove state explicitly?
# - master gard and unflat master weight never exist. TODO: a way to save out unflat master?
# - master grad and unflat master weight never exist. TODO: a way to save out unflat master?
if not torch.cuda.is_available:
raise SystemError("Cannot use fp16 without CUDA.")
self.optimizer = init_optimizer
@ -867,7 +867,7 @@ class DeepSpeedZeroOptimizer_Stage3(object):
# stores if a grad in a partition has been computed or not
self.is_grad_computed = {}
# will store the averaged gradients required by this paritition
# will store the averaged gradients required by this partition
self.averaged_gradients = {}
#creates backward hooks for gradient partitioning
@ -1011,8 +1011,8 @@ class DeepSpeedZeroOptimizer_Stage3(object):
def _move_to_flat_buffer(self, param_list, flat_buffer, avoid_copy=False):
'''If flat buffer is None then the parameters in the param_list are
not copied to the flat buffer. This is because they excede the number of max_params_in_cpu
Some of these parameters may aready be in CPU in unflattened buffers
not copied to the flat buffer. This is because they exceed the number of max_params_in_cpu
Some of these parameters may already be in CPU in unflattened buffers
or they maybe in GPU, or they maybe in NVME. If they are in NVME, then
they will be marked as NOT_AVAILABLE, and will be moved to CPU when they are
needed during training.'''

View File

@ -132,7 +132,7 @@ class DeepSpeedZeroOptimizer(object):
# - assume all model params in fp16
# - assume all params requires grad
# - flat by groups, not keeping state. TODO: remove state explicitly?
# - master gard and unflat master weight never exist. TODO: a way to save out unflat master?
# - master grad and unflat master weight never exist. TODO: a way to save out unflat master?
if not torch.cuda.is_available:
raise SystemError("Cannot use fp16 without CUDA.")
self.optimizer = init_optimizer
@ -391,7 +391,7 @@ class DeepSpeedZeroOptimizer(object):
# simplified param id
self.param_id = {}
#interesting code: unique ids being assigned to individual paramters
#interesting code: unique ids being assigned to individual parameters
largest_param_numel = 0
count = 0
for i, params_group in enumerate(self.bit16_groups):

View File

@ -653,9 +653,9 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s
<i>**overwrite**</i>: [boolean]
| Description | Default |
| ----------------------------------------------------------------------------------------- | ------- |
| Whether to run autotuing experiments whose results alreay exsit. Setting it to true would overwrite the existing result. | `false` |
| Description | Default |
|---------------------------------------------------------------------------------------------------------------------------| ------- |
| Whether to run autotuing experiments whose results already exist. Setting it to true would overwrite the existing result. | `false` |
<i>**metric**</i>: [string]

View File

@ -28,7 +28,7 @@ There are three stages in ZeRO corresponding to three model states, as shown in
</a>
Figure 1. Overview of ZeRO memory savings
In addition to these three stages, ZeRO family of technology also consists of ZeRO-2 Offload. ZeRO-2 Offload is a heterogenous DL training technology that works in conjunction with ZeRO-2 to offload partitioned optimizer states and gradients to CPU memory. ZeRO-2 Offload offers the full memory advantage of ZeRO-2 even on a single GPU, while at the same time offering great scalability of ZeRO-2 on multi-GPU setup. DeepSpeed library has been offering ZeRO-2 Offload since Sept 2020. For details, please see below:
In addition to these three stages, ZeRO family of technology also consists of ZeRO-2 Offload. ZeRO-2 Offload is a heterogeneous DL training technology that works in conjunction with ZeRO-2 to offload partitioned optimizer states and gradients to CPU memory. ZeRO-2 Offload offers the full memory advantage of ZeRO-2 even on a single GPU, while at the same time offering great scalability of ZeRO-2 on multi-GPU setup. DeepSpeed library has been offering ZeRO-2 Offload since Sept 2020. For details, please see below:
* ZeRO: [Stage 1 blog](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/), [Stage 2 blog](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/), [Tutorial](/tutorials/zero)
* ZeRO-Offload: [Blog](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/#toc-heading-3), [Tutorials](/tutorials/zero-offload), [Paper link](https://arxiv.org/abs/2101.06840)

View File

@ -27,20 +27,20 @@ MoQ quantization schedule is defined by a number of parameters which allow users
`quantize_groups`: Quantization groups, which shows the number of scales used to quantize a model, default is 1.
`quantize_bits`, The numer of bits to control the data-precision transition from a start-bit to the final target-bits (e.g. starting from 16-bit down to 8-bit).
`quantize_bits`, The number of bits to control the data-precision transition from a start-bit to the final target-bits (e.g. starting from 16-bit down to 8-bit).
`start_bits`: The start bits in quantization training. Default is set to 16.
`target_bits`: The target bits in quantization training. Default is set to 16.
`quantize_schedule`, This determines how to schedule the training steps at each precision level.
`quantize_period`: indicates the period by which we reduce down the precison (number of bits) for quantization. By default, we use a period of 100 training steps, that will be doubled every time the precision reduces by 1 bit.
`quantize_period`: indicates the period by which we reduce down the precision (number of bits) for quantization. By default, we use a period of 100 training steps, that will be doubled every time the precision reduces by 1 bit.
`schedule_offset`: indicates when the quantization starts to happen (before this offset, we just use the normal training precision which can be either FP32/FP16). Default is set to 100 steps.
`quantize_algo`, The algorithm used to quantize the model.
`q_type`: we currently support symmetric and asymmetric quantization that result in signed and unsigned integer values, respectively. Default is set to symmetric
`rounding`: for the rounding of the quantized values, we can either round to the nearest value or use stocahstic rounding. Default is set to nearest.
`rounding`: for the rounding of the quantized values, we can either round to the nearest value or use stochastic rounding. Default is set to nearest.
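Taken together, the parameters above define a quantization schedule. The sketch below groups them into a Python dict the same way the prose does; the defaults are taken from the list above, while any particular top-level key or nesting inside a DeepSpeed config file is an assumption and not shown here.

```python
# Hedged sketch: the MoQ schedule parameters listed above, grouped as the text
# describes them. Where and how this nests inside ds_config is an assumption.
moq_schedule = {
    "quantize_groups": 1,           # number of scales used to quantize the model (default 1)
    "quantize_bits": {
        "start_bits": 16,           # precision at the start of training (default 16)
        "target_bits": 8,           # final target precision (e.g. 16-bit down to 8-bit)
    },
    "quantize_schedule": {
        "quantize_period": 100,     # steps per precision level, doubled after each 1-bit drop
        "schedule_offset": 100,     # step at which quantization starts (default 100)
    },
    "quantize_algo": {
        "q_type": "symmetric",      # or "asymmetric" (signed vs. unsigned integer values)
        "rounding": "nearest",      # or "stochastic"
    },
}
```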
### Eigenvalue Parameters

View File

@ -20,7 +20,7 @@ Below is an example output for BERT-Large(NVIDIA) on an A100 GPU with batch size
-------------------------- DeepSpeed Flops Profiler --------------------------
Profile Summary at step 10:
Notations:
data parallel size (dp_size), model paralel size(mp_size),
data parallel size (dp_size), model parallel size(mp_size),
number of parameters (params), number of multiply-accumulate operations(MACs),
number of floating-point operations (flops), floating-point operations per second (FLOPS),
fwd latency (forward propagation latency), bwd latency (backward propagation latency),
@ -28,7 +28,7 @@ step (weights update latency), iter latency (sum of fwd, bwd and step latency)
world size: 1
data parallel size: 1
model paralel size: 1
model parallel size: 1
batch size per GPU: 80
params per gpu: 336.23 M
params of model = params per GPU * mp_size: 336.23 M
@ -166,7 +166,7 @@ The DeepSpeed Flops Profiler can be used with the DeepSpeed runtime or as a stan
### Usage With the DeepSpeed Runtime
When using DeepSpeed for model training, the profiler can be configured in the deepspeed [configuration file](/docs/config-json/#flops-profiler). No explict API calls are needed to use the profiler. The profiler can be enabled by adding the following field to deepspeed's configuration json file. Refer to [flops profiler](/docs/config-json/#flops-profiler) for details.
When using DeepSpeed for model training, the profiler can be configured in the deepspeed [configuration file](/docs/config-json/#flops-profiler). No explicit API calls are needed to use the profiler. The profiler can be enabled by adding the following field to deepspeed's configuration json file. Refer to [flops profiler](/docs/config-json/#flops-profiler) for details.
```json
{
@ -191,7 +191,7 @@ An example output of 12-layer Megatron-LM model (`hidden_size = 8192, num_attent
-------------------------- DeepSpeed Flops Profiler --------------------------
Profile Summary at step 10:
Notations:
data parallel size (dp_size), model paralel size(mp_size),
data parallel size (dp_size), model parallel size(mp_size),
number of parameters (params), number of multiply-accumulate operations(MACs),
number of floating-point operations (flops), floating-point operations per second (FLOPS),
fwd latency (forward propagation latency), bwd latency (backward propagation latency),
@ -199,7 +199,7 @@ step (weights update latency), iter latency (sum of fwd, bwd and step latency)
world size: 1
data parallel size: 1
model paralel size: 1
model parallel size: 1
batch size per GPU: 1024
params per gpu: 1.29 M
params of model = params per GPU * mp_size: 1.29 M

View File

@ -80,7 +80,7 @@ self.experts = deepspeed.moe.layer.MoE(hidden_size=input_dim, expert=ExpertModul
```
With the above two commands, the DeepSpeed runtime will be set to train an MoE model with a total of 8 experts on 4 GPUs in 4 experts/GPU mode. We call this the E + D mode as described earlier in the table.
For more advanced use case of the groups API including the inter-operability with Megatron style mpu object, watch this space!
For more advanced use case of the groups API including the interoperability with Megatron style mpu object, watch this space!
```python

View File

@ -24,7 +24,7 @@ In this part, we elaborate the usage of MoE inference support in the DeepSpeed l
First step to use DeepSpeed-MoE inferenece is to initialize the expert-parallel groups. To do so, one can use the group utility from DeepSpeed to initialize the group (`deepspeed.utils.groups.initialize`). This function creates the groups based on minimum of the world\_size (total number of GPUs) and expert size. By using this group, we can partition the experts among the expert-parallel GPUs. If number of experts is lower than total number of GPUs, DeepSpeed-MoE leverages expert-slicing for partitioning the expert parameters between the expert-parallel GPUs.
For inference with DeepSpeed-MoE, use `init_inference` API to load the MoE model for inference. Here, you can specify the Model-parallelism/tensor-slicing (MP) degree, number of experts, and if the model has not been loaded with the appropriate checkpoint, you can also provide the checkpoint description using a `json` file or simply pass the `'checkpoint'` path to load the moddel. To inject the high-performance inference kernels, you can pass int the `replace_method` as `'auto'` and set the `replace_with_kernel_inject` to True.
For inference with DeepSpeed-MoE, use `init_inference` API to load the MoE model for inference. Here, you can specify the Model-parallelism/tensor-slicing (MP) degree, number of experts, and if the model has not been loaded with the appropriate checkpoint, you can also provide the checkpoint description using a `json` file or simply pass the `'checkpoint'` path to load the model. To inject the high-performance inference kernels, you can pass int the `replace_method` as `'auto'` and set the `replace_with_kernel_inject` to True.
```python

View File

@ -184,7 +184,7 @@ This structure also combines the idea of local, global and random attention. Fur
* `global_block_end_indices`: a list of integers determining end indices of global window blocks. By default this is not used. But if it is set, it must have the same size as `global_block_indices` parameter, and combining this two parameters, for each index `i`, blocks from `global_block_indices[i]` to `global_block_end_indices[i]` (exclusive) are considered as global attention block.
* `attention`: a string determining attention type. Attention can be `unidirectional`, such as autoregressive models, in which tokens attend only to tokens appear before them in the context. Considering that, the upper triangular of attention matrix is empty as above figure. Or it can be `bidirectional`, such as BERT, in which tokens can attend to any other tokens before or after them. Then, the upper triangular part of the attention matrix is mirror of the lower triangular in the above figure.
* `horizontal_global_attention`: a boolean determining if blocks that are global representative of a local window, also attend to all other blocks. This is valid only if attention type is `bidirectional`. Looking at the attention matrix, that means global attention not only includes the vertical blocks, but also horizontal blocks
Figure bellow illustrates an example of `variable` sparsity, in which blue, orange and green blocks illustrate local, global, and random attention blocks respectively.
Figure below illustrates an example of `variable` sparsity, in which blue, orange and green blocks illustrate local, global, and random attention blocks respectively.
![Variable sparsity structure](/assets/images/sa_variable_sparsity_structure.png)
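As a hedged illustration of the `variable` sparsity parameters described above: the class name `VariableSparsityConfig`, its module path, and the `num_heads` argument below are assumptions, while `global_block_indices`, `global_block_end_indices`, `attention`, and `horizontal_global_attention` follow the descriptions in the text.

```python
# Hedged sketch of a `variable` sparsity configuration; class and module path
# are assumptions, parameter semantics follow the text above.
from deepspeed.ops.sparse_attention import VariableSparsityConfig  # assumed import path

config = VariableSparsityConfig(
    num_heads=16,                       # assumed argument
    attention="bidirectional",          # tokens attend both before and after them (BERT-style)
    horizontal_global_attention=True,   # global blocks also attend to all other blocks
    global_block_indices=[0, 8],
    global_block_end_indices=[1, 9],    # blocks [0,1) and [8,9) are global attention blocks
)
```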

View File

@ -265,7 +265,7 @@ def test_onebitadam_checkpointing(tmpdir):
load_optimizer_states=True,
load_lr_scheduler_states=True)
assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is resetted
# Test whether worker&server error is reset
for v in optimizer_2.state.values():
assert 'worker_error' not in v, f"Incorrect worker error"
assert 'server_error' not in v, f"Incorrect server error"
@ -291,7 +291,7 @@ def test_onebitadam_checkpointing(tmpdir):
load_optimizer_states=True,
load_lr_scheduler_states=True)
assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is resetted
# Test whether worker&server error is reset
for v in optimizer_3.state.values():
assert 'worker_error' not in v, f"Incorrect worker error"
assert 'server_error' not in v, f"Incorrect server error"
@ -682,7 +682,7 @@ def test_onebitlamb_checkpointing(tmpdir):
load_optimizer_states=True,
load_lr_scheduler_states=True)
assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is resetted
# Test whether worker&server error is reset
assert len(optimizer_2.optimizer.worker_errors) == 0, f"Incorrect worker error"
assert len(optimizer_2.optimizer.server_errors) == 0, f"Incorrect server error"
# Test whether scaling_coeffs is loaded correctly
@ -713,10 +713,10 @@ def test_onebitlamb_checkpointing(tmpdir):
load_optimizer_states=True,
load_lr_scheduler_states=True)
assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Momentum mask should not change after loading checkpoint"
# Test whether worker&server error is resetted
# Test whether worker&server error is reset
assert len(optimizer_3.optimizer.worker_errors) == 0, f"Incorrect worker error"
assert len(optimizer_3.optimizer.server_errors) == 0, f"Incorrect server error"
# Test whether scaling_coeffs, lamb_coeff_freeze, last_factor are resetted
# Test whether scaling_coeffs, lamb_coeff_freeze, last_factor are reset
for v in optimizer_3.state.values():
assert v['lamb_coeff_freeze'] == 0.0, f"Incorrect lamb_coeff_freeze"
assert v['last_factor'] == 1.0, f"Incorrect last_factor"

View File

@ -13,7 +13,7 @@ def test_parser_mutual_exclusive():
def test_parser_local():
''' Test cases with only one node. '''
# First try no incude/exclude
# First try no include/exclude
hosts = {'worker-0': [0, 1, 2, 3]}
ret = dsrun.parse_resource_filter(hosts)
assert (ret == hosts)
@ -49,7 +49,7 @@ def test_parser_local():
def test_parser_multinode():
# First try no incude/exclude
# First try no include/exclude
hosts = {'worker-0': [0, 1, 2, 3], 'worker-1': [0, 1, 2, 3]}
ret = dsrun.parse_resource_filter(hosts)
assert (ret == hosts)

View File

@ -736,7 +736,7 @@ def test_zero3_param_partitioning_base(
ds_engine.optimizer.zero_grad()
# TODO. add testing for this - for now we just call it to make sure it
# doesnt throw
# doesn't throw
ds_engine.optimizer.step()
# taking an optimizer step invalidates all parameters, make sure everything
# has been partitioned afterwards
@ -978,7 +978,7 @@ def test_zero3_init_for_parent_weight_initialization(world_sz):
@pytest.mark.skip(
reason="depends on upgraded pytorch and nccl that isnt always available")
reason="depends on upgraded pytorch and nccl that isn't always available")
@pytest.mark.parametrize("param_persistence_threshold", [0, 10])
@pytest.mark.parametrize("contiguous_gradients", [True, False])
@pytest.mark.parametrize("offload_optimizer", [True, False])
@ -1174,7 +1174,7 @@ def test_zero3_param_partitioning_base_bf16(
ds_engine.optimizer.zero_grad()
# TODO. add testing for this - for now we just call it to make sure it
# doesnt throw
# doesn't throw
ds_engine.optimizer.step()
_assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE})