* Fix typos in docs/

* Fix typos in code comments and output strings

* Fix typos in the code itself

* Fix typos in tests/

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Alex Hedges 2021-10-01 22:56:32 -04:00 committed by GitHub
Parent 30965ea734
Commit be789b1665
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
70 changed files: 227 additions and 227 deletions
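
For illustration only (not part of this commit): a minimal, hypothetical Python sketch of how repo-wide typo fixes like the ones summarized above could be applied in bulk. The typo map, file globs, and helper name are assumptions made up for this sketch, not the method actually used in this change.

# Hypothetical bulk typo-fix helper; the mapping and globs are illustrative only.
import pathlib

TYPO_FIXES = {"qunatize": "quantize", "attantion": "attention", "completly": "completely"}
GLOBS = ("**/*.py", "**/*.md", "**/*.cu", "**/*.h", "**/*.sh")

def fix_typos(root="."):
    for pattern in GLOBS:
        for path in pathlib.Path(root).glob(pattern):
            text = path.read_text(encoding="utf-8", errors="ignore")
            fixed = text
            for bad, good in TYPO_FIXES.items():
                fixed = fixed.replace(bad, good)
            if fixed != text:
                path.write_text(fixed, encoding="utf-8")
                print(f"fixed typos in {path}")

if __name__ == "__main__":
    fix_typos()

Note that a plain string replace also renames identifiers (for example launch_qunatize_kernel), so every declaration and call site has to be updated in the same pass, which is what the diffs below do.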

View file

@ -196,7 +196,7 @@ bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
{
const auto op_string = read_op ? "Read" : "Write";
if (num_bytes % get_thread_count()) {
std::cout << "deepseed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
<< " not divisible by thread count = " << get_thread_count() << std::endl;
return false;
}

View file

@ -130,7 +130,7 @@ def _aio_handle_tasklet(pool_params):
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_takslet(b):
def _init_tasklet(b):
global aio_barrier
aio_barrier = b
@ -138,7 +138,7 @@ def _init_takslet(b):
def aio_basic_multiprocessing(args, read_op):
b = Barrier(args.threads)
pool_params = [(args, p, read_op) for p in range(args.threads)]
with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p:
with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)
report_results(args, read_op, pool_results)

View file

@ -162,7 +162,7 @@ def _aio_handle_tasklet(pool_params):
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
def _init_takslet(b):
def _init_tasklet(b):
global aio_barrier
aio_barrier = b
@ -170,7 +170,7 @@ def _init_takslet(b):
def aio_handle_multiprocessing(args, read_op):
b = Barrier(args.threads)
pool_params = [(args, p, read_op) for p in range(args.threads)]
with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p:
with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)
report_results(args, read_op, pool_results)

View file

@ -5,7 +5,7 @@ if [[ $# -ne 2 ]]; then
fi
function validate_enviroment()
function validate_environment()
{
validate_cmd="python ./validate_async_io.py"
eval ${validate_cmd}
@ -18,7 +18,7 @@ function validate_enviroment()
}
validate_enviroment
validate_environment
INPUT_FILE=$1
if [[ ! -f ${INPUT_FILE} ]]; then

View file

@ -9,7 +9,7 @@ function prep_folder()
fi
}
function validate_enviroment()
function validate_environment()
{
validate_cmd="python ./validate_async_io.py"
eval ${validate_cmd}
@ -23,7 +23,7 @@ function validate_enviroment()
validate_enviroment
validate_environment
if [[ $# -ne 3 ]]; then
echo "Usage: $0 <write size in MB> <write dir ><output log dir>"

View file

@ -27,25 +27,25 @@
#define MAX_REG 256
template <typename T>
void launch_qunatize_kernel(T* vals,
void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_sr_qunatize_kernel(T* vals,
void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_qunatize_kernel_asym(T* vals,
void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_sr_qunatize_kernel_asym(T* vals,
void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,

View file

@ -179,6 +179,6 @@ private:
bool _normalize_invertible;
bool _gelu_checkpoint;
// High Performace flags
// High Performance flags
bool _stochastic_mode;
};

View file

@ -17,14 +17,14 @@ public:
size_t heads;
size_t seq_length;
size_t prob_depth;
float temprature;
float temperature;
bool mem_alloc;
Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false)
: batchSize(batch),
heads(h),
seq_length(seq),
prob_depth(prob_size),
temprature(1.0),
temperature(1.0),
mem_alloc(mem_alloc)
{
}

View file

@ -1,7 +1,7 @@
/* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */
#include <ATen/ATen.h>
// Forward/backward compatiblity hack around
// Forward/backward compatibility hack around
// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
// pending more future-proof guidance from upstream.
// struct TypeShim

View file

@ -11,7 +11,7 @@ at::Tensor ds_quantize(at::Tensor& vals, int groups, int bits)
for (auto dim : t_size) size *= dim;
if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) {
launch_qunatize_kernel(
launch_quantize_kernel(
(T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
}
return vals;
@ -25,7 +25,7 @@ at::Tensor ds_sr_quantize(at::Tensor& vals, int groups, int bits)
for (auto dim : t_size) size *= dim;
if (((size / groups) / 4 / 1024) <= 256) {
launch_sr_qunatize_kernel(
launch_sr_quantize_kernel(
(T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
}
return vals;
@ -39,7 +39,7 @@ at::Tensor ds_quantize_asym(at::Tensor& vals, int groups, int bits)
for (auto dim : t_size) size *= dim;
if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) {
launch_qunatize_kernel_asym(
launch_quantize_kernel_asym(
(T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
}
return vals;
@ -53,7 +53,7 @@ at::Tensor ds_sr_quantize_asym(at::Tensor& vals, int groups, int bits)
for (auto dim : t_size) size *= dim;
if (((size / groups) / 4 / 1024) <= 256) {
launch_sr_qunatize_kernel_asym(
launch_sr_quantize_kernel_asym(
(T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
}
return vals;

View file

@ -3,7 +3,7 @@
namespace cg = cooperative_groups;
__global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
__global__ void quantize_kernel(__half* vals, int group_size, int num_bits)
{
#if __CUDA_ARCH__ >= 700
@ -93,7 +93,7 @@ __global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
#endif
}
__global__ void qunatize_kernel(float* vals, int group_size, int num_bits)
__global__ void quantize_kernel(float* vals, int group_size, int num_bits)
{
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@ -176,7 +176,7 @@ __global__ void qunatize_kernel(float* vals, int group_size, int num_bits)
}
template <typename T>
void launch_qunatize_kernel(T* vals,
void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
@ -185,22 +185,22 @@ void launch_qunatize_kernel(T* vals,
dim3 grid_dim(group_num);
dim3 block_dim(1024);
qunatize_kernel<<<grid_dim, block_dim, 0, stream>>>(
quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
vals, (total_count / group_num) / 4, num_bits);
}
template void launch_qunatize_kernel(float* vals,
template void launch_quantize_kernel(float* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template void launch_qunatize_kernel(__half* vals,
template void launch_quantize_kernel(__half* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
__global__ void sr_qunatize_kernel(__half* vals,
__global__ void sr_quantize_kernel(__half* vals,
int token_size,
int token_num,
int num_bits,
@ -336,7 +336,7 @@ __global__ void sr_qunatize_kernel(__half* vals,
#endif
}
__global__ void sr_qunatize_kernel(float* vals,
__global__ void sr_quantize_kernel(float* vals,
int token_size,
int token_num,
int num_bits,
@ -456,7 +456,7 @@ __global__ void sr_qunatize_kernel(float* vals,
}
template <typename T>
void launch_sr_qunatize_kernel(T* vals,
void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
@ -468,21 +468,21 @@ void launch_sr_qunatize_kernel(T* vals,
uint64_t inc = total_count / grid_dim.x / block_dim.x;
std::pair<uint64_t, uint64_t> seed = Context::Instance().IncrementOffset(inc);
sr_qunatize_kernel<<<grid_dim, block_dim, 0, stream>>>(
sr_quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
vals, (total_count / group_num) / 4, group_num, num_bits, seed);
}
template void launch_sr_qunatize_kernel(float* vals,
template void launch_sr_quantize_kernel(float* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template void launch_sr_qunatize_kernel(__half* vals,
template void launch_sr_quantize_kernel(__half* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
__global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
__global__ void quantize_kernel_asym(__half* vals, int group_size, int num_bits)
{
#if __CUDA_ARCH__ >= 700
@ -595,7 +595,7 @@ __global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
#endif
}
__global__ void qunatize_kernel_asym(float* vals, int group_size, int num_bits)
__global__ void quantize_kernel_asym(float* vals, int group_size, int num_bits)
{
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@ -699,7 +699,7 @@ __global__ void qunatize_kernel_asym(float* vals, int group_size, int num_bits)
}
template <typename T>
void launch_qunatize_kernel_asym(T* vals,
void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
@ -708,22 +708,22 @@ void launch_qunatize_kernel_asym(T* vals,
dim3 grid_dim(group_num);
dim3 block_dim(1024);
qunatize_kernel_asym<<<grid_dim, block_dim, 0, stream>>>(
quantize_kernel_asym<<<grid_dim, block_dim, 0, stream>>>(
vals, (total_count / group_num) / 4, num_bits);
}
template void launch_qunatize_kernel_asym(float* vals,
template void launch_quantize_kernel_asym(float* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template void launch_qunatize_kernel_asym(__half* vals,
template void launch_quantize_kernel_asym(__half* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
__global__ void sr_qunatize_kernel_asym(__half* vals,
__global__ void sr_quantize_kernel_asym(__half* vals,
int token_size,
int token_num,
int num_bits,
@ -879,7 +879,7 @@ __global__ void sr_qunatize_kernel_asym(__half* vals,
#endif
}
__global__ void sr_qunatize_kernel_asym(float* vals,
__global__ void sr_quantize_kernel_asym(float* vals,
int token_size,
int token_num,
int num_bits,
@ -1010,7 +1010,7 @@ __global__ void sr_qunatize_kernel_asym(float* vals,
}
}
template <typename T>
void launch_sr_qunatize_kernel_asym(T* vals,
void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
@ -1022,15 +1022,15 @@ void launch_sr_qunatize_kernel_asym(T* vals,
uint64_t inc = total_count / grid_dim.x / block_dim.x;
std::pair<uint64_t, uint64_t> seed = Context::Instance().IncrementOffset(inc);
sr_qunatize_kernel<<<grid_dim, block_dim, 0, stream>>>(
sr_quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
vals, (total_count / group_num) / 4, group_num, num_bits, seed);
}
template void launch_sr_qunatize_kernel_asym(float* vals,
template void launch_sr_quantize_kernel_asym(float* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template void launch_sr_qunatize_kernel_asym(__half* vals,
template void launch_sr_quantize_kernel_asym(__half* vals,
int total_count,
int group_num,
int num_bits,

View file

@ -102,7 +102,7 @@ class InferenceEngine(Module):
self.mp_group = InferenceEngine.inference_mp_group
def _check_quantize_setting(self, quantization_setting):
self.quatize_bits = 8
self.quantize_bits = 8
self.mlp_extra_grouping = False
self.quantize_groups = 1
if quantization_setting is None:
@ -177,7 +177,7 @@ class InferenceEngine(Module):
quantizer = WeightQuantization(mlp_extra_grouping=self.mlp_extra_grouping)
model, self.quantization_scales = quantizer.model_quantize(self.module,
self.injection_dict,
self.quatize_bits,
self.quantize_bits,
self.quantize_groups)
elif self.dtype == torch.half:
self.module.half()

View file

@ -1,7 +1,7 @@
# Copyright 2020 The Microsoft DeepSpeed Team
"""
DeepSpeed launcher, this is similar to torch.distributed.launch but supports
additional features such as abitrary gpu exclusion.
additional features such as arbitrary gpu exclusion.
deepspeed.launcher.launch is intended to be run on a single worker node and
will spawn several worker sub-processes depending on how many devices/ranks

View file

@ -2,7 +2,7 @@
"""
DeepSpeed runner is the main front-end to launching multi-worker
training jobs with DeepSpeed. By default this uses pdsh to parallel
ssh into multiple worker nodes and launch all the neccisary processes
ssh into multiple worker nodes and launch all the necessary processes
per rank for training.
"""

View file

@ -19,7 +19,7 @@ class ReplaceWithTensorSlicing:
assert dim1 > dim2, \
'Merging tensors is not allowed here! Please use deepspeed load_checkpoint\
for merging your checkpoints before replacing the transformer layer with\
inference-kerenls'
inference-kernels'
def qkv_copy(self, dst, src):
if src is None:
@ -114,9 +114,9 @@ def replace_transformer_layer(orig_layer_impl,
seed (int): random seed value
max_seq_length (int): max sequence length for training
hidden_size (int): hidden dimension
num_attention_heads (int): numebr of attention heads
num_attention_heads (int): number of attention heads
mp_size (int): model_parallelism degree
mp_group : model_parallel gropu initialized on the modeling side
mp_group : model_parallel group initialized on the modeling side
preln (bool): does the original layer implementation do pre or post layer norm?
fp16 (bool): fp16 or fp32
local_rank (int): GPU rank (optional),
@ -401,7 +401,7 @@ def replace_module(model, orig_class, replace_fn, _replace_policy):
if plcy._orig_layer_class is not None:
policy.update({plcy._orig_layer_class: (replace_fn, plcy)})
assert len(policy.items()) > 0,\
"No default policy found! Please specifiy your policy injection_policy (like {BertLayer:HFBEertLayerPolicy})." +\
"No default policy found! Please specify your policy injection_policy (like {BertLayer:HFBEertLayerPolicy})." +\
"You can find some samples here: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py"
replaced_module, _ = _replace_module(model, policy)

View file

@ -33,7 +33,7 @@ exp_selection_uniform_map: Dict[torch.device, Callable] = {}
def multiplicative_jitter(x, device: torch.device, epsilon=1e-2):
"""
Modified from swtich transformer paper. mesh transformers
Modified from switch transformer paper. mesh transformers
Multiply values by a random number between 1-epsilon and 1+epsilon.
Makes models more resilient to rounding errors introduced by bfloat16.
This seems particularly important for logits.
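
As an aside from the diff itself, a minimal sketch of the jitter idea described in that docstring, assuming PyTorch tensors; this is not DeepSpeed's actual implementation, just the stated formula (scale each value by a uniform factor in [1 - epsilon, 1 + epsilon]):

# Illustrative sketch of the multiplicative jitter described above; not the library's code.
import torch

def multiplicative_jitter_sketch(x: torch.Tensor, epsilon: float = 1e-2) -> torch.Tensor:
    # Draw one uniform factor per element from [1 - epsilon, 1 + epsilon] and scale x by it.
    noise = torch.empty_like(x).uniform_(1.0 - epsilon, 1.0 + epsilon)
    return x * noise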
@ -147,7 +147,7 @@ def top1gating(logits: torch.Tensor,
mask1_rand = mask1 * uniform(mask1.shape)
assert logits.shape[0] >= min_capacity, "No. of tokens (batch-size) should be greater than min_capacity. Either set min_capacity to 0 or inrease your batch size."
assert logits.shape[0] >= min_capacity, "No. of tokens (batch-size) should be greater than min_capacity. Either set min_capacity to 0 or increase your batch size."
_, top_idx = torch.topk(mask1_rand, k=capacity, dim=0)

View file

@ -33,9 +33,9 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer):
In order to apply this optimizer, the model requires to have its master parameter (in FP32)
reside on the CPU memory.
To train on a hetrogeneous system, such as coordinating CPU and GPU, DeepSpeed offers
To train on a heterogeneous system, such as coordinating CPU and GPU, DeepSpeed offers
the ZeRO-Offload technology which efficiently offloads the optimizer states into CPU memory,
with minimal impact on training througput. DeepSpeedCPUAdam plays an important role to minimize
with minimal impact on training throughput. DeepSpeedCPUAdam plays an important role to minimize
the overhead of the optimizer's latency on CPU. Please refer to ZeRO-Offload tutorial
(https://www.deepspeed.ai/tutorials/zero-offload/) for more information on how to enable this technology.

View file

@ -24,7 +24,7 @@ class BertSparseSelfAttention(nn.Module):
Arguments:
config: required: Bert model config
sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on FixedSparsityConfig class.
sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on FixedSparsityConfig class.
"""
super(BertSparseSelfAttention, self).__init__()
@ -53,11 +53,11 @@ class BertSparseSelfAttention(nn.Module):
"""Applies forward phase of bert sparse self attention
Arguments:
hidden_states: required: hidde_states tensor of the bert model
hidden_states: required: hidden_states tensor of the bert model
attn_mask: required: a mask tensor of size (SequenceLength X SequenceLength); currently only 2D is supported
Return:
context_layer: a dense tensor containing attnetion context
context_layer: a dense tensor containing attention context
"""
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)

View file

@ -94,10 +94,10 @@ class SparseAttentionUtils:
Arguments:
model: required: a transformer model
max_position: required: an integer determining new position embedding size
sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class
sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on SparsityConfig class
Return:
model: updated model; in which self attention layer has been repleaced with DeepSpeed Sparse Self Attention layer.
model: updated model; in which self attention layer has been replaced with DeepSpeed Sparse Self Attention layer.
"""
if hasattr(model, 'bert'):
@ -131,10 +131,10 @@ class SparseAttentionUtils:
Arguments:
config: required: transformer model config
layers: required: transformer model attention layers
sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class
sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on SparsityConfig class
Return:
layers: updated attention layers; in which self attention layers have been repleaced with DeepSpeed Sparse Self Attention layer.
layers: updated attention layers; in which self attention layers have been replaced with DeepSpeed Sparse Self Attention layer.
"""
for layer in layers:
@ -161,7 +161,7 @@ class SparseAttentionUtils:
It needs to be called in your model, such as BertModel, right before you calculate the embedding outputs.
Note)
1- instead of passing your embedding layer to this function, you can simply add this function to your model. It can be more simplified if given attention_mask and/or token_type_ids are none.
2- you need to call unpdad function before returning your model output to unpad the encoder sequence output.
2- you need to call unpad function before returning your model output to unpad the encoder sequence output.
Arguments:
block_size: required: an integer determining the block size of sparsity config.

View file

@ -26,7 +26,7 @@ class SparseSelfAttention(nn.Module):
max_seq_length=2048):
"""Initialize the sparse self attention layer.
Arguments:
sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class.
sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on SparsityConfig class.
key_padding_mask_mode: optional: a string determining if key padding mask needs to be added, `add`, or be multiplied, `mul`.
attn_mask_mode: optional: a string determining if attention mask needs to be added, `add`, or be multiplied, `mul`.
max_seq_length: optional: the maximum sequence length this sparse attention module will be applied to; it controls the size of the master_layout.
@ -121,7 +121,7 @@ class SparseSelfAttention(nn.Module):
attn_mask_mode: optional: a boolean determining if attn_mask needs to be added or multiplied
Return:
attn_output: a dense tensor containing attnetion context
attn_output: a dense tensor containing attention context
"""
assert query.dtype == torch.half, "sparse attention only supports training in fp16 currently, please file a github issue if you need fp32 support"
bsz, num_heads, tgt_len, head_dim = query.size()

View file

@ -49,7 +49,7 @@ class SparsityConfig:
"""If all heads require same sparsity layout, it propagate first head layout to all heads
Arguments:
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step
Return:
layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head
@ -152,11 +152,11 @@ class FixedSparsityConfig(SparsityConfig):
self.num_different_global_patterns = num_different_global_patterns
def set_local_layout(self, h, layout):
"""Sets local attantion layout used by the given head in the sparse attention.
"""Sets local attention layout used by the given head in the sparse attention.
Arguments:
h: required: an integer determining head index
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step
Return:
layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local layout is set
@ -173,14 +173,14 @@ class FixedSparsityConfig(SparsityConfig):
return layout
def set_global_layout(self, h, layout):
"""Sets global attantion layout used by the given head in the sparse attention.
"""Sets global attention layout used by the given head in the sparse attention.
Currently we set global blocks starting from the last block of a local window to the first one. That means if a local window consists of 4 blocks and global attention size is one block, we use block #4 in each local window as global. If we have different layout per head, then other heads will get #3, #2, and #1. And if we have more heads (and different layout has set) than num of global attentions, multiple head may have same global attentions.
Note) if horizontal_global_attention is set, global blocks will be set both horizontally and vertically.
Arguments:
h: required: an integer determining head index
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step
Return:
layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set
@ -307,12 +307,12 @@ class VariableSparsityConfig(SparsityConfig):
self.horizontal_global_attention = horizontal_global_attention
def set_random_layout(self, h, layout):
"""Sets random attantion layout used by the given head in the sparse attention.
"""Sets random attention layout used by the given head in the sparse attention.
Note) By default, it assumes there will be a unique random block layout for all heads; unless `different_layout_per_head` parameter is set in which each head can have a different random layout.
Arguments:
h: required: an integer determining head index
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step
Return:
layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which random layout is set
@ -321,7 +321,7 @@ class VariableSparsityConfig(SparsityConfig):
num_blocks = layout.shape[1]
if (num_blocks < self.num_random_blocks):
raise ValueError(
f'Number of random blocks, {self.num_random_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!'
f'Number of random blocks, {self.num_random_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!'
)
for row in range(0, num_blocks):
rnd_cols = random.sample(range(0, num_blocks), self.num_random_blocks)
@ -329,10 +329,10 @@ class VariableSparsityConfig(SparsityConfig):
return layout
def set_local_layout(self, h, layout):
"""Sets local attantion layout used by the given head in the sparse attention.
"""Sets local attention layout used by the given head in the sparse attention.
Arguments:
h: required: an integer determining head index
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step
Return:
layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local layout is set
@ -362,11 +362,11 @@ class VariableSparsityConfig(SparsityConfig):
return layout
def set_global_layout(self, h, layout):
"""Sets global attantion layout used by the given head in the sparse attention.
"""Sets global attention layout used by the given head in the sparse attention.
Arguments:
h: required: an integer determining head index
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step
Return:
layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set
@ -375,7 +375,7 @@ class VariableSparsityConfig(SparsityConfig):
num_blocks = layout.shape[1]
if (self.global_block_end_indices is None):
for idx in self.global_block_indices:
# if global block idx is in the range of the sequnce blocks
# if global block idx is in the range of the sequence blocks
if (idx < num_blocks):
#global rows
if (self.horizontal_global_attention):
@ -386,7 +386,7 @@ class VariableSparsityConfig(SparsityConfig):
layout[h, first_row:, idx] = 1
else:
for _, (start_idx, end_idx) in enumerate(zip(self.global_block_indices, self.global_block_end_indices)):
# if global block idx is in the range of the sequnce blocks
# if global block idx is in the range of the sequence blocks
if (start_idx < num_blocks):
end_idx = min(end_idx, num_blocks)
#global rows
@ -450,12 +450,12 @@ class BigBirdSparsityConfig(SparsityConfig):
self.num_global_blocks = num_global_blocks
def set_random_layout(self, h, layout):
"""Sets random attantion layout used by the given head in the sparse attention.
"""Sets random attention layout used by the given head in the sparse attention.
Note) By default, it assumes there will be a unique random block layout for all heads; unless `different_layout_per_head` parameter is set in which each head can have a different random layout.
Arguments:
h: required: an integer determining head index
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step
Return:
layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which random layout is set
@ -464,7 +464,7 @@ class BigBirdSparsityConfig(SparsityConfig):
num_blocks = layout.shape[1]
if (num_blocks < self.num_random_blocks):
raise ValueError(
f'Number of random blocks, {self.num_random_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!'
f'Number of random blocks, {self.num_random_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!'
)
for row in range(0, num_blocks):
@ -473,11 +473,11 @@ class BigBirdSparsityConfig(SparsityConfig):
return layout
def set_sliding_window_layout(self, h, layout):
"""Sets sliding local attantion layout used by the given head in the sparse attention.
"""Sets sliding local attention layout used by the given head in the sparse attention.
Arguments:
h: required: an integer determining head index
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step
Return:
layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local sliding window layout is set
@ -486,7 +486,7 @@ class BigBirdSparsityConfig(SparsityConfig):
num_blocks = layout.shape[1]
if (num_blocks < self.num_sliding_window_blocks):
raise ValueError(
f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!'
f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!'
)
w = self.num_sliding_window_blocks // 2
@ -497,11 +497,11 @@ class BigBirdSparsityConfig(SparsityConfig):
return layout
def set_global_layout_itc(self, h, layout):
"""Sets global attantion layout used by the given head in the sparse attention.
"""Sets global attention layout used by the given head in the sparse attention.
Arguments:
h: required: an integer determining head index
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step
Return:
layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set
@ -510,7 +510,7 @@ class BigBirdSparsityConfig(SparsityConfig):
num_blocks = layout.shape[1]
if (num_blocks < self.num_global_blocks):
raise ValueError(
f'Number of global blocks, {self.num_global_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!'
f'Number of global blocks, {self.num_global_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!'
)
#global rows
@ -588,11 +588,11 @@ class BSLongformerSparsityConfig(SparsityConfig):
self.global_block_end_indices = global_block_end_indices
def set_sliding_window_layout(self, h, layout):
"""Sets sliding local attantion layout used by the given head in the sparse attention.
"""Sets sliding local attention layout used by the given head in the sparse attention.
Arguments:
h: required: an integer determining head index
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step
Return:
layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local sliding window layout is set
@ -601,7 +601,7 @@ class BSLongformerSparsityConfig(SparsityConfig):
num_blocks = layout.shape[1]
if (num_blocks < self.num_sliding_window_blocks):
raise ValueError(
f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!'
f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!'
)
w = self.num_sliding_window_blocks // 2
@ -612,11 +612,11 @@ class BSLongformerSparsityConfig(SparsityConfig):
return layout
def set_global_layout(self, h, layout):
"""Sets global attantion layout used by the given head in the sparse attention.
"""Sets global attention layout used by the given head in the sparse attention.
Arguments:
h: required: an integer determining head index
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step
Return:
layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set
@ -625,7 +625,7 @@ class BSLongformerSparsityConfig(SparsityConfig):
num_blocks = layout.shape[1]
if (self.global_block_end_indices is None):
for idx in self.global_block_indices:
# if global block idx is in the range of the sequnce blocks
# if global block idx is in the range of the sequence blocks
if (idx < num_blocks):
#global rows
layout[h, idx, :] = 1
@ -634,7 +634,7 @@ class BSLongformerSparsityConfig(SparsityConfig):
layout[h, :, idx] = 1
else:
for _, (start_idx, end_idx) in enumerate(zip(self.global_block_indices, self.global_block_end_indices)):
# if global block idx is in the range of the sequnce blocks
# if global block idx is in the range of the sequence blocks
if (start_idx < num_blocks):
end_idx = min(end_idx, num_blocks)
#global rows

View file

@ -525,7 +525,7 @@ class DeepSpeedTransformerInference(nn.Module):
config: An object of DeepSpeedInferenceConfig
mp_group: Model parallelism group initialized on the modeling side.
quantize_scales: This arguement groups all the layers' scales used for quantization
quantize_scales: This argument groups all the layers' scales used for quantization
quantize_groups: Number of groups used for quantizing the model
merge_count: Shows the number of model-parallel checkpoints merged before running inference.
We use this argument to control the quantization scale for the model parameters if a bigger

View file

@ -88,7 +88,7 @@ class DeepSpeedTransformerConfig(TransformerConfig):
a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend
to turn it off in order to be able to reproduce the same result through the regular kernel execution.
huggingface: Enbale if using the HuggingFace interface style for sending out the forward results.
huggingface: Enable if using the HuggingFace interface style for sending out the forward results.
training: Enable for training rather than inference.
"""

View file

@ -135,14 +135,14 @@ The DeepSpeed Flops Profiler can be used with the DeepSpeed runtime without any
## Flops Measurement
Similar to exsiting flops calculation tools or methods, the DeepSpeed Flops Profiler measures the flops of the forward pass of a module and the flops of the backward pass is estimated as `2` times of that of the forward pass.
Different from the PyTorch profiler which calculates the flops of PyTorch operators, the DeepSpeed Flops Profiler measures the flops witin modules in a model and provides more insights to the users about the model execution.
Similar to existing flops calculation tools or methods, the DeepSpeed Flops Profiler measures the flops of the forward pass of a module and the flops of the backward pass is estimated as `2` times of that of the forward pass.
Different from the PyTorch profiler which calculates the flops of PyTorch operators, the DeepSpeed Flops Profiler measures the flops within modules in a model and provides more insights to the users about the model execution.
The flops estimation is partly inspired by [ptflops](https://github.com/sovrasov/flops-counter.pytorch) with the major difference being that the DeepSpeed Flops Profiler not only supports flops computation directly at module level, but can also capture ```torch.nn.functional``` invoked in a module to estimate the flops.
Thus the DeepSpeed Flops Profiler allows for customized modules in the model, e.g., ```ParallelTransformerLayerworks, ParallelSelfAttention, RowParallelLinear, etc.``` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM). This is in contrast to ptflops which requires users to write customized flops calculation functions for each customized module.
## Multi-GPU, Multi-node, Data Parallelism, and Model Parallelism
The DeepSpeed Flops Profiler outputs the per GPU profile as well as the world size, data parallel size, and model paralel size. 1
The DeepSpeed Flops Profiler outputs the per GPU profile as well as the world size, data parallel size, and model parallel size. 1
For models running on multi-GPU or multi-node, only change of the model parallelism (e.g. ```--model-parallel-size``` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)) affects the number of flops and parameters profiled, i.e.,
`model_parallel_size * flops = total_flops` and `model_parallel_size * parameters = total_parameters`. The data parallel size or world size (related to the number of GPUs or nodes) does not affect the per GPU profile.
@ -372,7 +372,7 @@ with torch.cuda.device(0):
#### In Model Training Workflow
To profile model forward in a training workflow, use the `FlopsProfiler`class.
The `FlopsProfiler`class provides the follwing methods:
The `FlopsProfiler`class provides the following methods:
* `start_profile()` - starts profiling
* `get_total_flops(as_string=False)` - returns the total number of MACs in the model
* `get_total_params(as_string=False)` - returns the total number of parameters in the model
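
For context, a brief usage sketch of the three methods listed above; the import path, the toy model, and the input shapes are assumptions for illustration and should be checked against the installed DeepSpeed version:

# Hypothetical usage of the FlopsProfiler methods named in this excerpt.
import torch
from deepspeed.profiling.flops_profiler.profiler import FlopsProfiler  # import path assumed

model = torch.nn.Linear(128, 64)   # toy stand-in model
batch = torch.randn(32, 128)       # toy input

prof = FlopsProfiler(model)
prof.start_profile()               # start counting
_ = model(batch)                   # one measured forward pass
print(prof.get_total_flops(as_string=True))
print(prof.get_total_params(as_string=True))
# The real profiler also exposes teardown/printing helpers not shown in this excerpt.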

View file

@ -241,13 +241,13 @@ class FlopsProfiler(object):
)
print(f'Profile Summary at step {profile_step}:')
print(
"Notations:\ndata parallel size (dp_size), model paralel size(mp_size),\nnumber of parameters (params), number of multiply-accumulate operations(MACs),\number of floating point operations (flops), floating point operations per second (FLOPS),\nfwd latency (forward propagation latency), bwd latency (backward propagation latency),\nstep (weights update latency), iter latency (sum of fwd, bwd and step latency)\n"
"Notations:\ndata parallel size (dp_size), model parallel size(mp_size),\nnumber of parameters (params), number of multiply-accumulate operations(MACs),\number of floating point operations (flops), floating point operations per second (FLOPS),\nfwd latency (forward propagation latency), bwd latency (backward propagation latency),\nstep (weights update latency), iter latency (sum of fwd, bwd and step latency)\n"
)
if self.ds_engine:
print('{:<60} {:<8}'.format('world size: ', self.ds_engine.world_size))
print('{:<60} {:<8}'.format('data parallel size: ',
self.ds_engine.dp_world_size))
print('{:<60} {:<8}'.format('model paralel size: ',
print('{:<60} {:<8}'.format('model parallel size: ',
self.ds_engine.mp_world_size))
print('{:<60} {:<8}'.format(
'batch size per GPU: ',

View file

@ -205,7 +205,7 @@ def model_parallel_cuda_manual_seed(seed):
Two set of RNG states are tracked:
default state: This is for data parallelism and is the same among a
set of model parallel GPUs but different across
different model paralle groups. This is used for
different model parallel groups. This is used for
example for dropout in the non-model-parallel regions.
model-parallel state: This state is different among a set of model
parallel GPUs, but the same across data parallel
@ -219,7 +219,7 @@ def model_parallel_cuda_manual_seed(seed):
# 2718 is just for fun and any POSITIVE value will work.
offset = seed + 2718
model_parallel_seed = offset + tp_rank
# Data parallel gets the original sedd.
# Data parallel gets the original seed.
data_parallel_seed = seed
if torch.distributed.get_rank() == 0:
@ -534,7 +534,7 @@ class CheckpointFunction(torch.autograd.Function):
global cuda_device, transport_stream, PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset
if cuda_device is None:
see_memory_usage("First Forward Begining", force=False)
see_memory_usage("First Forward Beginning", force=False)
if dist.get_rank() == 0:
logger.info(f"Activation Checkpointing Information")
logger.info(
@ -886,7 +886,7 @@ def configure(
PROFILE_TIME = profile
if CONTIGUOUS_CHECKPOINTING:
assert PARTITION_ACTIVATIONS, "Contiguous Checkpointing is only availble with partitioned activations. Set partitioned activations to true in deepspeed config"
assert PARTITION_ACTIVATIONS, "Contiguous Checkpointing is only available with partitioned activations. Set partitioned activations to true in deepspeed config"
if CONTIGUOUS_CHECKPOINTING:
assert num_layers is not None, "Must specify the number of layers with contiguous memory checkpointing"

View file

@ -46,7 +46,7 @@ DEEPSPEED_OPTIMIZERS = [
# extra optimizer parameters for adam/adamw
TORCH_ADAM_PARAM = "torch_adam"
# default to adamw logic for adam/adamw optimizers unless user explictly opts out
# default to adamw logic for adam/adamw optimizers unless user explicitly opts out
ADAM_W_MODE = "adam_w_mode"
ADAM_W_MODE_DEFAULT = True
@ -665,7 +665,7 @@ def get_dataloader_drop_last(param_dict):
'''Write deepspeed config files by modifying basic templates.
Can be used for quicly changing parameters via command line parameters.'''
Can be used for quickly changing parameters via command line parameters.'''
class DeepSpeedConfigWriter:
@ -738,7 +738,7 @@ class DeepSpeedConfig(object):
f"ds_config ({TRAIN_BATCH_SIZE}, {TRAIN_MICRO_BATCH_SIZE_PER_GPU}, and/or " \
f"{GRADIENT_ACCUMULATION_STEPS}). These parameters *will not be used* since " \
"elastic training is enabled, which takes control of these parameters. " \
"If you want to supress this error (the parameters will be silently ignored) " \
"If you want to suppress this error (the parameters will be silently ignored) " \
f"please set {IGNORE_NON_ELASTIC_BATCH_INFO}':true in your elasticity config.")
# micro_bsz * world_size * gas = total_batch_size

View file

@ -30,7 +30,7 @@ class Eigenvalue(object):
ranks=[0])
# Replace all nan/pos-inf/neg-inf to zero
# TODO: Pytorch new verion may add this function, replace this one by then.
# TODO: Pytorch new version may add this function, replace this one by then.
def nan_to_num(self, x):
device = x.device
x = x.cpu().numpy()

View file

@ -658,7 +658,7 @@ class DeepSpeedEngine(Module):
ompi_local_rank = os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
local_rank = os.environ.get('LOCAL_RANK', ompi_local_rank)
assert ompi_local_rank == local_rank, f"LOCAL_RANK ({local_rank}) != OMPI_COMM_WORLD_LOCAL_RANK ({ompi_local_rank}), " \
"not sure how to proceed as we're seeing conficting local rank info."
"not sure how to proceed as we're seeing conflicting local rank info."
os.environ['LOCAL_RANK'] = local_rank
self.local_rank = int(os.environ['LOCAL_RANK'])
@ -822,7 +822,7 @@ class DeepSpeedEngine(Module):
if not self.amp_enabled():
self._broadcast_model()
#check if parmaeters are duplicated in optimizer param_groups
#check if parameters are duplicated in optimizer param_groups
def _check_for_duplicates(self, optimizer):
for name, param in self.module.named_parameters():
param_id = id(param)
@ -830,12 +830,12 @@ class DeepSpeedEngine(Module):
def ids_list(group):
return [id(param) for param in group]
occurance = sum([
occurrence = sum([
ids_list(group['params']).count(param_id)
if param_id in ids_list(group['params']) else 0
for group in optimizer.param_groups
])
assert occurance <= 1, f"Parameter with name: {name} occurs multiple times in optimizer.param_groups. Make sure it only appears once to prevent undefined behaviour."
assert occurrence <= 1, f"Parameter with name: {name} occurs multiple times in optimizer.param_groups. Make sure it only appears once to prevent undefined behaviour."
# Configure optimizer
def _configure_optimizer(self, client_optimizer, model_parameters):
@ -918,7 +918,7 @@ class DeepSpeedEngine(Module):
torch_adam = optimizer_parameters.pop(TORCH_ADAM_PARAM, False)
adam_w_mode = optimizer_parameters.pop(ADAM_W_MODE, ADAM_W_MODE_DEFAULT)
# Optimizer name of Adam forces AdamW logic unless adam_w_mode is explictly set
# Optimizer name of Adam forces AdamW logic unless adam_w_mode is explicitly set
effective_adam_w_mode = self.optimizer_name(
) == ADAMW_OPTIMIZER or adam_w_mode
@ -1225,7 +1225,7 @@ class DeepSpeedEngine(Module):
if route == ROUTE_TRAIN:
deepspeed_io_timer = self.tput_timer
# If mpu is provied, forward world size and parallel rank to sampler.
# If mpu is provided, forward world size and parallel rank to sampler.
data_parallel_world_size = None
data_parallel_rank = None
if self.mpu is not None:
@ -1506,7 +1506,7 @@ class DeepSpeedEngine(Module):
report_progress = self.global_rank == 0 if self.global_rank else True
# Check overlow here since in DS fp16 optimizer, the overflow is updated in above step() function.
# Check overflow here since in DS fp16 optimizer, the overflow is updated in above step() function.
overflow = False
if hasattr(self.optimizer, 'overflow'):
overflow = self.optimizer.overflow

View file

@ -115,7 +115,7 @@ class OnebitAdam(torch.optim.Optimizer):
grads (list of tensors, optional): weight gradient to use for the
optimizer update. If gradients have type torch.half, parameters
are expected to be in type torch.float. (default: None)
output params (list of tensors, optional): A reduced recision copy
output params (list of tensors, optional): A reduced precision copy
of the updated weights written out in addition to the regular
updated weights. Have to be of same type as gradients. (default: None)
scale (float, optional): factor to divide gradient tensor values

View file

@ -361,7 +361,7 @@ class LRRangeTest(object):
self.step_rate = lr_range_test_step_rate
self.last_batch_iteration = last_batch_iteration
self.staircase = lr_range_test_staircase
self.interval_fn = self._staircase_interval if lr_range_test_staircase else self._continous_interval
self.interval_fn = self._staircase_interval if lr_range_test_staircase else self._continuous_interval
if last_batch_iteration == -1:
self._update_optimizer(self.min_lr)
@ -369,7 +369,7 @@ class LRRangeTest(object):
def _staircase_interval(self):
return math.floor(float(self.last_batch_iteration + 1) / self.step_size)
def _continous_interval(self):
def _continuous_interval(self):
return float(self.last_batch_iteration + 1) / self.step_size
def _get_increase(self):
@ -514,7 +514,7 @@ class OneCycle(object):
decay_mom_rate,
last_batch_iteration)
# Initalize batch iteration tracker
# Initialize batch iteration tracker
self.last_batch_iteration = last_batch_iteration
# Configure cycle shape

View file

@ -175,7 +175,7 @@ class PipelineModule(nn.Module):
topology = PipeDataParallelTopology(num_pp=num_stages, num_dp=dp)
self._topo = topology
# Contruct communicators for pipeline topology
# Construct communicators for pipeline topology
self._grid = PipelineParallelGrid(process_group=self.world_group,
topology=self._topo)

View file

@ -177,7 +177,7 @@ def _get_send_recv_group(src_stage, dest_stage):
stage_id = src_stage
'''group_id corresponds to group of [group_id, group_id+1]
unless group_id is the rank of the last stage
in which case group_id correspods to group[group_id-num_stages+1, group_id]
in which case group_id corresponds to group[group_id-num_stages+1, group_id]
'''
group_id = _grid.stage_to_global(stage_id=stage_id)

View file

@ -392,7 +392,7 @@ class ForwardPass(BufferOpInstruction):
.. code-block:: python
buffers['ouputs'][buffer_id] = forward(buffers['inputs'][buffer_id])
buffers['outputs'][buffer_id] = forward(buffers['inputs'][buffer_id])
"""
pass
@ -404,7 +404,7 @@ class BackwardPass(BufferOpInstruction):
.. code-block:: python
outputs = buffers['ouputs'][buffer_id]
outputs = buffers['outputs'][buffer_id]
gradients = buffers['gradients'][buffer_id]
torch.autograd.backward(tensors=outputs,
grad_tensors=gradients)

View file

@ -233,7 +233,7 @@ def _prime_factors(N):
class PipeDataParallelTopology(ProcessTopology):
""" A topology specialiation for hybrid data and pipeline parallelism.
""" A topology specialization for hybrid data and pipeline parallelism.
Uses data parallelism on the last dimension to encourage gradient
reductions to use high-bandwidth intra-node links and lower-volume

View file

@ -154,7 +154,7 @@ class Quantizer(object):
f'Quantization settings: current bit-precision = {self.q_start_bits[index]}, step = {self.qsteps}, quantization period = {self.q_period[index]}, index = {index}'
)
assert (self.q_start_bits[index] >= self.q_target_bits), \
'Quantization bit is lower thab target precision bits!'
'Quantization bit is lower than target precision bits!'
# quantize the weights base on the selected bits and the value-range
if not self.use_quantizer_kernel:

View file

@ -63,7 +63,7 @@ class SDLoaderBase(ABC):
a. if no mp_size resizing occurs, for both training & inference, loading
the mp_rank related checkpoint directly.
b. if has mp_size resizing, only Megatron model inference is supported,
checkpoint file(s) will be merged/splitted according to mp_rank, mp_world_size and
checkpoint file(s) will be merged/split according to mp_rank, mp_world_size and
checkpoint file list.
3. Non-PipeModule loading mp_rank_*.pt files, is_pipe_parallel=False
@ -433,7 +433,7 @@ class MegatronSDLoader(SDLoaderBase):
sd = torch.load(ckpt_file_name, map_location=lambda storage, loc: storage)
# partail_key is a sub-string of one key in the sd
# partial_key is a sub-string of one key in the sd
def check_key_exist(partial_key, sd):
keys = sd.keys()
found = False

View file

@ -461,12 +461,12 @@ class OptimizerSwapper(object):
self._stop_timer(UNSWAPPED_READ_GRADIENTS)
self._log_timers([UNSWAPPED_READ_GRADIENTS])
# It shoud be safe to discard unswapped gradient partitions
# It should be safe to discard unswapped gradient partitions
swap_info.release_unswapped_gradients()
if SWAPPER_DEBUG_MODE:
logger.info(
f'optimizer_retreive_unswapped_radients: param={swap_info.param_id} tensor_count={tensor_count} elem_count={num_elem_count}'
f'optimizer_retrieve_unswapped_gradients: param={swap_info.param_id} tensor_count={tensor_count} elem_count={num_elem_count}'
)
def _get_state_tensors(self, parameter):

View file

@ -124,7 +124,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper):
return
self._start_timer(SWAP_OUT_PARAM_TIMER)
pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths = self._seperate_pinned_tensors(swap_info)
pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths = self._separate_pinned_tensors(swap_info)
swap_bytes = sum([
self._io_aligned_numel(t.numel()) * t.element_size()
for t in swap_info.tensors
@ -203,7 +203,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper):
if DEBUG_MODE and torch.distributed.get_rank() == 0:
logger.info(f'optimizer_param_swap_in: {(swap_bytes/(1024**3)):5.2f} GB')
def _seperate_pinned_tensors(self, swap_info):
def _separate_pinned_tensors(self, swap_info):
pinned_tensors = []
pinned_paths = []

View file

@ -125,7 +125,7 @@ class AsyncPartitionedParameterSwapper(object):
self.swap_out_params = []
#Check if partiitoned param or numel in a tensor is swappable or not
#Check if partitioned param or numel in a tensor is swappable or not
def swappable_tensor(self, param=None, numel=None):
if param is not None:
assert numel is None, "Both parma and numel cannot be provided"

View file

@ -106,7 +106,7 @@ ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE = 'stage3_prefetch_bucket_size'
ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE_DEFAULT = 50000000
#parameters smaller than the threshold are only communicated once after the
#parameters are updated and are persisted thoughout the trainging
#parameters are updated and are persisted throughout the training
#avoid tons of latency bound communication
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD = 'stage3_param_persistence_threshold'
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT = 100000
@ -125,7 +125,7 @@ ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS_DEFAULT = True
ZERO_OPTIMIZATION_LEGACY_STAGE1 = "legacy_stage1"
ZERO_OPTIMIZATION_LEGACY_STAGE1_DEFAULT = False
# Stage 2 - partition gradients in a round robin fashsion to load-balance reduction and offload copying
# Stage 2 - partition gradients in a round robin fashion to load-balance reduction and offload copying
ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS = 'round_robin_gradients'
ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS_DEFAULT = False

View file

@ -10,7 +10,7 @@ class ContiguousMemoryAllocator(object):
def __init__(self, size, dtype, device):
self.buffer = torch.zeros(size, dtype=dtype, device=device)
#address to contiguous size avaialble
#address to contiguous size available
self.contiguous_sizes = {}
self.contiguous_sizes[0] = size
@ -65,7 +65,7 @@ class ContiguousMemoryAllocator(object):
print_rank_0(
f"Free before allocation {free_before}. Allocating {size}. Free after allocation {self.total_free}. Max allocated {self.max_allocated}"
)
assert self.total_free + size == free_before, "Allcation bookeeping error"
assert self.total_free + size == free_before, "Allocation bookkeeping error"
return ret_tensor

View file

@ -1,14 +1,14 @@
#Linear Module to use with ZeRO Stage 3 to allow for parameter memory release
#after the module execution during forward
#Instead of saving variables using save_for_backward, we save variable ids
#Allowing us to retrive the variable without creating pointer to it
#Allowing us to retrieve the variable without creating pointer to it
#Which allows for underlying tensor to be garbage collected
#When partitioned as needed by the Zero Stage 3 optimizer
#TODO instead of patching Linear module, we could patch the ctx.save_for_backward
#ctx.saved_tensors so that this approach works for all nn modules that are built upon
#torch.nn.function. However the issue is that many modules uses C++ implementations
#which does not have pytroch implementation. Eg torch.addmm which acts as a funcitonal
#when implemeted outside of torch.autograd.Function
#which does not have pytorch implementation. Eg torch.addmm which acts as a functional
#when implemented outside of torch.autograd.Function
import math

View file

@ -379,7 +379,7 @@ class Init(InsertPostInitMethodToModuleSubClasses):
effect. Defaults to ``True``.
dtype (``dtype``, optional): Can be used to change the data type of the parameters.
Supported options are ``torch.half`` and ``torch.float``. Defaults to ``None``
mpu (``object``, optional): A model parallelism unit object that implements get_{model,data}_parallel_{rank,group,wolrd_size}.
mpu (``object``, optional): A model parallelism unit object that implements get_{model,data}_parallel_{rank,group,world_size}.
This context accelerates model initialization and enables models that
are too large to allocate in their entirety in CPU memory. It has the
@ -1059,7 +1059,7 @@ class Init(InsertPostInitMethodToModuleSubClasses):
if not accumulate:
dest_tensor.copy_(src_tensor)
# if source and destinatoin are on same device,
# if source and destination are on same device,
# add to the provided buffer
elif src_tensor.device == dest_tensor.device:
dest_tensor.add_(src_tensor)

View file

@ -195,7 +195,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
self.fp16_master_weights_and_gradients = fp16_master_weights_and_gradients
if self.fp16_master_weights_and_gradients:
assert self.cpu_offload and type(self.optimizer) in [DeepSpeedCPUAdam], f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32. Currenty only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}. Either disable fp16_master_weights_and_gradients or enable ZeRO-2 Offload with DeepSpeedCPUAdam"
assert self.cpu_offload and type(self.optimizer) in [DeepSpeedCPUAdam], f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32. Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}. Either disable fp16_master_weights_and_gradients or enable ZeRO-2 Offload with DeepSpeedCPUAdam"
if self.reduce_scatter:
assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled"
@ -223,7 +223,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
# These are the parameters that will be updated by this process directly
self.params_in_partition = []
# Offset from the first paramter in the the self.params_in_partition
# Offset from the first parameter in the self.params_in_partition
# the parameter boundaries may not align with partition boundaries
# so we need to keep track of the offset
self.first_offset = []
@ -909,7 +909,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
partition_id, offset = partition_ids_w_offsets[idx]
# if dist.get_rank() == 0 and count < 100:
# print(f"Rank {dist.get_rank()} rank offet id {idx} calculated dp size {dist.get_world_size(group=process_group)} real dp size {dist.get_world_size(self.real_dp_process_group[i])} and dst: {partition_id}")
# print(f"Rank {dist.get_rank()} rank offset id {idx} calculated dp size {dist.get_world_size(group=process_group)} real dp size {dist.get_world_size(self.real_dp_process_group[i])} and dst: {partition_id}")
# count += 1
# Calculate numel for grad slice depending on partition location
@ -937,7 +937,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
for i, (dst, bucket_offset, numel) in enumerate(rank_and_offsets):
grad_slice = tensor.narrow(0, int(bucket_offset), int(numel))
# if dist.get_rank() == 0:
# print(f"Rank {dist.get_rank()} rank offet id {i} real dp size {dist.get_world_size(group=real_dp_process_group[i])} and dst: {dst}")
# print(f"Rank {dist.get_rank()} rank offset id {i} real dp size {dist.get_world_size(group=real_dp_process_group[i])} and dst: {dst}")
# dist.barrier()
#dist.barrier()
dst_rank = _get_global_rank(real_dp_process_group[i], dst)
@ -1008,7 +1008,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
dest_offset,
num_elements)
#accumulate gradients into param.grad or parts of it that belongs to this parittion
#accumulate gradients into param.grad or parts of it that belong to this partition
def accumulate_gradients():
if not self.fp16_master_weights_and_gradients:
dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1),
@ -1167,7 +1167,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
device=torch.cuda.current_device())
see_memory_usage(f"after copying {total_size} gradients into partition")
# The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer
# The allreduce buffer will be rewritten. Copy the gradients in partition to a new buffer
new_grad_tensor = self.grads_in_partition.view(-1).narrow(
0,
self.grads_in_partition_offset,
@ -1257,7 +1257,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
self.sequential_execution(print_func, message)
def get_grads_to_reduce(self, i, partition_id):
def get_reducable_portion(key):
def get_reducible_portion(key):
grad = self.param_dict[key].grad
total_elements = grad.numel()
start = self.grad_start_offset[i][partition_id][key]
@ -1283,7 +1283,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
grads_to_reduce = []
for key in self.is_grad_computed[i][partition_id]:
grad = get_reducable_portion(key)
grad = get_reducible_portion(key)
grads_to_reduce.append(grad)
return grads_to_reduce
@ -1500,7 +1500,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
else:
total_norm = 0.0
# if dist.get_rank() == 0:
# logger.info(f"Total Norm begining {total_norm}")
# logger.info(f"Total Norm beginning {total_norm}")
for g, p in zip(gradients, params):
# Pipeline parallelism may replicate parameters. Avoid multi-counting.
if hasattr(p, 'ds_pipe_replicated') and p.ds_pipe_replicated:
@ -1655,7 +1655,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
self.get_grad_norm_direct(self.averaged_gradients[i],
self.params_in_partition[i]))
# free gradients for all the prameters that are not updated by this process
# free gradients for all the parameters that are not updated by this process
self.free_grad_in_param_list(self.params_not_in_partition[i])
# create a flat gradients for parameters updated by this process


@ -100,7 +100,7 @@ def _apply_to_tensors_only(module, functional, backward_function, outputs):
return outputs
#for each tensor in outputs run the forward_funciton and register backward_function as hook
#for each tensor in outputs run the forward_function and register backward_function as hook
def _apply_forward_and_backward_to_tensors_only(module,
forward_function,
backward_function,
@ -178,7 +178,7 @@ class PrefetchCoordinator(object):
# maps sub_module id to submodule objects
self.id_to_sub_module_map = {}
# stores the total number of parmeters in each sub_module
# stores the total number of parameters in each sub_module
self.id_to_sub_module_size_map = {}
self.trace_completed = False
@ -362,7 +362,7 @@ class PartitionedParameterCoordinator(object):
for param in params_to_prefetch:
param.ds_status = ZeroParamStatus.INFLIGHT
# keeping track of number of elements consumed by available parmaeters
# keeping track of number of elements consumed by available parameters
self._increment_available_parameter_numel(param.ds_numel)
if nvme:
@ -432,7 +432,7 @@ class PartitionedParameterCoordinator(object):
)
partitioned_params.append(param)
# keeping track of number of elements consumed by available parmaeters
# keeping track of number of elements consumed by available parameters
self._increment_available_parameter_numel(param.ds_numel)
print_rank_0(f"Incrementing with parameter id {param.ds_id}")
@ -504,7 +504,7 @@ class PartitionedParameterCoordinator(object):
param.ds_active_sub_modules = 0
if param.ds_status == ZeroParamStatus.AVAILABLE:
print_rank_0(
f"Releasing unpartitioned param {debug_param2name_id_numel(param)} active sub-modules {param.ds_active_sub_modules} and persisitence {param.ds_persist}"
f"Releasing unpartitioned param {debug_param2name_id_numel(param)} active sub-modules {param.ds_active_sub_modules} and persistence {param.ds_persist}"
)
self._decrement_available_parameter_numel(param.ds_numel)
param.partition()
@ -810,7 +810,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
see_memory_usage(f"After creating fp16 partitions: {num_fp16_subgroups}",
force=False)
# Optimizer ensor swapping
# Optimizer tensor swapping
if self.swap_optimizer:
self._configure_tensor_swapping(offload_optimizer_config, aio_config)
@ -902,7 +902,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
# stores if a grad in a partition has been computed or not
self.is_grad_computed = {}
# will store the averaged gradients required by this parititon
# will store the averaged gradients required by this partition
self.averaged_gradients = {}
#creates backward hooks for gradient partitioning
@ -1046,7 +1046,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
src.data = dest.data
src.status = PartitionedParamStatus.AVAILABLE
else:
assert src.status == PartitionedParamStatus.AVAILABLE, "Partitioned Parm must be avialable here"
assert src.status == PartitionedParamStatus.AVAILABLE, "Partitioned Param must be available here"
if not avoid_copy:
dest.data.copy_(src.data)
src.data = dest.data
@ -1175,7 +1175,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
flat_offset,
total_elements)
print_rank_0(
f"Creating a flat buffer for subgroup {i} requiring {total_elements} elements, and cumulative CPU elemets {flat_offset + total_elements}",
f"Creating a flat buffer for subgroup {i} requiring {total_elements} elements, and cumulative CPU elements {flat_offset + total_elements}",
force=False)
#these parameters reside in NVME and
elif self.params_in_nvme_and_cpu:
@ -1410,7 +1410,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
self.module.register_forward_hook(_end_of_forward_hook)
self.module.register_forward_pre_hook(_pre_forward_hook)
# Add top todule to stack trace
# Add top module to stack trace
global FWD_MODULE_STACK
FWD_MODULE_STACK.append(self.module)
@ -1793,7 +1793,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
see_memory_usage(f"End ipg_epilogue", force=False)
# resets all partition to no reduced
# sets remianing grads to the total number of grads in each partition
# sets remaining grads to the total number of grads in each partition
# set is grad computed to false for all grads in partition
def reset_partition_gradient_structures(self):
total_partitions = dist.get_world_size(group=self.dp_process_group)
@ -2140,7 +2140,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
# self.reduction_stream.synchronize()
if self.gradient_accumulation_steps > 1:
# The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer
# The allreduce buffer will be rewritten. Copy the gradients in partition to a new buffer
fp16_grad_tensor = self.grads_in_partition[i].narrow(
0,
dest_offset,
@ -2173,7 +2173,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
param,
fp32_grad_tensor)
else:
# The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer
# The allreduce buffer will be rewritten. Copy the gradients in partition to a new buffer
fp16_grad_tensor = self.grads_in_partition[i].narrow(
0,
dest_offset,
@ -2248,7 +2248,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
self.sequential_execution(print_func, message)
def get_grads_to_reduce(self, i, partition_id):
def get_reducable_portion(key):
def get_reducible_portion(key):
grad = self.param_dict[key].grad
total_elements = grad.numel()
start = self.grad_start_offset[i][partition_id][key]
@ -2274,7 +2274,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
grads_to_reduce = []
for key in self.is_grad_computed[i][partition_id]:
grad = get_reducable_portion(key)
grad = get_reducible_portion(key)
grads_to_reduce.append(grad)
return grads_to_reduce
@ -2475,7 +2475,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
else:
total_norm = 0.0
# if dist.get_rank() == 0:
# logger.info(f"Total Norm begining {total_norm}")
# logger.info(f"Total Norm beginning {total_norm}")
for g, p in zip(gradients, params):
if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0):
param_norm = g.data.double().norm(2)
@ -2819,7 +2819,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
return
def dump_pre_step_gradients(self, debug_fp32_grads):
# Dump gradient norms for debbuging
# Dump gradient norms for debugging
for i, _ in enumerate(self.fp16_groups):
print(f'Pre-Step Dump Norms for Group {i} FP16P, FP16G, FP32G, FP32GUC')
for fp16_param, fp32_grad in zip(self.fp16_groups[i], debug_fp32_grads[i]):
@ -2831,7 +2831,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
print(f'Pre-Step Norms {i} {param_id} = {norm_list}')
def dump_post_step_gradients(self):
# Dump gradient norms for debbuging
# Dump gradient norms for debugging
for i, group in enumerate(self.fp16_groups):
print(
f'Post-Step Dump Norms for Group {i} FP16P, FP16DS, FP16FLAT, FP32FLAT')
@ -3137,7 +3137,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
def refresh_fp32_params(self):
self._restore_from_fp16_weights()
# Extract flattened partion for current rank from all partitions
# Extract flattened partition for current rank from all partitions
def _get_flattened_partition(self, all_partition_states):
partition_id = dist.get_rank(group=self.dp_process_group)
alignment = dist.get_world_size(group=self.dp_process_group)


@ -100,7 +100,7 @@ def initialize(ep_size=1, mpu=None):
Arguments:
ep_size (int, optional): default=1, expert parallel size
mpu (module, optional): default=None, model parallel unit (e.g., from Megatron)
that descibes model/data parallel ranks.
that describes model/data parallel ranks.
"""
if mpu is not None:
@ -121,7 +121,7 @@ def initialize_model_parallel(model_parallel_size_):
Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
use 2 GPUs to parallelize the model. The present function will
create 4 model parallel groups and 2 data parallel grous as:
create 4 model parallel groups and 2 data parallel groups as:
4 model parallel groups:
[g0, g1], [g2, g3], [g4, g5], [g6, g7]
2 data parallel groups:
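The data-parallel group listing is truncated in the hunk above. The sketch below (illustrative only, not the DeepSpeed/Megatron implementation) derives both groupings for the stated example of 8 GPUs with 2-way model parallelism:

```python
# Derive the groupings described above for world_size = 8, model_parallel_size = 2.
world_size = 8
model_parallel_size = 2

model_parallel_groups = [list(range(i, i + model_parallel_size))
                         for i in range(0, world_size, model_parallel_size)]
data_parallel_groups = [list(range(i, world_size, model_parallel_size))
                        for i in range(model_parallel_size)]

print(model_parallel_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]
print(data_parallel_groups)   # [[0, 2, 4, 6], [1, 3, 5, 7]]
```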


@ -269,7 +269,7 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
model.load_state_dict(state_dict)
# submit to model hub or save the model to share with others
In this example the ``model`` will no longer be useable in the deepspeed context of the same
In this example the ``model`` will no longer be usable in the deepspeed context of the same
application. i.e. you will need to re-initialize the deepspeed engine, since
``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
@ -332,7 +332,7 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
# submit to model hub or save the model to share with others
Note, that once this was run, the ``model`` will no longer be useable in the deepspeed context
Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
of the same application. i.e. you will need to re-initialize the deepspeed engine, since
``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
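A minimal usage sketch of the two helpers discussed above. The signatures come from the surrounding docstrings; the import path, checkpoint directory, and the stand-in model are assumptions for illustration:

```python
import torch
# Assumed import path for the ZeRO checkpoint helpers.
from deepspeed.utils.zero_to_fp32 import (
    get_fp32_state_dict_from_zero_checkpoint,
    load_state_dict_from_zero_checkpoint,
)

checkpoint_dir = "./my_run/checkpoints"   # hypothetical checkpoint directory
model = torch.nn.Linear(4, 4)             # stand-in for the real model

# Option 1: build the consolidated fp32 state_dict and apply it yourself.
state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)
model.load_state_dict(state_dict)

# Option 2: let the helper load the weights in place. As noted above, `model`
# is then no longer usable inside the same DeepSpeed engine.
model = load_state_dict_from_zero_checkpoint(model, checkpoint_dir)
```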


@ -4,7 +4,7 @@ title: "DeepSpeed Configuration JSON"
### Batch Size Related Parameters
**Note:** <i>**train_batch_size**</i> must be equal to <i>**train_micro_batch_size_per_gpu**</i> * <i>**gradient_accumulation**</i> * number of GPUs. For simplicty, you can choose to only specify two of the three parameters, the last one will be inferred automatically by DeepSpeed.
**Note:** <i>**train_batch_size**</i> must be equal to <i>**train_micro_batch_size_per_gpu**</i> * <i>**gradient_accumulation**</i> * number of GPUs. For simplicity, you can choose to only specify two of the three parameters, the last one will be inferred automatically by DeepSpeed.
{: .notice--warning}
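A quick arithmetic check of the relation stated in the note (numbers are illustrative):

```python
# train_batch_size = micro batch per GPU * gradient accumulation steps * number of GPUs
train_micro_batch_size_per_gpu = 4
gradient_accumulation_steps = 8
num_gpus = 2

train_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * num_gpus
assert train_batch_size == 64   # specify any two of the values and the third is implied
```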
<i>**train_batch_size**</i>: [integer]
@ -662,7 +662,7 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s
| Description | Default |
| -------------------------------------------------------------------------------------------------------- | ------- |
| Total number of activation checkpoints used to allocate memory buffer for contiguous_memoty_optimization | `None` |
| Total number of activation checkpoints used to allocate memory buffer for contiguous_memory_optimization | `None` |
<i>**synchronize_checkpoint_boundary**</i>: [boolean]


@ -37,7 +37,7 @@ Figure 2 shows another mixed-precision quantization that sets target bits as 4,
![MoQ (mixed-precision)](/assets/images/bingbert-mixedbit.png){: .align-center}
Figure 3: Mixed-precision quantization with MoQ for Bert SQuAD plus.
As another example, we use eigenvalue-based MoQ to quantize Bert-Large for SQuAD finetuning. Figure 3 shows the number of bits we get to at the end of finetuning on each layer. Here, we see slightly different percision spectrum compared to BertBase on GLUE tasks. As the figure shows, we can reducethe precision on the first few layers more aggressively than the middle ones. Also, the last few layers can tollerate very low precision similar to the beginning layers. This way of quantization finally results in 90.56 F1 Score which is prettey similar to the baseline.
As another example, we use eigenvalue-based MoQ to quantize Bert-Large for SQuAD finetuning. Figure 3 shows the number of bits we get to at the end of finetuning on each layer. Here, we see a slightly different precision spectrum compared to BertBase on GLUE tasks. As the figure shows, we can reduce the precision on the first few layers more aggressively than on the middle ones. Also, the last few layers can tolerate very low precision, similar to the beginning layers. This way of quantization finally results in a 90.56 F1 score, which is pretty similar to the baseline.
## Quantized Inference Kernels


@ -1,6 +1,6 @@
---
layout: single
title: "DeepSpeed Inference: Multi-GPU inference with customized inference kerenls and quantization support"
title: "DeepSpeed Inference: Multi-GPU inference with customized inference kernels and quantization support"
excerpt: ""
categories: news
new_post: false


@ -27,7 +27,7 @@ MoQ quantization schedule is defined by a number of parameters which allow users
`quantize_groups`: Quantization groups, which shows the number of scales used to quantize a model, default is 1.
`quantize_bits`, The numer of bits to control the data-precision transition from a start-bit to thhe final target-bits (e.g. starting from 16-bit down to 8-bit).
`quantize_bits`, The number of bits to control the data-precision transition from a start-bit to the final target-bits (e.g. starting from 16-bit down to 8-bit).
`start_bits`: The start bits in quantization training. Default is set to 16.
`target_bits`: The target bits in quantization training. Default is set to 16.
@ -63,7 +63,7 @@ MoQ quantization schedule is defined by a number of parameters which allow users
## How to Use MoQ for GLUE Training Tasks
Before fine-tunning the GLUE tasks using DeepSpeed MoQ, you need:
Before fine-tuning the GLUE tasks using DeepSpeed MoQ, you need:
1. Install DeepSpeed.
2. Checkout Huggingface transformers branch, install it with all required packages.
@ -206,6 +206,6 @@ As we see in the following table, MoQ consistently preserve accuracy across diff
### Tips
When using the MoQ, one needs to consider the number of samples and training iterations before setting the correct quatization period or offset to make sure that the quantization reaches the desired level of precision before training finishes.
When using the MoQ, one needs to consider the number of samples and training iterations before setting the correct quantization period or offset to make sure that the quantization reaches the desired level of precision before training finishes.
Enabling eigenvalues for quantization dynamically adjust the quantization period on the different parts of the network. This has two positive impact: 1) the quantized network can potentially produce higher accuracy than quantizing each layer with same `quantize_period` ; 2) it automatically identifies a good quantization schedule for each layer based on its senitivity.
Enabling eigenvalues for quantization dynamically adjusts the quantization period on the different parts of the network. This has two positive impacts: 1) the quantized network can potentially produce higher accuracy than quantizing each layer with the same `quantize_period`; 2) it automatically identifies a good quantization schedule for each layer based on its sensitivity.
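A sketch of how these knobs can be grouped in a DeepSpeed config dict. The parameter names come from the list above, but the exact key nesting varies between releases and is an assumption here, so check the MoQ documentation for the authoritative layout:

```python
# Illustrative only: key nesting is assumed, values are examples.
moq_config = {
    "quantize_training": {
        "enabled": True,
        "quantize_groups": 8,           # number of quantization scales
        "quantize_bits": {
            "start_bits": 16,           # precision at the start of training
            "target_bits": 8,           # precision to converge to
        },
        "quantize_schedule": {
            "quantize_period": 400,     # steps between precision reductions
            "schedule_offset": 0,       # step at which quantization starts
        },
    }
}
```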


@ -26,7 +26,7 @@ ds_report
## Pre-install DeepSpeed Ops
**Note:** [PyTorch](https://pytorch.org/) must be installed _before_ pre-compiling any DeepSpeed c++/cuda ops. However, this is not required if using the default mode of JIT compilition of ops.
**Note:** [PyTorch](https://pytorch.org/) must be installed _before_ pre-compiling any DeepSpeed c++/cuda ops. However, this is not required if using the default mode of JIT compilation of ops.
{: .notice--info}
Sometimes we have found it useful to pre-install either some or all DeepSpeed


@ -139,14 +139,14 @@ The DeepSpeed Flops Profiler can be used with the DeepSpeed runtime without any
## Flops Measurement
Similar to exsiting flops calculation tools or methods, the DeepSpeed Flops Profiler measures the flops of the forward pass of a module and the flops of the backward pass is estimated as `2` times of that of the forward pass.
Different from the PyTorch profiler which calculates the flops of PyTorch operators, the DeepSpeed Flops Profiler measures the flops witin modules in a model and provides more insights to the users about the model execution.
Similar to existing flops calculation tools or methods, the DeepSpeed Flops Profiler measures the flops of the forward pass of a module and the flops of the backward pass is estimated as `2` times of that of the forward pass.
Different from the PyTorch profiler which calculates the flops of PyTorch operators, the DeepSpeed Flops Profiler measures the flops within modules in a model and provides more insights to the users about the model execution.
The flops estimation is partly inspired by [ptflops](https://github.com/sovrasov/flops-counter.pytorch) with the major difference being that the DeepSpeed Flops Profiler not only supports flops computation directly at module level, but can also capture ```torch.nn.functional``` invoked in a module to estimate the flops.
Thus the DeepSpeed Flops Profiler allows for customized modules in the model, e.g., ```ParallelTransformerLayerworks, ParallelSelfAttention, RowParallelLinear, etc.``` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM). This is in contrast to ptflops which requires users to write customized flops calculation functions for each customized module.
## Multi-GPU, Multi-node, Data Parallelism, and Model Parallelism
The DeepSpeed Flops Profiler outputs the per GPU profile as well as the world size, data parallel size, and model paralel size. 1
The DeepSpeed Flops Profiler outputs the per GPU profile as well as the world size, data parallel size, and model parallel size. 1
For models running on multi-GPU or multi-node, only change of the model parallelism (e.g. ```--model-parallel-size``` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)) affects the number of flops and parameters profiled, i.e.,
`model_parallel_size * flops = total_flops` and `model_parallel_size * parameters = total_parameters`. The data parallel size or world size (related to the number of GPUs or nodes) does not affect the per GPU profile.
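Worked numbers for the relations above (purely illustrative):

```python
# Backward flops are estimated as 2x the forward flops, and only the
# model-parallel degree scales the per-GPU numbers up to the totals.
fwd_flops = 10e9
bwd_flops = 2 * fwd_flops

model_parallel_size = 4
per_gpu_flops = 5e9                    # profiled on one model-parallel rank
per_gpu_params = 250e6

total_flops = model_parallel_size * per_gpu_flops
total_parameters = model_parallel_size * per_gpu_params
```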
@ -376,7 +376,7 @@ with torch.cuda.device(0):
#### In Model Training Workflow
To profile model forward in a training workflow, use the `FlopsProfiler`class.
The `FlopsProfiler`class provides the follwing methods:
The `FlopsProfiler` class provides the following methods:
* `start_profile()` - starts profiling
* `get_total_flops(as_string=False)` - returns the total number of MACs in the model
* `get_total_params(as_string=False)` - returns the total number of parameters in the model
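A minimal sketch of driving these methods around a single forward pass. The import path, constructor call, and the `end_profile()` cleanup are assumptions based on the method list here, so verify them against the profiler documentation:

```python
import torch
from deepspeed.profiling.flops_profiler import FlopsProfiler  # assumed import path

model = torch.nn.Linear(128, 128)      # stand-in for the real model
prof = FlopsProfiler(model)

prof.start_profile()
model(torch.randn(4, 128))             # forward pass to be measured
flops = prof.get_total_flops(as_string=True)
params = prof.get_total_params(as_string=True)
prof.end_profile()                     # assumed cleanup call

print(flops, params)
```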


@ -7,7 +7,7 @@ This tutorial describes how to use [PyTorch Profiler](https://pytorch.org/blog/i
PyTorch Profiler is an open-source tool that enables accurate and efficient performance analysis and troubleshooting for large-scale deep learning models. The profiling results can be outputted as a `.json` trace file and viewed in Google Chrome's trace viewer (chrome://tracing).
Microsoft Visual Studio Code's Python extension integrates TensorBoard into the code editor, including the support for the PyTorch Profiler.
For more detais, refer to [PYTORCH PROFILER](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html#pytorch-profiler).
For more details, refer to [PYTORCH PROFILER](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html#pytorch-profiler).
## Profile the model training loop


@ -148,7 +148,7 @@ Please refer to the Docstrings for details of how to use each module separately.
## How to config sparsity structures
Following we describe supported sparsity structures, their parameter set and the flexibility of adding arbitrary sparsity pattern on the self-attention layer. You can update DeepSpeed config file using any of the supported sparsity structures and set the parameters accordingly.
* **SpasityConfig**:
* **SparsityConfig**:
This module, is the parent class for all sparsity structures and contains the shared features of all sparsity structures. It takes the following parameters:
* `num_heads`: an integer determining number of attention heads of the layer.
* `block`: an integer determining the block size. Current implementation of sparse self-attention is based on blocked sparse matrices. In which this parameter defines size of such square blocks; `Block X Block`.
@ -164,7 +164,7 @@ This structure is based on [Generative Modeling with Sparse Transformers](https:
![Fixed sparsity structure](/assets/images/sa_fixed_sparsity_structure.png)
* **BSLongformer** (BSLongformerSparistyConfig):
* **BSLongformer** (BSLongformerSparsityConfig):
This structure is an edited version of [Longformer: The Long-Document Transformer](https://arxiv.org/pdf/2004.05150.pdf), in which instead of single token-wise sparsity, we offer block of tokens sparsity. Parameters that define this patters are:
* `num_sliding_window_blocks`: an integer determining the number of blocks in sliding local attention window.
* `global_block_indices`: a list of integers determining which blocks are considered as global attention. Given indices, determine the blocks that all other token blocks attend to and they attend to all other token blocks. Notice that if `global_block_end_indices` parameter is set, this parameter is used as starting index of each global window.


@ -25,7 +25,7 @@ to add DeepSpeed's builtin arguments to your application's parser.
Training Initialization
-----------------------
The entrypoint for all training with DeepSpeed is ``deepspeed.initialize()``. Will initialize distributed backend if it is not intialized already.
The entrypoint for all training with DeepSpeed is ``deepspeed.initialize()``. Will initialize distributed backend if it is not initialized already.
Example usage:
@ -39,6 +39,6 @@ Example usage:
Distributed Initialization
-----------------------
Optional distributed backend initializating separate from ``deepspeed.initialize()``. Useful in scenarios where the user wants to use torch distributed calls before calling ``deepspeed.initialize()``, such as when using model parallelism, pipeline parallelism, or certain data loader scenarios.
Optional distributed backend initialization separate from ``deepspeed.initialize()``. Useful in scenarios where the user wants to use torch distributed calls before calling ``deepspeed.initialize()``, such as when using model parallelism, pipeline parallelism, or certain data loader scenarios.
.. autofunction:: deepspeed.init_distributed
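A short sketch of the pattern described above: call the distributed initializer first, run any torch.distributed or model-parallel setup, then hand the model to ``deepspeed.initialize()``. The stand-in model, the config filename, and the `config=` keyword are assumptions (older releases pass the config path via `args`):

```python
import torch
import deepspeed

deepspeed.init_distributed()           # initialize the default (NCCL) backend early

# ... torch.distributed calls / model-parallel group setup can go here ...

net = torch.nn.Linear(32, 32)          # stand-in for the real model
engine, optimizer, _, _ = deepspeed.initialize(
    model=net,
    model_parameters=net.parameters(),
    config="ds_config.json",           # hypothetical config file; see note above
)
```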


@ -9,7 +9,7 @@ from .transformer import TransformerBuilder
from .stochastic_transformer import StochasticTransformerBuilder
from .utils import UtilsBuilder
from .async_io import AsyncIOBuilder
from .builder import get_default_compute_capatabilities
from .builder import get_default_compute_capabilities
from .transformer_inference import InferenceBuilder
from .quantizer import QuantizerBuilder


@ -48,7 +48,7 @@ def installed_cuda_version():
return int(cuda_major), int(cuda_minor)
def get_default_compute_capatabilities():
def get_default_compute_capabilities():
compute_caps = DEFAULT_COMPUTE_CAPABILITIES
import torch.utils.cpp_extension
if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version(
@ -172,7 +172,7 @@ class OpBuilder(ABC):
'''
Test for existence of a function within a tuple of libraries.
This is used as a smoke test to check whether a certain library is avaiable.
This is used as a smoke test to check whether a certain library is available.
As a test, this creates a simple C program that calls the specified function,
and then distutils is used to compile that program and link it with the specified libraries.
Returns True if both the compile and link are successful, False otherwise.
@ -440,7 +440,7 @@ class CUDAOpBuilder(OpBuilder):
cross_compile_archs = cross_compile_archs_env.replace(' ', ';')
else:
if cross_compile_archs is None:
cross_compile_archs = get_default_compute_capatabilities()
cross_compile_archs = get_default_compute_capabilities()
ccs = cross_compile_archs.split(';')
args = []


@ -31,7 +31,7 @@ except ImportError:
print('[WARNING] Unable to import torch, pre-compiling ops will be disabled. ' \
'Please visit https://pytorch.org/ to see how to properly install torch on your system.')
from op_builder import ALL_OPS, get_default_compute_capatabilities
from op_builder import ALL_OPS, get_default_compute_capabilities
RED_START = '\033[31m'
RED_END = '\033[0m'
@ -88,7 +88,7 @@ if torch_available and not torch.cuda.is_available():
"you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "
"(compute capabilities 6.0, 6.1, 6.2)")
if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capatabilities()
os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capabilities()
ext_modules = []


@ -105,7 +105,7 @@ def timeme():
def line_profileme():
print("--------------- line_profier -----------------")
print("--------------- line_profiler -----------------")
print("py")
profile(py)()
gc.collect()


@ -3,7 +3,7 @@
helpFunction()
{
echo ""
echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s stpes -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers -p [-d]"
echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s steps -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers -p [-d]"
echo -e "\t-m model parallelism"
echo -e "\t-g gpus per node"
echo -e "\t-n node count"


@ -2,7 +2,7 @@
# https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/modeling.py
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -295,7 +295,7 @@ class BertConfig(object):
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
hidden_dropout_prob: The dropout probabilitiy for all fully connected
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
@ -800,7 +800,7 @@ class BertPreTrainingHeads(nn.Module):
class BertPreTrainedModel(nn.Module):
""" An abstract class to handle weights initialization and
a simple interface for dowloading and loading pretrained models.
a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super(BertPreTrainedModel, self).__init__()
@ -856,7 +856,7 @@ class BertPreTrainedModel(nn.Module):
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
@ -977,7 +977,7 @@ class BertModel(BertPreTrainedModel):
`output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
Outputs: Tuple of (encoded_layers, pooled_output)
`encoded_layers`: controled by `output_all_encoded_layers` argument:
`encoded_layers`: controlled by `output_all_encoded_layers` argument:
- `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],


@ -2,7 +2,7 @@
# https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/modeling.py
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -295,7 +295,7 @@ class BertConfig(object):
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
hidden_dropout_prob: The dropout probabilitiy for all fully connected
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
@ -895,7 +895,7 @@ class BertPreTrainingHeads(nn.Module):
class BertPreTrainedModel(nn.Module):
""" An abstract class to handle weights initialization and
a simple interface for dowloading and loading pretrained models.
a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super(BertPreTrainedModel, self).__init__()
@ -951,7 +951,7 @@ class BertPreTrainedModel(nn.Module):
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
@ -1072,7 +1072,7 @@ class BertModel(BertPreTrainedModel):
`output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
Outputs: Tuple of (encoded_layers, pooled_output)
`encoded_layers`: controled by `output_all_encoded_layers` argument:
`encoded_layers`: controlled by `output_all_encoded_layers` argument:
- `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],


@ -165,7 +165,7 @@ def test_ckpt_inputs1_outputs1():
_test_activation_checkpoint(module, inputs)
# both bool and float are important, as bool is not diffentiable
# both bool and float are important, as bool is not differentiable
@pytest.mark.parametrize('mask',
[
_mixed_mask(),


@ -30,7 +30,7 @@ def test_missing_amp_autocast(tmpdir, half_op):
@pytest.mark.parametrize('half_op', [False, True])
def test_disable_autocast_linear(tmpdir, half_op):
if _skip_autocast_test():
pytest.skip("amp autocast is not availalbe")
pytest.skip("amp autocast is not available")
hidden_dim = 4
if half_op:
@ -56,7 +56,7 @@ def test_disable_autocast_linear(tmpdir, half_op):
True)])
def test_autocast_linear(tmpdir, half_input, half_weight):
if _skip_autocast_test():
pytest.skip("amp autocast is not availalbe")
pytest.skip("amp autocast is not available")
hidden_dim = 4
input = torch.randn(hidden_dim).cuda()


@ -20,7 +20,7 @@ def check_equal(first, second, atol=1e-2, verbose=False):
print("x = {}".format(x.flatten()))
print("y = {}".format(y.flatten()))
print('-' * 80)
np.testing.assert_allclose(x, y, err_msg="param-update dismatch!", atol=atol)
np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol)
@pytest.mark.parametrize('model_size',
[


@ -61,16 +61,16 @@ def check_equal(first, second, atol=1e-2, verbose=False):
for i in range(len(x.shape) - 1):
countx *= x.shape[i + 1]
avgx = np.sum(avgx)
tollerance = 1
tolerance = 1
if avgx != float('inf') and avgx != -float('inf'):
avgx = avgx / countx
tollerance = avgx * atol
tolerance = avgx * atol
if verbose:
print("tollerance is ", tollerance)
print("tolerance is ", tolerance)
print("x = {}".format(x.flatten()))
print("y = {}".format(y.flatten()))
print('-' * 80)
np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=tollerance)
np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=tolerance)
def zero_grad(variables):


@ -13,7 +13,7 @@ def test_init():
assert dist.get_rank() < 3
# Demonstration of pytest's paramaterization
# Demonstration of pytest's parameterization
@pytest.mark.parametrize('number,color', [(1138, 'purple')])
def test_dist_args(number, color):
"""Outer test function with inputs from pytest.mark.parametrize(). Uses a distributed


@ -6,7 +6,7 @@ from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_f
from common import distributed_test
def test_flops_profiler_in_ds_trainning(tmpdir):
def test_flops_profiler_in_ds_training(tmpdir):
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,
@ -34,7 +34,7 @@ def test_flops_profiler_in_ds_trainning(tmpdir):
model = SimpleModel(hidden_dim, empty_grad=False)
@distributed_test(world_size=[1])
def _test_flops_profiler_in_ds_trainning(args, model, hidden_dim):
def _test_flops_profiler_in_ds_training(args, model, hidden_dim):
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())
@ -52,7 +52,7 @@ def test_flops_profiler_in_ds_trainning(tmpdir):
assert model.flops_profiler.flops == 100
assert model.flops_profiler.params == 110
_test_flops_profiler_in_ds_trainning(args, model, hidden_dim)
_test_flops_profiler_in_ds_training(args, model, hidden_dim)
class LeNet5(torch.nn.Module):