Mirror of https://github.com/microsoft/DeepSpeed.git
Fix many typos (#1423)
* Fix typos in docs/
* Fix typos in code comments and output strings
* Fix typos in the code itself
* Fix typos in tests/

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
This commit is contained in:
Parent
30965ea734
Commit
be789b1665
@@ -196,7 +196,7 @@ bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
 {
     const auto op_string = read_op ? "Read" : "Write";
     if (num_bytes % get_thread_count()) {
-        std::cout << "deepseed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
+        std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
                   << " not divisible by thread count = " << get_thread_count() << std::endl;
         return false;
     }
@@ -130,7 +130,7 @@ def _aio_handle_tasklet(pool_params):
     return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops


-def _init_takslet(b):
+def _init_tasklet(b):
     global aio_barrier
     aio_barrier = b

@@ -138,7 +138,7 @@ def _init_takslet(b):
 def aio_basic_multiprocessing(args, read_op):
     b = Barrier(args.threads)
     pool_params = [(args, p, read_op) for p in range(args.threads)]
-    with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p:
+    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
         pool_results = p.map(_aio_handle_tasklet, pool_params)

     report_results(args, read_op, pool_results)
@@ -162,7 +162,7 @@ def _aio_handle_tasklet(pool_params):
     return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops


-def _init_takslet(b):
+def _init_tasklet(b):
     global aio_barrier
     aio_barrier = b

@@ -170,7 +170,7 @@ def _init_takslet(b):
 def aio_handle_multiprocessing(args, read_op):
     b = Barrier(args.threads)
     pool_params = [(args, p, read_op) for p in range(args.threads)]
-    with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p:
+    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
         pool_results = p.map(_aio_handle_tasklet, pool_params)

     report_results(args, read_op, pool_results)
@@ -5,7 +5,7 @@ if [[ $# -ne 2 ]]; then
 fi


-function validate_enviroment()
+function validate_environment()
 {
     validate_cmd="python ./validate_async_io.py"
     eval ${validate_cmd}
@@ -18,7 +18,7 @@ function validate_enviroment()
 }


-validate_enviroment
+validate_environment

 INPUT_FILE=$1
 if [[ ! -f ${INPUT_FILE} ]]; then
@@ -9,7 +9,7 @@ function prep_folder()
     fi
 }

-function validate_enviroment()
+function validate_environment()
 {
     validate_cmd="python ./validate_async_io.py"
     eval ${validate_cmd}
@@ -23,7 +23,7 @@ function validate_enviroment()



-validate_enviroment
+validate_environment

 if [[ $# -ne 3 ]]; then
     echo "Usage: $0 <write size in MB> <write dir ><output log dir>"
@@ -27,25 +27,25 @@
 #define MAX_REG 256

 template <typename T>
-void launch_qunatize_kernel(T* vals,
+void launch_quantize_kernel(T* vals,
                             int total_count,
                             int group_num,
                             int num_bits,
                             cudaStream_t stream);
 template <typename T>
-void launch_sr_qunatize_kernel(T* vals,
+void launch_sr_quantize_kernel(T* vals,
                                int total_count,
                                int group_num,
                                int num_bits,
                                cudaStream_t stream);
 template <typename T>
-void launch_qunatize_kernel_asym(T* vals,
+void launch_quantize_kernel_asym(T* vals,
                                  int total_count,
                                  int group_num,
                                  int num_bits,
                                  cudaStream_t stream);
 template <typename T>
-void launch_sr_qunatize_kernel_asym(T* vals,
+void launch_sr_quantize_kernel_asym(T* vals,
                                     int total_count,
                                     int group_num,
                                     int num_bits,
@@ -179,6 +179,6 @@ private:
     bool _normalize_invertible;
     bool _gelu_checkpoint;

-    // High Performace flags
+    // High Performance flags
     bool _stochastic_mode;
 };
@@ -17,14 +17,14 @@ public:
     size_t heads;
     size_t seq_length;
     size_t prob_depth;
-    float temprature;
+    float temperature;
     bool mem_alloc;
     Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false)
         : batchSize(batch),
           heads(h),
           seq_length(seq),
           prob_depth(prob_size),
-          temprature(1.0),
+          temperature(1.0),
           mem_alloc(mem_alloc)
     {
     }
@@ -1,7 +1,7 @@
 /* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */
 #include <ATen/ATen.h>

-// Forward/backward compatiblity hack around
+// Forward/backward compatibility hack around
 // https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
 // pending more future-proof guidance from upstream.
 // struct TypeShim
@@ -11,7 +11,7 @@ at::Tensor ds_quantize(at::Tensor& vals, int groups, int bits)
     for (auto dim : t_size) size *= dim;

     if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) {
-        launch_qunatize_kernel(
+        launch_quantize_kernel(
             (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
     }
     return vals;
@@ -25,7 +25,7 @@ at::Tensor ds_sr_quantize(at::Tensor& vals, int groups, int bits)
     for (auto dim : t_size) size *= dim;

     if (((size / groups) / 4 / 1024) <= 256) {
-        launch_sr_qunatize_kernel(
+        launch_sr_quantize_kernel(
             (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
     }
     return vals;
@@ -39,7 +39,7 @@ at::Tensor ds_quantize_asym(at::Tensor& vals, int groups, int bits)
     for (auto dim : t_size) size *= dim;

     if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) {
-        launch_qunatize_kernel_asym(
+        launch_quantize_kernel_asym(
             (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
     }
     return vals;
@@ -53,7 +53,7 @@ at::Tensor ds_sr_quantize_asym(at::Tensor& vals, int groups, int bits)
     for (auto dim : t_size) size *= dim;

     if (((size / groups) / 4 / 1024) <= 256) {
-        launch_sr_qunatize_kernel_asym(
+        launch_sr_quantize_kernel_asym(
             (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
     }
     return vals;
@@ -3,7 +3,7 @@

 namespace cg = cooperative_groups;

-__global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
+__global__ void quantize_kernel(__half* vals, int group_size, int num_bits)
 {
 #if __CUDA_ARCH__ >= 700

@@ -93,7 +93,7 @@ __global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
 #endif
 }

-__global__ void qunatize_kernel(float* vals, int group_size, int num_bits)
+__global__ void quantize_kernel(float* vals, int group_size, int num_bits)
 {
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -176,7 +176,7 @@ __global__ void qunatize_kernel(float* vals, int group_size, int num_bits)
 }

 template <typename T>
-void launch_qunatize_kernel(T* vals,
+void launch_quantize_kernel(T* vals,
                             int total_count,
                             int group_num,
                             int num_bits,
@@ -185,22 +185,22 @@ void launch_qunatize_kernel(T* vals,
     dim3 grid_dim(group_num);
     dim3 block_dim(1024);

-    qunatize_kernel<<<grid_dim, block_dim, 0, stream>>>(
+    quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
         vals, (total_count / group_num) / 4, num_bits);
 }

-template void launch_qunatize_kernel(float* vals,
+template void launch_quantize_kernel(float* vals,
                                      int total_count,
                                      int group_num,
                                      int num_bits,
                                      cudaStream_t stream);
-template void launch_qunatize_kernel(__half* vals,
+template void launch_quantize_kernel(__half* vals,
                                      int total_count,
                                      int group_num,
                                      int num_bits,
                                      cudaStream_t stream);

-__global__ void sr_qunatize_kernel(__half* vals,
+__global__ void sr_quantize_kernel(__half* vals,
                                    int token_size,
                                    int token_num,
                                    int num_bits,
@@ -336,7 +336,7 @@ __global__ void sr_qunatize_kernel(__half* vals,
 #endif
 }

-__global__ void sr_qunatize_kernel(float* vals,
+__global__ void sr_quantize_kernel(float* vals,
                                    int token_size,
                                    int token_num,
                                    int num_bits,
@@ -456,7 +456,7 @@ __global__ void sr_qunatize_kernel(float* vals,
 }

 template <typename T>
-void launch_sr_qunatize_kernel(T* vals,
+void launch_sr_quantize_kernel(T* vals,
                                int total_count,
                                int group_num,
                                int num_bits,
@@ -468,21 +468,21 @@ void launch_sr_qunatize_kernel(T* vals,
     uint64_t inc = total_count / grid_dim.x / block_dim.x;
     std::pair<uint64_t, uint64_t> seed = Context::Instance().IncrementOffset(inc);

-    sr_qunatize_kernel<<<grid_dim, block_dim, 0, stream>>>(
+    sr_quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
         vals, (total_count / group_num) / 4, group_num, num_bits, seed);
 }
-template void launch_sr_qunatize_kernel(float* vals,
+template void launch_sr_quantize_kernel(float* vals,
                                         int total_count,
                                         int group_num,
                                         int num_bits,
                                         cudaStream_t stream);
-template void launch_sr_qunatize_kernel(__half* vals,
+template void launch_sr_quantize_kernel(__half* vals,
                                         int total_count,
                                         int group_num,
                                         int num_bits,
                                         cudaStream_t stream);

-__global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
+__global__ void quantize_kernel_asym(__half* vals, int group_size, int num_bits)
 {
 #if __CUDA_ARCH__ >= 700

@@ -595,7 +595,7 @@ __global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
 #endif
 }

-__global__ void qunatize_kernel_asym(float* vals, int group_size, int num_bits)
+__global__ void quantize_kernel_asym(float* vals, int group_size, int num_bits)
 {
     cg::thread_block b = cg::this_thread_block();
     cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -699,7 +699,7 @@ __global__ void qunatize_kernel_asym(float* vals, int group_size, int num_bits)
 }

 template <typename T>
-void launch_qunatize_kernel_asym(T* vals,
+void launch_quantize_kernel_asym(T* vals,
                                  int total_count,
                                  int group_num,
                                  int num_bits,
@@ -708,22 +708,22 @@ void launch_qunatize_kernel_asym(T* vals,
     dim3 grid_dim(group_num);
     dim3 block_dim(1024);

-    qunatize_kernel_asym<<<grid_dim, block_dim, 0, stream>>>(
+    quantize_kernel_asym<<<grid_dim, block_dim, 0, stream>>>(
         vals, (total_count / group_num) / 4, num_bits);
 }

-template void launch_qunatize_kernel_asym(float* vals,
+template void launch_quantize_kernel_asym(float* vals,
                                           int total_count,
                                           int group_num,
                                           int num_bits,
                                           cudaStream_t stream);
-template void launch_qunatize_kernel_asym(__half* vals,
+template void launch_quantize_kernel_asym(__half* vals,
                                           int total_count,
                                           int group_num,
                                           int num_bits,
                                           cudaStream_t stream);

-__global__ void sr_qunatize_kernel_asym(__half* vals,
+__global__ void sr_quantize_kernel_asym(__half* vals,
                                         int token_size,
                                         int token_num,
                                         int num_bits,
@@ -879,7 +879,7 @@ __global__ void sr_qunatize_kernel_asym(__half* vals,
 #endif
 }

-__global__ void sr_qunatize_kernel_asym(float* vals,
+__global__ void sr_quantize_kernel_asym(float* vals,
                                         int token_size,
                                         int token_num,
                                         int num_bits,
@@ -1010,7 +1010,7 @@ __global__ void sr_qunatize_kernel_asym(float* vals,
     }
 }
 template <typename T>
-void launch_sr_qunatize_kernel_asym(T* vals,
+void launch_sr_quantize_kernel_asym(T* vals,
                                     int total_count,
                                     int group_num,
                                     int num_bits,
@@ -1022,15 +1022,15 @@ void launch_sr_qunatize_kernel_asym(T* vals,
     uint64_t inc = total_count / grid_dim.x / block_dim.x;
     std::pair<uint64_t, uint64_t> seed = Context::Instance().IncrementOffset(inc);

-    sr_qunatize_kernel<<<grid_dim, block_dim, 0, stream>>>(
+    sr_quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
         vals, (total_count / group_num) / 4, group_num, num_bits, seed);
 }
-template void launch_sr_qunatize_kernel_asym(float* vals,
+template void launch_sr_quantize_kernel_asym(float* vals,
                                              int total_count,
                                              int group_num,
                                              int num_bits,
                                              cudaStream_t stream);
-template void launch_sr_qunatize_kernel_asym(__half* vals,
+template void launch_sr_quantize_kernel_asym(__half* vals,
                                              int total_count,
                                              int group_num,
                                              int num_bits,
@@ -102,7 +102,7 @@ class InferenceEngine(Module):
             self.mp_group = InferenceEngine.inference_mp_group

     def _check_quantize_setting(self, quantization_setting):
-        self.quatize_bits = 8
+        self.quantize_bits = 8
         self.mlp_extra_grouping = False
         self.quantize_groups = 1
         if quantization_setting is None:
@@ -177,7 +177,7 @@ class InferenceEngine(Module):
             quantizer = WeightQuantization(mlp_extra_grouping=self.mlp_extra_grouping)
             model, self.quantization_scales = quantizer.model_quantize(self.module,
                                                                        self.injection_dict,
-                                                                       self.quatize_bits,
+                                                                       self.quantize_bits,
                                                                        self.quantize_groups)
         elif self.dtype == torch.half:
             self.module.half()
@@ -1,7 +1,7 @@
 # Copyright 2020 The Microsoft DeepSpeed Team
 """
 DeepSpeed launcher, this is similar to torch.distributed.launch but supports
-additional features such as abitrary gpu exclusion.
+additional features such as arbitrary gpu exclusion.

 deepspeed.launcher.launch is intended to be run on a single worker node and
 will spawn several worker sub-processes depending on how many devices/ranks
@@ -2,7 +2,7 @@
 """
 DeepSpeed runner is the main front-end to launching multi-worker
 training jobs with DeepSpeed. By default this uses pdsh to parallel
-ssh into multiple worker nodes and launch all the neccisary processes
+ssh into multiple worker nodes and launch all the necessary processes
 per rank for training.
 """

@@ -19,7 +19,7 @@ class ReplaceWithTensorSlicing:
         assert dim1 > dim2, \
             'Merging tensors is not allowed here! Please use deepspeed load_checkpoint\
             for merging your checkpoints before replacing the transformer layer with\
-            inference-kerenls'
+            inference-kernels'

     def qkv_copy(self, dst, src):
         if src is None:
@@ -114,9 +114,9 @@ def replace_transformer_layer(orig_layer_impl,
         seed (int): random seed value
         max_seq_length (int): max sequence length for training
         hidden_size (int): hidden dimension
-        num_attention_heads (int): numebr of attention heads
+        num_attention_heads (int): number of attention heads
         mp_size (int): model_parallelism degree
-        mp_group : model_parallel gropu initialized on the modeling side
+        mp_group : model_parallel group initialized on the modeling side
         preln (bool): does the original layer implementation do pre or post layer norm?
         fp16 (bool): fp16 or fp32
         local_rank (int): GPU rank (optional),
@@ -401,7 +401,7 @@ def replace_module(model, orig_class, replace_fn, _replace_policy):
     if plcy._orig_layer_class is not None:
         policy.update({plcy._orig_layer_class: (replace_fn, plcy)})
     assert len(policy.items()) > 0,\
-        "No default policy found! Please specifiy your policy injection_policy (like {BertLayer:HFBEertLayerPolicy})." +\
+        "No default policy found! Please specify your policy injection_policy (like {BertLayer:HFBEertLayerPolicy})." +\
        "You can find some samples here: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py"

     replaced_module, _ = _replace_module(model, policy)
@@ -33,7 +33,7 @@ exp_selection_uniform_map: Dict[torch.device, Callable] = {}

 def multiplicative_jitter(x, device: torch.device, epsilon=1e-2):
     """
-    Modified from swtich transformer paper. mesh transformers
+    Modified from switch transformer paper. mesh transformers
     Multiply values by a random number between 1-epsilon and 1+epsilon.
     Makes models more resilient to rounding errors introduced by bfloat16.
     This seems particularly important for logits.
@@ -147,7 +147,7 @@ def top1gating(logits: torch.Tensor,

     mask1_rand = mask1 * uniform(mask1.shape)

-    assert logits.shape[0] >= min_capacity, "No. of tokens (batch-size) should be greater than min_capacity. Either set min_capacity to 0 or inrease your batch size."
+    assert logits.shape[0] >= min_capacity, "No. of tokens (batch-size) should be greater than min_capacity. Either set min_capacity to 0 or increase your batch size."

     _, top_idx = torch.topk(mask1_rand, k=capacity, dim=0)

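As a quick illustration of the jitter described in the docstring above (multiply values by a random factor between 1-epsilon and 1+epsilon), here is a minimal standalone sketch; it is not the DeepSpeed implementation, and the tensor shapes are made up for the example.

```python
import torch

def multiplicative_jitter_sketch(x: torch.Tensor, epsilon: float = 1e-2) -> torch.Tensor:
    # Scale every element by a factor drawn uniformly from [1 - epsilon, 1 + epsilon],
    # which makes logits more tolerant of bfloat16 rounding, as the docstring notes.
    if epsilon == 0:
        return x
    noise = torch.empty_like(x).uniform_(1.0 - epsilon, 1.0 + epsilon)
    return x * noise

logits = torch.randn(8, 4)  # (tokens, experts) -- illustrative shape only
jittered = multiplicative_jitter_sketch(logits)
```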
@@ -33,9 +33,9 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer):
     In order to apply this optimizer, the model requires to have its master parameter (in FP32)
     reside on the CPU memory.

-    To train on a hetrogeneous system, such as coordinating CPU and GPU, DeepSpeed offers
+    To train on a heterogeneous system, such as coordinating CPU and GPU, DeepSpeed offers
     the ZeRO-Offload technology which efficiently offloads the optimizer states into CPU memory,
-    with minimal impact on training througput. DeepSpeedCPUAdam plays an important role to minimize
+    with minimal impact on training throughput. DeepSpeedCPUAdam plays an important role to minimize
     the overhead of the optimizer's latency on CPU. Please refer to ZeRO-Offload tutorial
     (https://www.deepspeed.ai/tutorials/zero-offload/) for more information on how to enable this technology.

@@ -24,7 +24,7 @@ class BertSparseSelfAttention(nn.Module):

         Arguments:
             config: required: Bert model config
-            sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on FixedSparsityConfig class.
+            sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on FixedSparsityConfig class.
         """

         super(BertSparseSelfAttention, self).__init__()
@@ -53,11 +53,11 @@ class BertSparseSelfAttention(nn.Module):
         """Applies forward phase of bert sparse self attention

         Arguments:
-            hidden_states: required: hidde_states tensor of the bert model
+            hidden_states: required: hidden_states tensor of the bert model
             attn_mask: required: a mask tensor of size (SequenceLength X SequenceLength); currently only 2D is supported

         Return:
-            context_layer: a dense tensor containing attnetion context
+            context_layer: a dense tensor containing attention context
         """
         mixed_query_layer = self.query(hidden_states)
         mixed_key_layer = self.key(hidden_states)
@@ -94,10 +94,10 @@ class SparseAttentionUtils:
         Arguments:
             model: required: a transformer model
             max_position: required: an integer determining new position embedding size
-            sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class
+            sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on SparsityConfig class

         Return:
-            model: updated model; in which self attention layer has been repleaced with DeepSpeed Sparse Self Attention layer.
+            model: updated model; in which self attention layer has been replaced with DeepSpeed Sparse Self Attention layer.
         """

         if hasattr(model, 'bert'):
@@ -131,10 +131,10 @@ class SparseAttentionUtils:
         Arguments:
             config: required: transformer model config
             layers: required: transformer model attention layers
-            sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class
+            sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on SparsityConfig class

         Return:
-            layers: updated attention layers; in which self attention layers have been repleaced with DeepSpeed Sparse Self Attention layer.
+            layers: updated attention layers; in which self attention layers have been replaced with DeepSpeed Sparse Self Attention layer.
         """

         for layer in layers:
@@ -161,7 +161,7 @@ class SparseAttentionUtils:
         It needs to be called in your model, such as BertModel, right before you calculate the embedding outputs.
         Note)
         1- instead of passing your embedding layer to this function, you can simply add this function to your model. It can be more simplified if given attention_mask and/or token_type_ids are none.
-        2- you need to call unpdad function before returning your model output to unpad the encoder sequence output.
+        2- you need to call unpad function before returning your model output to unpad the encoder sequence output.

         Arguments:
             block_size: required: an integer determining the block size of sparsity config.
@@ -26,7 +26,7 @@ class SparseSelfAttention(nn.Module):
                  max_seq_length=2048):
         """Initialize the sparse self attention layer.
         Arguments:
-            sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class.
+            sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on SparsityConfig class.
             key_padding_mask_mode: optional: a string determining if key padding mask needs to be added, `add`, or be multiplied, `mul`.
             attn_mask_mode: optional: a string determining if attention mask needs to be added, `add`, or be multiplied, `mul`.
             max_seq_length: optional: the maximum sequence length this sparse attention module will be applied to; it controls the size of the master_layout.
@@ -121,7 +121,7 @@ class SparseSelfAttention(nn.Module):
             attn_mask_mode: optional: a boolean determining if attn_mask needs to be added or multiplied

         Return:
-            attn_output: a dense tensor containing attnetion context
+            attn_output: a dense tensor containing attention context
         """
         assert query.dtype == torch.half, "sparse attention only supports training in fp16 currently, please file a github issue if you need fp32 support"
         bsz, num_heads, tgt_len, head_dim = query.size()
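Pulling the `SparseSelfAttention` docstrings above together, a rough usage sketch follows. The import path, the `FixedSparsityConfig(num_heads=...)` constructor, and the `forward(query, key, value)` call are assumptions based on this era of the module and should be checked against the installed version; running it requires a CUDA device with the sparse-attention (triton) kernels available and fp16 inputs, per the assertion shown in the diff.

```python
import torch
from deepspeed.ops.sparse_attention import FixedSparsityConfig, SparseSelfAttention

# Assumed constructor: a fixed sparsity pattern for 4 heads.
sparsity_config = FixedSparsityConfig(num_heads=4)
attn = SparseSelfAttention(sparsity_config=sparsity_config, max_seq_length=2048).cuda()

# Inputs of shape (bsz, num_heads, tgt_len, head_dim), fp16 as required by the assert above.
bsz, num_heads, tgt_len, head_dim = 1, 4, 256, 64
q = torch.randn(bsz, num_heads, tgt_len, head_dim, dtype=torch.half, device="cuda")
k = torch.randn_like(q)
v = torch.randn_like(q)

context = attn(q, k, v)  # dense tensor containing the attention context
```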
@@ -49,7 +49,7 @@ class SparsityConfig:
         """If all heads require same sparsity layout, it propagate first head layout to all heads

         Arguments:
-             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
+             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step

         Return:
             layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head
@@ -152,11 +152,11 @@ class FixedSparsityConfig(SparsityConfig):
         self.num_different_global_patterns = num_different_global_patterns

     def set_local_layout(self, h, layout):
-        """Sets local attantion layout used by the given head in the sparse attention.
+        """Sets local attention layout used by the given head in the sparse attention.

         Arguments:
             h: required: an integer determining head index
-             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
+             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step

         Return:
             layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local layout is set
@@ -173,14 +173,14 @@ class FixedSparsityConfig(SparsityConfig):
         return layout

     def set_global_layout(self, h, layout):
-        """Sets global attantion layout used by the given head in the sparse attention.
+        """Sets global attention layout used by the given head in the sparse attention.

         Currently we set global blocks starting from the last block of a local window to the first one. That means if a local window consists of 4 blocks and global attention size is one block, we use block #4 in each local window as global. If we have different layout per head, then other heads will get #3, #2, and #1. And if we have more heads (and different layout has set) than num of global attentions, multiple head may have same global attentions.
         Note) if horizontal_global_attention is set, global blocks will be set both horizontally and vertically.

         Arguments:
             h: required: an integer determining head index
-             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
+             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step

         Return:
             layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set
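To make the global-layout convention in the docstring above concrete, here is a small illustrative sketch (not DeepSpeed code). With a local window of 4 blocks and one global block per window, the last block of each window is marked global: its column is set so every block can attend to it and, when horizontal global attention is on, its row is set as well, mirroring the `layout[h, :, idx] = 1` / `layout[h, idx, :] = 1` pattern in the hunks below.

```python
import torch

num_heads, num_blocks, num_local_blocks = 2, 8, 4
layout = torch.zeros(num_heads, num_blocks, num_blocks, dtype=torch.int64)

h = 0
horizontal_global_attention = True
for window_start in range(0, num_blocks, num_local_blocks):
    global_idx = window_start + num_local_blocks - 1  # last block of the local window
    layout[h, :, global_idx] = 1      # every block attends to the global block (column)
    if horizontal_global_attention:
        layout[h, global_idx, :] = 1  # the global block attends to every block (row)
```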
@@ -307,12 +307,12 @@ class VariableSparsityConfig(SparsityConfig):
         self.horizontal_global_attention = horizontal_global_attention

     def set_random_layout(self, h, layout):
-        """Sets random attantion layout used by the given head in the sparse attention.
+        """Sets random attention layout used by the given head in the sparse attention.
         Note) By default, it assumes there will be a unique random block layout for all heads; unless `different_layout_per_head` parameter is set in which each head can have a different random layout.

         Arguments:
             h: required: an integer determining head index
-             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
+             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step

         Return:
             layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which random layout is set
@@ -321,7 +321,7 @@ class VariableSparsityConfig(SparsityConfig):
         num_blocks = layout.shape[1]
         if (num_blocks < self.num_random_blocks):
             raise ValueError(
-                f'Number of random blocks, {self.num_random_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!'
+                f'Number of random blocks, {self.num_random_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!'
             )
         for row in range(0, num_blocks):
             rnd_cols = random.sample(range(0, num_blocks), self.num_random_blocks)
@@ -329,10 +329,10 @@ class VariableSparsityConfig(SparsityConfig):
         return layout

     def set_local_layout(self, h, layout):
-        """Sets local attantion layout used by the given head in the sparse attention.
+        """Sets local attention layout used by the given head in the sparse attention.
         Arguments:
             h: required: an integer determining head index
-             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
+             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step

         Return:
             layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local layout is set
@@ -362,11 +362,11 @@ class VariableSparsityConfig(SparsityConfig):
         return layout

     def set_global_layout(self, h, layout):
-        """Sets global attantion layout used by the given head in the sparse attention.
+        """Sets global attention layout used by the given head in the sparse attention.

         Arguments:
             h: required: an integer determining head index
-             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
+             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step

         Return:
             layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set
@@ -375,7 +375,7 @@ class VariableSparsityConfig(SparsityConfig):
         num_blocks = layout.shape[1]
         if (self.global_block_end_indices is None):
             for idx in self.global_block_indices:
-                # if global block idx is in the range of the sequnce blocks
+                # if global block idx is in the range of the sequence blocks
                 if (idx < num_blocks):
                     #global rows
                     if (self.horizontal_global_attention):
@@ -386,7 +386,7 @@ class VariableSparsityConfig(SparsityConfig):
                     layout[h, first_row:, idx] = 1
         else:
             for _, (start_idx, end_idx) in enumerate(zip(self.global_block_indices, self.global_block_end_indices)):
-                # if global block idx is in the range of the sequnce blocks
+                # if global block idx is in the range of the sequence blocks
                 if (start_idx < num_blocks):
                     end_idx = min(end_idx, num_blocks)
                     #global rows
@@ -450,12 +450,12 @@ class BigBirdSparsityConfig(SparsityConfig):
         self.num_global_blocks = num_global_blocks

     def set_random_layout(self, h, layout):
-        """Sets random attantion layout used by the given head in the sparse attention.
+        """Sets random attention layout used by the given head in the sparse attention.
         Note) By default, it assumes there will be a unique random block layout for all heads; unless `different_layout_per_head` parameter is set in which each head can have a different random layout.

         Arguments:
             h: required: an integer determining head index
-             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
+             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step

         Return:
             layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which random layout is set
@@ -464,7 +464,7 @@ class BigBirdSparsityConfig(SparsityConfig):
         num_blocks = layout.shape[1]
         if (num_blocks < self.num_random_blocks):
             raise ValueError(
-                f'Number of random blocks, {self.num_random_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!'
+                f'Number of random blocks, {self.num_random_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!'
             )

         for row in range(0, num_blocks):
@@ -473,11 +473,11 @@ class BigBirdSparsityConfig(SparsityConfig):
         return layout

     def set_sliding_window_layout(self, h, layout):
-        """Sets sliding local attantion layout used by the given head in the sparse attention.
+        """Sets sliding local attention layout used by the given head in the sparse attention.

         Arguments:
             h: required: an integer determining head index
-             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
+             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step

         Return:
             layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local sliding window layout is set
@@ -486,7 +486,7 @@ class BigBirdSparsityConfig(SparsityConfig):
         num_blocks = layout.shape[1]
         if (num_blocks < self.num_sliding_window_blocks):
             raise ValueError(
-                f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!'
+                f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!'
             )

         w = self.num_sliding_window_blocks // 2
@@ -497,11 +497,11 @@ class BigBirdSparsityConfig(SparsityConfig):
         return layout

     def set_global_layout_itc(self, h, layout):
-        """Sets global attantion layout used by the given head in the sparse attention.
+        """Sets global attention layout used by the given head in the sparse attention.

         Arguments:
             h: required: an integer determining head index
-             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
+             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step

         Return:
             layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set
@@ -510,7 +510,7 @@ class BigBirdSparsityConfig(SparsityConfig):
         num_blocks = layout.shape[1]
         if (num_blocks < self.num_global_blocks):
             raise ValueError(
-                f'Number of global blocks, {self.num_global_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!'
+                f'Number of global blocks, {self.num_global_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!'
             )

         #global rows
@@ -588,11 +588,11 @@ class BSLongformerSparsityConfig(SparsityConfig):
         self.global_block_end_indices = global_block_end_indices

     def set_sliding_window_layout(self, h, layout):
-        """Sets sliding local attantion layout used by the given head in the sparse attention.
+        """Sets sliding local attention layout used by the given head in the sparse attention.

         Arguments:
             h: required: an integer determining head index
-             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
+             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step

         Return:
             layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local sliding window layout is set
@@ -601,7 +601,7 @@ class BSLongformerSparsityConfig(SparsityConfig):
         num_blocks = layout.shape[1]
         if (num_blocks < self.num_sliding_window_blocks):
             raise ValueError(
-                f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!'
+                f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!'
             )

         w = self.num_sliding_window_blocks // 2
@@ -612,11 +612,11 @@ class BSLongformerSparsityConfig(SparsityConfig):
         return layout

     def set_global_layout(self, h, layout):
-        """Sets global attantion layout used by the given head in the sparse attention.
+        """Sets global attention layout used by the given head in the sparse attention.

         Arguments:
             h: required: an integer determining head index
-             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step
+             layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step

         Return:
             layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set
@@ -625,7 +625,7 @@ class BSLongformerSparsityConfig(SparsityConfig):
         num_blocks = layout.shape[1]
         if (self.global_block_end_indices is None):
             for idx in self.global_block_indices:
-                # if global block idx is in the range of the sequnce blocks
+                # if global block idx is in the range of the sequence blocks
                 if (idx < num_blocks):
                     #global rows
                     layout[h, idx, :] = 1
@@ -634,7 +634,7 @@ class BSLongformerSparsityConfig(SparsityConfig):
                     layout[h, :, idx] = 1
         else:
             for _, (start_idx, end_idx) in enumerate(zip(self.global_block_indices, self.global_block_end_indices)):
-                # if global block idx is in the range of the sequnce blocks
+                # if global block idx is in the range of the sequence blocks
                 if (start_idx < num_blocks):
                     end_idx = min(end_idx, num_blocks)
                     #global rows
@@ -525,7 +525,7 @@ class DeepSpeedTransformerInference(nn.Module):

         config: An object of DeepSpeedInferenceConfig
         mp_group: Model parallelism group initialized on the modeling side.
-        quantize_scales: This arguement groups all the layers' scales used for quantization
+        quantize_scales: This argument groups all the layers' scales used for quantization
         quantize_groups: Number of groups used for quantizing the model
         merge_count: Shows the number of model-parallel checkpoints merged before running inference.
             We use this argument to control the quantization scale for the model parameters if a bigger
@@ -88,7 +88,7 @@ class DeepSpeedTransformerConfig(TransformerConfig):
             a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend
             to turn it off in order to be able to reproduce the same result through the regular kernel execution.

-        huggingface: Enbale if using the HuggingFace interface style for sending out the forward results.
+        huggingface: Enable if using the HuggingFace interface style for sending out the forward results.

         training: Enable for training rather than inference.
     """
@@ -135,14 +135,14 @@ The DeepSpeed Flops Profiler can be used with the DeepSpeed runtime without any

 ## Flops Measurement

-Similar to exsiting flops calculation tools or methods, the DeepSpeed Flops Profiler measures the flops of the forward pass of a module and the flops of the backward pass is estimated as `2` times of that of the forward pass.
-Different from the PyTorch profiler which calculates the flops of PyTorch operators, the DeepSpeed Flops Profiler measures the flops witin modules in a model and provides more insights to the users about the model execution.
+Similar to existing flops calculation tools or methods, the DeepSpeed Flops Profiler measures the flops of the forward pass of a module and the flops of the backward pass is estimated as `2` times of that of the forward pass.
+Different from the PyTorch profiler which calculates the flops of PyTorch operators, the DeepSpeed Flops Profiler measures the flops within modules in a model and provides more insights to the users about the model execution.
 The flops estimation is partly inspired by [ptflops](https://github.com/sovrasov/flops-counter.pytorch) with the major difference being that the DeepSpeed Flops Profiler not only supports flops computation directly at module level, but can also capture ```torch.nn.functional``` invoked in a module to estimate the flops.
 Thus the DeepSpeed Flops Profiler allows for customized modules in the model, e.g., ```ParallelTransformerLayerworks, ParallelSelfAttention, RowParallelLinear, etc.``` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM). This is in contrast to ptflops which requires users to write customized flops calculation functions for each customized module.

 ## Multi-GPU, Multi-node, Data Parallelism, and Model Parallelism

-The DeepSpeed Flops Profiler outputs the per GPU profile as well as the world size, data parallel size, and model paralel size. 1
+The DeepSpeed Flops Profiler outputs the per GPU profile as well as the world size, data parallel size, and model parallel size. 1
 For models running on multi-GPU or multi-node, only change of the model parallelism (e.g. ```--model-parallel-size``` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)) affects the number of flops and parameters profiled, i.e.,
 `model_parallel_size * flops = total_flops` and `model_parallel_size * parameters = total_parameters`. The data parallel size or world size (related to the number of GPUs or nodes) does not affect the per GPU profile.

@@ -372,7 +372,7 @@ with torch.cuda.device(0):
 #### In Model Training Workflow

 To profile model forward in a training workflow, use the `FlopsProfiler`class.
-The `FlopsProfiler`class provides the follwing methods:
+The `FlopsProfiler`class provides the following methods:
 * `start_profile()` - starts profiling
 * `get_total_flops(as_string=False)` - returns the total number of MACs in the model
 * `get_total_params(as_string=False)` - returns the total number of parameters in the model
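A short sketch of the training-workflow usage the README describes, built around the three methods listed above. The import path, the `FlopsProfiler(model)` constructor, and the `end_profile()` cleanup call are assumed from the profiler API of this era and may differ in other releases.

```python
import torch
from deepspeed.profiling.flops_profiler import FlopsProfiler

model = torch.nn.Linear(1024, 1024)
prof = FlopsProfiler(model)  # assumed constructor: wraps the model to be profiled

profile_step = 5  # profile one warmed-up step rather than the first iteration
for step in range(10):
    x = torch.randn(8, 1024)
    if step == profile_step:
        prof.start_profile()                             # documented above
    y = model(x)
    if step == profile_step:
        flops = prof.get_total_flops(as_string=True)     # documented above
        params = prof.get_total_params(as_string=True)   # documented above
        prof.end_profile()                               # assumed cleanup call
        print(flops, params)
    y.sum().backward()
```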
@@ -241,13 +241,13 @@ class FlopsProfiler(object):
             )
         print(f'Profile Summary at step {profile_step}:')
         print(
-            "Notations:\ndata parallel size (dp_size), model paralel size(mp_size),\nnumber of parameters (params), number of multiply-accumulate operations(MACs),\number of floating point operations (flops), floating point operations per second (FLOPS),\nfwd latency (forward propagation latency), bwd latency (backward propagation latency),\nstep (weights update latency), iter latency (sum of fwd, bwd and step latency)\n"
+            "Notations:\ndata parallel size (dp_size), model parallel size(mp_size),\nnumber of parameters (params), number of multiply-accumulate operations(MACs),\number of floating point operations (flops), floating point operations per second (FLOPS),\nfwd latency (forward propagation latency), bwd latency (backward propagation latency),\nstep (weights update latency), iter latency (sum of fwd, bwd and step latency)\n"
         )
         if self.ds_engine:
             print('{:<60} {:<8}'.format('world size: ', self.ds_engine.world_size))
             print('{:<60} {:<8}'.format('data parallel size: ',
                                         self.ds_engine.dp_world_size))
-            print('{:<60} {:<8}'.format('model paralel size: ',
+            print('{:<60} {:<8}'.format('model parallel size: ',
                                         self.ds_engine.mp_world_size))
             print('{:<60} {:<8}'.format(
                 'batch size per GPU: ',
@@ -205,7 +205,7 @@ def model_parallel_cuda_manual_seed(seed):
     Two set of RNG states are tracked:
         default state: This is for data parallelism and is the same among a
                        set of model parallel GPUs but different across
-                       different model paralle groups. This is used for
+                       different model parallel groups. This is used for
                        example for dropout in the non-model-parallel regions.
         model-parallel state: This state is different among a set of model
                               parallel GPUs, but the same across data parallel
@@ -219,7 +219,7 @@ def model_parallel_cuda_manual_seed(seed):
     # 2718 is just for fun and any POSITIVE value will work.
     offset = seed + 2718
     model_parallel_seed = offset + tp_rank
-    # Data parallel gets the original sedd.
+    # Data parallel gets the original seed.
     data_parallel_seed = seed

     if torch.distributed.get_rank() == 0:
@@ -534,7 +534,7 @@ class CheckpointFunction(torch.autograd.Function):
         global cuda_device, transport_stream, PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset

         if cuda_device is None:
-            see_memory_usage("First Forward Begining", force=False)
+            see_memory_usage("First Forward Beginning", force=False)
             if dist.get_rank() == 0:
                 logger.info(f"Activation Checkpointing Information")
                 logger.info(
@@ -886,7 +886,7 @@ def configure(
         PROFILE_TIME = profile

     if CONTIGUOUS_CHECKPOINTING:
-        assert PARTITION_ACTIVATIONS, "Contiguous Checkpointing is only availble with partitioned activations. Set partitioned activations to true in deepspeed config"
+        assert PARTITION_ACTIVATIONS, "Contiguous Checkpointing is only available with partitioned activations. Set partitioned activations to true in deepspeed config"
     if CONTIGUOUS_CHECKPOINTING:
         assert num_layers is not None, "Must specify the number of layers with contiguous memory checkpointing"

@@ -46,7 +46,7 @@ DEEPSPEED_OPTIMIZERS = [
 # extra optimizer parameters for adam/adamw
 TORCH_ADAM_PARAM = "torch_adam"

-# default to adamw logic for adam/adamw optimizers unless user explictly opts out
+# default to adamw logic for adam/adamw optimizers unless user explicitly opts out
 ADAM_W_MODE = "adam_w_mode"
 ADAM_W_MODE_DEFAULT = True

@@ -665,7 +665,7 @@ def get_dataloader_drop_last(param_dict):


 '''Write deepspeed config files by modifying basic templates.
-Can be used for quicly changing parameters via command line parameters.'''
+Can be used for quickly changing parameters via command line parameters.'''


 class DeepSpeedConfigWriter:
@@ -738,7 +738,7 @@ class DeepSpeedConfig(object):
                 f"ds_config ({TRAIN_BATCH_SIZE}, {TRAIN_MICRO_BATCH_SIZE_PER_GPU}, and/or " \
                 f"{GRADIENT_ACCUMULATION_STEPS}). These parameters *will not be used* since " \
                 "elastic training is enabled, which takes control of these parameters. " \
-                "If you want to supress this error (the parameters will be silently ignored) " \
+                "If you want to suppress this error (the parameters will be silently ignored) " \
                 f"please set {IGNORE_NON_ELASTIC_BATCH_INFO}':true in your elasticity config.")

         # micro_bsz * world_size * gas = total_batch_size
@@ -30,7 +30,7 @@ class Eigenvalue(object):
             ranks=[0])

     # Replace all nan/pos-inf/neg-inf to zero
-    # TODO: Pytorch new verion may add this function, replace this one by then.
+    # TODO: Pytorch new version may add this function, replace this one by then.
     def nan_to_num(self, x):
         device = x.device
         x = x.cpu().numpy()
@@ -658,7 +658,7 @@ class DeepSpeedEngine(Module):
             ompi_local_rank = os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK")
             local_rank = os.environ.get('LOCAL_RANK', ompi_local_rank)
             assert ompi_local_rank == local_rank, f"LOCAL_RANK ({local_rank}) != OMPI_COMM_WORLD_LOCAL_RANK ({ompi_local_rank}), " \
-                "not sure how to proceed as we're seeing conficting local rank info."
+                "not sure how to proceed as we're seeing conflicting local rank info."
             os.environ['LOCAL_RANK'] = local_rank

         self.local_rank = int(os.environ['LOCAL_RANK'])
@@ -822,7 +822,7 @@ class DeepSpeedEngine(Module):
         if not self.amp_enabled():
             self._broadcast_model()

    #check if parmaeters are duplicated in optimizer param_groups
    #check if parameters are duplicated in optimizer param_groups
     def _check_for_duplicates(self, optimizer):
         for name, param in self.module.named_parameters():
             param_id = id(param)
@@ -830,12 +830,12 @@ class DeepSpeedEngine(Module):
             def ids_list(group):
                 return [id(param) for param in group]

-            occurance = sum([
+            occurrence = sum([
                 ids_list(group['params']).count(param_id)
                 if param_id in ids_list(group['params']) else 0
                 for group in optimizer.param_groups
             ])
-            assert occurance <= 1, f"Parameter with name: {name} occurs multiple times in optimizer.param_groups. Make sure it only appears once to prevent undefined behaviour."
+            assert occurrence <= 1, f"Parameter with name: {name} occurs multiple times in optimizer.param_groups. Make sure it only appears once to prevent undefined behaviour."

     # Configure optimizer
     def _configure_optimizer(self, client_optimizer, model_parameters):
@@ -918,7 +918,7 @@ class DeepSpeedEngine(Module):
         torch_adam = optimizer_parameters.pop(TORCH_ADAM_PARAM, False)
         adam_w_mode = optimizer_parameters.pop(ADAM_W_MODE, ADAM_W_MODE_DEFAULT)

-        # Optimizer name of Adam forces AdamW logic unless adam_w_mode is explictly set
+        # Optimizer name of Adam forces AdamW logic unless adam_w_mode is explicitly set
         effective_adam_w_mode = self.optimizer_name(
         ) == ADAMW_OPTIMIZER or adam_w_mode

@@ -1225,7 +1225,7 @@ class DeepSpeedEngine(Module):
         if route == ROUTE_TRAIN:
             deepspeed_io_timer = self.tput_timer

-        # If mpu is provied, forward world size and parallel rank to sampler.
+        # If mpu is provided, forward world size and parallel rank to sampler.
         data_parallel_world_size = None
         data_parallel_rank = None
         if self.mpu is not None:
@@ -1506,7 +1506,7 @@ class DeepSpeedEngine(Module):

         report_progress = self.global_rank == 0 if self.global_rank else True

-        # Check overlow here since in DS fp16 optimizer, the overflow is updated in above step() function.
+        # Check overflow here since in DS fp16 optimizer, the overflow is updated in above step() function.
         overflow = False
         if hasattr(self.optimizer, 'overflow'):
             overflow = self.optimizer.overflow
@@ -115,7 +115,7 @@ class OnebitAdam(torch.optim.Optimizer):
             grads (list of tensors, optional): weight gradient to use for the
                 optimizer update. If gradients have type torch.half, parameters
                 are expected to be in type torch.float. (default: None)
-            output params (list of tensors, optional): A reduced recision copy
+            output params (list of tensors, optional): A reduced precision copy
                 of the updated weights written out in addition to the regular
                 updated weights. Have to be of same type as gradients. (default: None)
             scale (float, optional): factor to divide gradient tensor values
@@ -361,7 +361,7 @@ class LRRangeTest(object):
         self.step_rate = lr_range_test_step_rate
         self.last_batch_iteration = last_batch_iteration
         self.staircase = lr_range_test_staircase
-        self.interval_fn = self._staircase_interval if lr_range_test_staircase else self._continous_interval
+        self.interval_fn = self._staircase_interval if lr_range_test_staircase else self._continuous_interval

         if last_batch_iteration == -1:
             self._update_optimizer(self.min_lr)
@@ -369,7 +369,7 @@ class LRRangeTest(object):
     def _staircase_interval(self):
         return math.floor(float(self.last_batch_iteration + 1) / self.step_size)

-    def _continous_interval(self):
+    def _continuous_interval(self):
         return float(self.last_batch_iteration + 1) / self.step_size

     def _get_increase(self):
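For reference, the difference between the two interval functions above: the staircase variant floors the ratio so the learning rate only moves every `step_size` batches, while the continuous variant scales it every batch. A tiny standalone illustration (not DeepSpeed code):

```python
import math

step_size = 5
for last_batch_iteration in [0, 3, 4, 5, 9, 10]:
    staircase = math.floor(float(last_batch_iteration + 1) / step_size)
    continuous = float(last_batch_iteration + 1) / step_size
    print(last_batch_iteration, staircase, continuous)
# The staircase value jumps 0 -> 1 at iteration 4 and 1 -> 2 at iteration 9,
# while the continuous value grows by 1/step_size on every batch.
```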
@@ -514,7 +514,7 @@ class OneCycle(object):
                              decay_mom_rate,
                              last_batch_iteration)

-        # Initalize batch iteration tracker
+        # Initialize batch iteration tracker
         self.last_batch_iteration = last_batch_iteration

         # Configure cycle shape
@@ -175,7 +175,7 @@ class PipelineModule(nn.Module):
             topology = PipeDataParallelTopology(num_pp=num_stages, num_dp=dp)
             self._topo = topology

-        # Contruct communicators for pipeline topology
+        # Construct communicators for pipeline topology
         self._grid = PipelineParallelGrid(process_group=self.world_group,
                                           topology=self._topo)

@@ -177,7 +177,7 @@ def _get_send_recv_group(src_stage, dest_stage):
         stage_id = src_stage
     '''group_id corresponds to group of [group_id, group_id+1]
        unless group_id is the rank of the last stage
-       in which case group_id correspods to group[group_id-num_stages+1, group_id]
+       in which case group_id corresponds to group[group_id-num_stages+1, group_id]
     '''
     group_id = _grid.stage_to_global(stage_id=stage_id)

@@ -392,7 +392,7 @@ class ForwardPass(BufferOpInstruction):

     .. code-block:: python

-        buffers['ouputs'][buffer_id] = forward(buffers['inputs'][buffer_id])
+        buffers['outputs'][buffer_id] = forward(buffers['inputs'][buffer_id])
     """
     pass

@@ -404,7 +404,7 @@ class BackwardPass(BufferOpInstruction):

     .. code-block:: python

-        outputs = buffers['ouputs'][buffer_id]
+        outputs = buffers['outputs'][buffer_id]
         gradients = buffers['gradients'][buffer_id]
         torch.autograd.backward(tensors=outputs,
                                 grad_tensors=gradients)
@@ -233,7 +233,7 @@ def _prime_factors(N):


 class PipeDataParallelTopology(ProcessTopology):
-    """ A topology specialiation for hybrid data and pipeline parallelism.
+    """ A topology specialization for hybrid data and pipeline parallelism.

     Uses data parallelism on the last dimension to encourage gradient
     reductions to use high-bandwidth intra-node links and lower-volume
@@ -154,7 +154,7 @@ class Quantizer(object):
                 f'Quantization settings: current bit-precision = {self.q_start_bits[index]}, step = {self.qsteps}, quantization period = {self.q_period[index]}, index = {index}'
             )
         assert (self.q_start_bits[index] >= self.q_target_bits), \
-            'Quantization bit is lower thab target precision bits!'
+            'Quantization bit is lower than target precision bits!'

         # quantize the weights base on the selected bits and the value-range
         if not self.use_quantizer_kernel:
@@ -63,7 +63,7 @@ class SDLoaderBase(ABC):
         a. if no mp_size resizing occurs, for both training & inference, loading
            the mp_rank related checkpoint directly.
         b. if has mp_size resizing, only Megatron model inference is supported,
-           checkpoint file(s) will be merged/splitted according to mp_rank, mp_world_size and
+           checkpoint file(s) will be merged/split according to mp_rank, mp_world_size and
            checkpoint file list.

     3. Non-PipeModule loading mp_rank_*.pt files, is_pipe_parallel=False
@@ -433,7 +433,7 @@ class MegatronSDLoader(SDLoaderBase):

         sd = torch.load(ckpt_file_name, map_location=lambda storage, loc: storage)

-        # partail_key is a sub-string of one key in the sd
+        # partial_key is a sub-string of one key in the sd
         def check_key_exist(partial_key, sd):
             keys = sd.keys()
             found = False
@ -461,12 +461,12 @@ class OptimizerSwapper(object):
|
|||
self._stop_timer(UNSWAPPED_READ_GRADIENTS)
|
||||
self._log_timers([UNSWAPPED_READ_GRADIENTS])
|
||||
|
||||
# It shoud be safe to discard unswapped gradient partitions
|
||||
# It should be safe to discard unswapped gradient partitions
|
||||
swap_info.release_unswapped_gradients()
|
||||
|
||||
if SWAPPER_DEBUG_MODE:
|
||||
logger.info(
|
||||
f'optimizer_retreive_unswapped_radients: param={swap_info.param_id} tensor_count={tensor_count} elem_count={num_elem_count}'
|
||||
f'optimizer_retrieve_unswapped_gradients: param={swap_info.param_id} tensor_count={tensor_count} elem_count={num_elem_count}'
|
||||
)
|
||||
|
||||
def _get_state_tensors(self, parameter):
|
||||
|
|
|
@ -124,7 +124,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper):
|
|||
return
|
||||
|
||||
self._start_timer(SWAP_OUT_PARAM_TIMER)
|
||||
pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths = self._seperate_pinned_tensors(swap_info)
|
||||
pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths = self._separate_pinned_tensors(swap_info)
|
||||
swap_bytes = sum([
|
||||
self._io_aligned_numel(t.numel()) * t.element_size()
|
||||
for t in swap_info.tensors
|
||||
|
@ -203,7 +203,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper):
|
|||
if DEBUG_MODE and torch.distributed.get_rank() == 0:
|
||||
logger.info(f'optimizer_param_swap_in: {(swap_bytes/(1024**3)):5.2f} GB')
|
||||
|
||||
def _seperate_pinned_tensors(self, swap_info):
|
||||
def _separate_pinned_tensors(self, swap_info):
|
||||
pinned_tensors = []
|
||||
pinned_paths = []
|
||||
|
||||
|
|
|
@ -125,7 +125,7 @@ class AsyncPartitionedParameterSwapper(object):
|
|||
|
||||
self.swap_out_params = []
|
||||
|
||||
#Check if partiitoned param or numel in a tensor is swappable or not
|
||||
#Check if partitioned param or numel in a tensor is swappable or not
|
||||
def swappable_tensor(self, param=None, numel=None):
|
||||
if param is not None:
|
||||
assert numel is None, "Both parma and numel cannot be provided"
|
||||
|
|
|
@ -106,7 +106,7 @@ ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE = 'stage3_prefetch_bucket_size'
|
|||
ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE_DEFAULT = 50000000
|
||||
|
||||
#parameters smaller than the threshold are only communicated once after the
|
||||
#parameters are updated and are persisted thoughout the trainging
|
||||
#parameters are updated and are persisted throughout the training
|
||||
#avoid tons of latency bound communication
|
||||
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD = 'stage3_param_persistence_threshold'
|
||||
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT = 100000
|
||||
|
@ -125,7 +125,7 @@ ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS_DEFAULT = True
|
|||
ZERO_OPTIMIZATION_LEGACY_STAGE1 = "legacy_stage1"
|
||||
ZERO_OPTIMIZATION_LEGACY_STAGE1_DEFAULT = False
|
||||
|
||||
# Stage 2 - partition gradients in a round robin fashsion to load-balance reduction and offload copying
|
||||
# Stage 2 - partition gradients in a round robin fashion to load-balance reduction and offload copying
|
||||
ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS = 'round_robin_gradients'
|
||||
ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS_DEFAULT = False
|
||||
|
||||
|
|
|
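The two hunks above touch the ZeRO configuration constants `stage3_prefetch_bucket_size`, `stage3_param_persistence_threshold` and `round_robin_gradients`. For context, here is a hedged sketch of how these keys appear on the user side of a DeepSpeed config; the values are simply the defaults shown in the constants above, not a recommendation.

```python
# Illustrative only: a config fragment using the keys named in the hunks above.
ds_config = {
    "zero_optimization": {
        "stage": 3,
        "stage3_prefetch_bucket_size": 50000000,       # default from the constant above
        "stage3_param_persistence_threshold": 100000,  # small params stay resident to avoid
                                                       # latency-bound communication
        # Stage 2 option from the second hunk (round-robin gradient partitioning):
        # "round_robin_gradients": False,
    }
}
```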
@@ -10,7 +10,7 @@ class ContiguousMemoryAllocator(object):
def __init__(self, size, dtype, device):
self.buffer = torch.zeros(size, dtype=dtype, device=device)
#address to contiguous size avaialble
#address to contiguous size available
self.contiguous_sizes = {}
self.contiguous_sizes[0] = size

@@ -65,7 +65,7 @@ class ContiguousMemoryAllocator(object):
print_rank_0(
f"Free before allocation {free_before}. Allocating {size}. Free after allocation {self.total_free}. Max allocated {self.max_allocated}"
)
assert self.total_free + size == free_before, "Allcation bookeeping error"
assert self.total_free + size == free_before, "Allocation bookkeeping error"
return ret_tensor

@@ -1,14 +1,14 @@
#Linear Module to use with ZeRO Stage 3 to allow for parameter memory release
#after the module execution during forward
#Instead of saving variables using save_for_backward, we save variable ids
#Allowing us to retrive the variable without creating pointer to it
#Allowing us to retrieve the variable without creating pointer to it
#Which allows for underlying tensor to be garbage collected
#When partitioned as needed by the Zero Stage 3 optimizer
#TODO instead of patching Linear module, we could patch the ctx.save_for_backward
#ctx.saved_tensors so that this approach works for all nn modules that are built upon
#torch.nn.function. However the issue is that many modules uses C++ implementations
#which does not have pytroch implementation. Eg torch.addmm which acts as a funcitonal
#when implemeted outside of torch.autograd.Function
#which does not have pytorch implementation. Eg torch.addmm which acts as a functional
#when implemented outside of torch.autograd.Function
import math

@@ -379,7 +379,7 @@ class Init(InsertPostInitMethodToModuleSubClasses):
effect. Defaults to ``True``.
dtype (``dtype``, optional): Can be used to change the data type of the parameters.
Supported options are ``torch.half`` and ``torch.float``. Defaults to ``None``
mpu (``object``, optional): A model parallelism unit object that implements get_{model,data}_parallel_{rank,group,wolrd_size}.
mpu (``object``, optional): A model parallelism unit object that implements get_{model,data}_parallel_{rank,group,world_size}.
This context accelerates model initialization and enables models that
are too large to allocate in their entirety in CPU memory. It has the
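The docstring above belongs to the `deepspeed.zero.Init` context manager, which partitions parameters as the model is being constructed. A minimal hedged sketch of the usage the docstring describes; the model class is a placeholder and only the arguments named in the docstring are used.

```python
import torch
import deepspeed

# Sketch only: build a large model under zero.Init so parameters are partitioned
# at creation time instead of being fully materialized in CPU memory.
with deepspeed.zero.Init(dtype=torch.half):   # dtype/mpu arguments as in the docstring above
    model = MyHugeTransformer()               # hypothetical model class

# The model is then handed to deepspeed.initialize() as usual.
```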
@@ -1059,7 +1059,7 @@ class Init(InsertPostInitMethodToModuleSubClasses):
if not accumulate:
dest_tensor.copy_(src_tensor)
# if source and destinatoin are on same device,
# if source and destination are on same device,
# add to the provided buffer
elif src_tensor.device == dest_tensor.device:
dest_tensor.add_(src_tensor)

@@ -195,7 +195,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
self.fp16_master_weights_and_gradients = fp16_master_weights_and_gradients
if self.fp16_master_weights_and_gradients:
assert self.cpu_offload and type(self.optimizer) in [DeepSpeedCPUAdam], f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32. Currenty only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}. Either disable fp16_master_weights_and_gradients or enable ZeRO-2 Offload with DeepSpeedCPUAdam"
assert self.cpu_offload and type(self.optimizer) in [DeepSpeedCPUAdam], f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32. Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}. Either disable fp16_master_weights_and_gradients or enable ZeRO-2 Offload with DeepSpeedCPUAdam"
if self.reduce_scatter:
assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled"

@@ -223,7 +223,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
# These are the parameters that will be updated by this process directly
self.params_in_partition = []
# Offset from the first paramter in the the self.params_in_partition
# Offset from the first parameter in the the self.params_in_partition
# the parameter boundaries may not align with partition boundaries
# so we need to keep track of the offset
self.first_offset = []

@@ -909,7 +909,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
partition_id, offset = partition_ids_w_offsets[idx]
# if dist.get_rank() == 0 and count < 100:
# print(f"Rank {dist.get_rank()} rank offet id {idx} calculated dp size {dist.get_world_size(group=process_group)} real dp size {dist.get_world_size(self.real_dp_process_group[i])} and dst: {partition_id}")
# print(f"Rank {dist.get_rank()} rank offset id {idx} calculated dp size {dist.get_world_size(group=process_group)} real dp size {dist.get_world_size(self.real_dp_process_group[i])} and dst: {partition_id}")
# count += 1
# Calculate numel for grad slice depending on partition location

@@ -937,7 +937,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
for i, (dst, bucket_offset, numel) in enumerate(rank_and_offsets):
grad_slice = tensor.narrow(0, int(bucket_offset), int(numel))
# if dist.get_rank() == 0:
# print(f"Rank {dist.get_rank()} rank offet id {i} real dp size {dist.get_world_size(group=real_dp_process_group[i])} and dst: {dst}")
# print(f"Rank {dist.get_rank()} rank offset id {i} real dp size {dist.get_world_size(group=real_dp_process_group[i])} and dst: {dst}")
# dist.barrier()
#dist.barrier()
dst_rank = _get_global_rank(real_dp_process_group[i], dst)

@@ -1008,7 +1008,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
dest_offset,
num_elements)
#accumulate gradients into param.grad or parts of it that belongs to this parittion
#accumulate gradients into param.grad or parts of it that belongs to this partition
def accumulate_gradients():
if not self.fp16_master_weights_and_gradients:
dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1),

@@ -1167,7 +1167,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
device=torch.cuda.current_device())
see_memory_usage(f"after copying {total_size} gradients into partition")
# The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer
# The allreduce buffer will be rewritten. Copy the gradients in partition to a new buffer
new_grad_tensor = self.grads_in_partition.view(-1).narrow(
0,
self.grads_in_partition_offset,

@@ -1257,7 +1257,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
self.sequential_execution(print_func, message)
def get_grads_to_reduce(self, i, partition_id):
def get_reducable_portion(key):
def get_reducible_portion(key):
grad = self.param_dict[key].grad
total_elements = grad.numel()
start = self.grad_start_offset[i][partition_id][key]

@@ -1283,7 +1283,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
grads_to_reduce = []
for key in self.is_grad_computed[i][partition_id]:
grad = get_reducable_portion(key)
grad = get_reducible_portion(key)
grads_to_reduce.append(grad)
return grads_to_reduce

@@ -1500,7 +1500,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
else:
total_norm = 0.0
# if dist.get_rank() == 0:
# logger.info(f"Total Norm begining {total_norm}")
# logger.info(f"Total Norm beginning {total_norm}")
for g, p in zip(gradients, params):
# Pipeline parallelism may replicate parameters. Avoid multi-counting.
if hasattr(p, 'ds_pipe_replicated') and p.ds_pipe_replicated:

@@ -1655,7 +1655,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
self.get_grad_norm_direct(self.averaged_gradients[i],
self.params_in_partition[i]))
# free gradients for all the prameters that are not updated by this process
# free gradients for all the parameters that are not updated by this process
self.free_grad_in_param_list(self.params_not_in_partition[i])
# create a flat gradients for parameters updated by this process
@@ -100,7 +100,7 @@ def _apply_to_tensors_only(module, functional, backward_function, outputs):
return outputs
#for each tensor in outputs run the forward_funciton and register backward_function as hook
#for each tensor in outputs run the forward_function and register backward_function as hook
def _apply_forward_and_backward_to_tensors_only(module,
forward_function,
backward_function,

@@ -178,7 +178,7 @@ class PrefetchCoordinator(object):
# maps sub_module id to submodule objects
self.id_to_sub_module_map = {}
# stores the total number of parmeters in each sub_module
# stores the total number of parameters in each sub_module
self.id_to_sub_module_size_map = {}
self.trace_completed = False

@@ -362,7 +362,7 @@ class PartitionedParameterCoordinator(object):
for param in params_to_prefetch:
param.ds_status = ZeroParamStatus.INFLIGHT
# keeping track of number of elements consumed by available parmaeters
# keeping track of number of elements consumed by available parameters
self._increment_available_parameter_numel(param.ds_numel)
if nvme:

@@ -432,7 +432,7 @@ class PartitionedParameterCoordinator(object):
)
partitioned_params.append(param)
# keeping track of number of elements consumed by available parmaeters
# keeping track of number of elements consumed by available parameters
self._increment_available_parameter_numel(param.ds_numel)
print_rank_0(f"Incrementing with parameter id {param.ds_id}")

@@ -504,7 +504,7 @@ class PartitionedParameterCoordinator(object):
param.ds_active_sub_modules = 0
if param.ds_status == ZeroParamStatus.AVAILABLE:
print_rank_0(
f"Releasing unpartitioned param {debug_param2name_id_numel(param)} active sub-modules {param.ds_active_sub_modules} and persisitence {param.ds_persist}"
f"Releasing unpartitioned param {debug_param2name_id_numel(param)} active sub-modules {param.ds_active_sub_modules} and persistence {param.ds_persist}"
)
self._decrement_available_parameter_numel(param.ds_numel)
param.partition()

@@ -810,7 +810,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
see_memory_usage(f"After creating fp16 partitions: {num_fp16_subgroups}",
force=False)
# Optimizer ensor swapping
# Optimizer tensor swapping
if self.swap_optimizer:
self._configure_tensor_swapping(offload_optimizer_config, aio_config)

@@ -902,7 +902,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
# stores if a grad in a partition has been computed or not
self.is_grad_computed = {}
# will store the averaged gradients required by this parititon
# will store the averaged gradients required by this paritition
self.averaged_gradients = {}
#creates backward hooks for gradient partitioning

@@ -1046,7 +1046,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
src.data = dest.data
src.status = PartitionedParamStatus.AVAILABLE
else:
assert src.status == PartitionedParamStatus.AVAILABLE, "Partitioned Parm must be avialable here"
assert src.status == PartitionedParamStatus.AVAILABLE, "Partitioned Param must be available here"
if not avoid_copy:
dest.data.copy_(src.data)
src.data = dest.data

@@ -1175,7 +1175,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
flat_offset,
total_elements)
print_rank_0(
f"Creating a flat buffer for subgroup {i} requiring {total_elements} elements, and cumulative CPU elemets {flat_offset + total_elements}",
f"Creating a flat buffer for subgroup {i} requiring {total_elements} elements, and cumulative CPU elements {flat_offset + total_elements}",
force=False)
#these parameters reside in NVME and
elif self.params_in_nvme_and_cpu:

@@ -1410,7 +1410,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
self.module.register_forward_hook(_end_of_forward_hook)
self.module.register_forward_pre_hook(_pre_forward_hook)
# Add top todule to stack trace
# Add top module to stack trace
global FWD_MODULE_STACK
FWD_MODULE_STACK.append(self.module)
@@ -1793,7 +1793,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
see_memory_usage(f"End ipg_epilogue", force=False)
# resets all partition to no reduced
# sets remianing grads to the total number of grads in each partition
# sets remaining grads to the total number of grads in each partition
# set is grad computed to false for all grads in partition
def reset_partition_gradient_structures(self):
total_partitions = dist.get_world_size(group=self.dp_process_group)

@@ -2140,7 +2140,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
# self.reduction_stream.synchronize()
if self.gradient_accumulation_steps > 1:
# The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer
# The allreduce buffer will be rewritten. Copy the gradients in partition to a new buffer
fp16_grad_tensor = self.grads_in_partition[i].narrow(
0,
dest_offset,

@@ -2173,7 +2173,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
param,
fp32_grad_tensor)
else:
# The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer
# The allreduce buffer will be rewritten. Copy the gradients in partition to a new buffer
fp16_grad_tensor = self.grads_in_partition[i].narrow(
0,
dest_offset,

@@ -2248,7 +2248,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
self.sequential_execution(print_func, message)
def get_grads_to_reduce(self, i, partition_id):
def get_reducable_portion(key):
def get_reducible_portion(key):
grad = self.param_dict[key].grad
total_elements = grad.numel()
start = self.grad_start_offset[i][partition_id][key]

@@ -2274,7 +2274,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
grads_to_reduce = []
for key in self.is_grad_computed[i][partition_id]:
grad = get_reducable_portion(key)
grad = get_reducible_portion(key)
grads_to_reduce.append(grad)
return grads_to_reduce

@@ -2475,7 +2475,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
else:
total_norm = 0.0
# if dist.get_rank() == 0:
# logger.info(f"Total Norm begining {total_norm}")
# logger.info(f"Total Norm beginning {total_norm}")
for g, p in zip(gradients, params):
if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0):
param_norm = g.data.double().norm(2)

@@ -2819,7 +2819,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
return
def dump_pre_step_gradients(self, debug_fp32_grads):
# Dump gradient norms for debbuging
# Dump gradient norms for debugging
for i, _ in enumerate(self.fp16_groups):
print(f'Pre-Step Dump Norms for Group {i} FP16P, FP16G, FP32G, FP32GUC')
for fp16_param, fp32_grad in zip(self.fp16_groups[i], debug_fp32_grads[i]):

@@ -2831,7 +2831,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
print(f'Pre-Step Norms {i} {param_id} = {norm_list}')
def dump_post_step_gradients(self):
# Dump gradient norms for debbuging
# Dump gradient norms for debugging
for i, group in enumerate(self.fp16_groups):
print(
f'Post-Step Dump Norms for Group {i} FP16P, FP16DS, FP16FLAT, FP32FLAT')

@@ -3137,7 +3137,7 @@ class FP16_DeepSpeedZeroOptimizer_Stage3(object):
def refresh_fp32_params(self):
self._restore_from_fp16_weights()
# Extract flattened partion for current rank from all partitions
# Extract flattened partition for current rank from all partitions
def _get_flattened_partition(self, all_partition_states):
partition_id = dist.get_rank(group=self.dp_process_group)
alignment = dist.get_world_size(group=self.dp_process_group)

@@ -100,7 +100,7 @@ def initialize(ep_size=1, mpu=None):
Arguments:
ep_size (int, optional): default=1, expert parallel size
mpu (module, optional): default=None, model parallel unit (e.g., from Megatron)
that descibes model/data parallel ranks.
that describes model/data parallel ranks.
"""
if mpu is not None:

@@ -121,7 +121,7 @@ def initialize_model_parallel(model_parallel_size_):
Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
use 2 GPUs to parallelize the model. The present function will
create 4 model parallel groups and 2 data parallel grous as:
create 4 model parallel groups and 2 data parallel groups as:
4 model parallel groups:
[g0, g1], [g2, g3], [g4, g5], [g6, g7]
2 data parallel groups:
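The docstring above describes how 8 ranks split into model-parallel and data-parallel groups when the model-parallel size is 2. A small self-contained sketch of that grouping arithmetic (pure Python, no DeepSpeed calls):

```python
def parallel_groups(world_size: int, model_parallel_size: int):
    """Return (model_parallel_groups, data_parallel_groups) of global ranks."""
    model_groups = [
        list(range(start, start + model_parallel_size))
        for start in range(0, world_size, model_parallel_size)
    ]
    data_groups = [
        list(range(offset, world_size, model_parallel_size))
        for offset in range(model_parallel_size)
    ]
    return model_groups, data_groups

# Matches the docstring example: 8 GPUs, model parallel size 2.
mp, dp = parallel_groups(world_size=8, model_parallel_size=2)
assert mp == [[0, 1], [2, 3], [4, 5], [6, 7]]
assert dp == [[0, 2, 4, 6], [1, 3, 5, 7]]
```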
@@ -269,7 +269,7 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
model.load_state_dict(state_dict)
# submit to model hub or save the model to share with others
In this example the ``model`` will no longer be useable in the deepspeed context of the same
In this example the ``model`` will no longer be usable in the deepspeed context of the same
application. i.e. you will need to re-initialize the deepspeed engine, since
``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

@@ -332,7 +332,7 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
# submit to model hub or save the model to share with others
Note, that once this was run, the ``model`` will no longer be useable in the deepspeed context
Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
of the same application. i.e. you will need to re-initialize the deepspeed engine, since
``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
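Both hunks above come from the docstrings of the ZeRO checkpoint consolidation helpers. A hedged end-to-end sketch of the flow those docstrings describe; the import path `deepspeed.utils.zero_to_fp32` is an assumption about where these functions live, and the checkpoint path and `model` are placeholders.

```python
# Sketch only: consolidate a ZeRO checkpoint into a plain fp32 state_dict.
from deepspeed.utils.zero_to_fp32 import (   # assumed import path
    get_fp32_state_dict_from_zero_checkpoint,
    load_state_dict_from_zero_checkpoint,
)

checkpoint_dir = "output/checkpoint-100"     # hypothetical checkpoint directory

# Option 1: build the consolidated state_dict and load it manually.
state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)
model.load_state_dict(state_dict)

# Option 2: a single call that loads it into the model directly.
model = load_state_dict_from_zero_checkpoint(model, checkpoint_dir)

# As the docstrings warn, the model is no longer tied to the DeepSpeed engine
# afterwards; re-run deepspeed.initialize() if training should continue.
```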
@@ -4,7 +4,7 @@ title: "DeepSpeed Configuration JSON"
### Batch Size Related Parameters
**Note:** <i>**train_batch_size**</i> must be equal to <i>**train_micro_batch_size_per_gpu**</i> * <i>**gradient_accumulation**</i> * number of GPUs. For simplicty, you can choose to only specify two of the three parameters, the last one will be inferred automatically by DeepSpeed.
**Note:** <i>**train_batch_size**</i> must be equal to <i>**train_micro_batch_size_per_gpu**</i> * <i>**gradient_accumulation**</i> * number of GPUs. For simplicity, you can choose to only specify two of the three parameters, the last one will be inferred automatically by DeepSpeed.
{: .notice--warning}
<i>**train_batch_size**</i>: [integer]
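As a quick illustration of the batch-size constraint stated in the note above, here is a hedged sketch that checks it for a hypothetical configuration; the three keys are the standard DeepSpeed batch-size keys and the numbers are made up.

```python
# Illustrative check of:
#   train_batch_size == train_micro_batch_size_per_gpu * gradient accumulation * number of GPUs
ds_config = {
    "train_batch_size": 64,
    "train_micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 2,
}
num_gpus = 8  # hypothetical world size

assert ds_config["train_batch_size"] == (
    ds_config["train_micro_batch_size_per_gpu"]
    * ds_config["gradient_accumulation_steps"]
    * num_gpus
)
# Per the note, any one of the three keys may be omitted and DeepSpeed infers it.
```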
@@ -662,7 +662,7 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s
| Description | Default |
| -------------------------------------------------------------------------------------------------------- | ------- |
| Total number of activation checkpoints used to allocate memory buffer for contiguous_memoty_optimization | `None` |
| Total number of activation checkpoints used to allocate memory buffer for contiguous_memory_optimization | `None` |
<i>**synchronize_checkpoint_boundary**</i>: [boolean]

@@ -37,7 +37,7 @@ Figure 2 shows another mixed-precision quantization that sets target bits as 4,
![MoQ (mixed-precision)](/assets/images/bingbert-mixedbit.png){: .align-center}
Figure 3: Mixed-precision quantization with MoQ for Bert SQuAD plus.
As another example, we use eigenvalue-based MoQ to quantize Bert-Large for SQuAD finetuning. Figure 3 shows the number of bits we get to at the end of finetuning on each layer. Here, we see slightly different percision spectrum compared to BertBase on GLUE tasks. As the figure shows, we can reducethe precision on the first few layers more aggressively than the middle ones. Also, the last few layers can tollerate very low precision similar to the beginning layers. This way of quantization finally results in 90.56 F1 Score which is prettey similar to the baseline.
As another example, we use eigenvalue-based MoQ to quantize Bert-Large for SQuAD finetuning. Figure 3 shows the number of bits we get to at the end of finetuning on each layer. Here, we see slightly different precision spectrum compared to BertBase on GLUE tasks. As the figure shows, we can reduce the precision on the first few layers more aggressively than the middle ones. Also, the last few layers can tolerate very low precision similar to the beginning layers. This way of quantization finally results in 90.56 F1 Score which is pretty similar to the baseline.
## Quantized Inference Kernels

@@ -1,6 +1,6 @@
---
layout: single
title: "DeepSpeed Inference: Multi-GPU inference with customized inference kerenls and quantization support"
title: "DeepSpeed Inference: Multi-GPU inference with customized inference kernels and quantization support"
excerpt: ""
categories: news
new_post: false

@@ -27,7 +27,7 @@ MoQ quantization schedule is defined by a number of parameters which allow users
`quantize_groups`: Quantization groups, which shows the number of scales used to quantize a model, default is 1.
`quantize_bits`, The numer of bits to control the data-precision transition from a start-bit to thhe final target-bits (e.g. starting from 16-bit down to 8-bit).
`quantize_bits`, The numer of bits to control the data-precision transition from a start-bit to the final target-bits (e.g. starting from 16-bit down to 8-bit).
`start_bits`: The start bits in quantization training. Default is set to 16.
`target_bits`: The target bits in quantization training. Default is set to 16.

@@ -63,7 +63,7 @@ MoQ quantization schedule is defined by a number of parameters which allow users
## How to Use MoQ for GLUE Training Tasks
Before fine-tunning the GLUE tasks using DeepSpeed MoQ, you need:
Before fine-tuning the GLUE tasks using DeepSpeed MoQ, you need:
1. Install DeepSpeed.
2. Checkout Huggingface transformers branch, install it with all required packages.

@@ -206,6 +206,6 @@ As we see in the following table, MoQ consistently preserve accuracy across diff
### Tips
When using the MoQ, one needs to consider the number of samples and training iterations before setting the correct quatization period or offset to make sure that the quantization reaches the desired level of precision before training finishes.
When using the MoQ, one needs to consider the number of samples and training iterations before setting the correct quantization period or offset to make sure that the quantization reaches the desired level of precision before training finishes.
Enabling eigenvalues for quantization dynamically adjust the quantization period on the different parts of the network. This has two positive impact: 1) the quantized network can potentially produce higher accuracy than quantizing each layer with same `quantize_period` ; 2) it automatically identifies a good quantization schedule for each layer based on its senitivity.
Enabling eigenvalues for quantization dynamically adjust the quantization period on the different parts of the network. This has two positive impact: 1) the quantized network can potentially produce higher accuracy than quantizing each layer with same `quantize_period` ; 2) it automatically identifies a good quantization schedule for each layer based on its sensitivity.
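The MoQ hunks above name the schedule parameters (`quantize_groups`, `quantize_bits` with `start_bits`/`target_bits`, `quantize_period`, and an offset). Below is a rough, hedged sketch of how those knobs might be grouped in a config fragment, using only the parameter names from this page; the exact nesting and key names vary between DeepSpeed versions, so treat this as illustrative and check the configuration docs.

```python
# Illustrative MoQ schedule fragment; the layout is an assumption, values are examples.
moq_schedule = {
    "quantize_groups": 8,      # number of scales used to quantize the model
    "quantize_bits": {
        "start_bits": 16,      # precision at the start of training
        "target_bits": 8,      # precision to reach by the end of training
    },
    "quantize_period": 10,     # steps between precision transitions
    "schedule_offset": 0,      # warm-up steps before quantization starts (assumed name)
}
```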
@@ -26,7 +26,7 @@ ds_report
## Pre-install DeepSpeed Ops
**Note:** [PyTorch](https://pytorch.org/) must be installed _before_ pre-compiling any DeepSpeed c++/cuda ops. However, this is not required if using the default mode of JIT compilition of ops.
**Note:** [PyTorch](https://pytorch.org/) must be installed _before_ pre-compiling any DeepSpeed c++/cuda ops. However, this is not required if using the default mode of JIT compilation of ops.
{: .notice--info}
Sometimes we have found it useful to pre-install either some or all DeepSpeed

@@ -139,14 +139,14 @@ The DeepSpeed Flops Profiler can be used with the DeepSpeed runtime without any
## Flops Measurement
Similar to exsiting flops calculation tools or methods, the DeepSpeed Flops Profiler measures the flops of the forward pass of a module and the flops of the backward pass is estimated as `2` times of that of the forward pass.
Different from the PyTorch profiler which calculates the flops of PyTorch operators, the DeepSpeed Flops Profiler measures the flops witin modules in a model and provides more insights to the users about the model execution.
Similar to existing flops calculation tools or methods, the DeepSpeed Flops Profiler measures the flops of the forward pass of a module and the flops of the backward pass is estimated as `2` times of that of the forward pass.
Different from the PyTorch profiler which calculates the flops of PyTorch operators, the DeepSpeed Flops Profiler measures the flops within modules in a model and provides more insights to the users about the model execution.
The flops estimation is partly inspired by [ptflops](https://github.com/sovrasov/flops-counter.pytorch) with the major difference being that the DeepSpeed Flops Profiler not only supports flops computation directly at module level, but can also capture ```torch.nn.functional``` invoked in a module to estimate the flops.
Thus the DeepSpeed Flops Profiler allows for customized modules in the model, e.g., ```ParallelTransformerLayerworks, ParallelSelfAttention, RowParallelLinear, etc.``` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM). This is in contrast to ptflops which requires users to write customized flops calculation functions for each customized module.
## Multi-GPU, Multi-node, Data Parallelism, and Model Parallelism
The DeepSpeed Flops Profiler outputs the per GPU profile as well as the world size, data parallel size, and model paralel size. 1
The DeepSpeed Flops Profiler outputs the per GPU profile as well as the world size, data parallel size, and model parallel size. 1
For models running on multi-GPU or multi-node, only change of the model parallelism (e.g. ```--model-parallel-size``` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)) affects the number of flops and parameters profiled, i.e.,
`model_parallel_size * flops = total_flops` and `model_parallel_size * parameters = total_parameters`. The data parallel size or world size (related to the number of GPUs or nodes) does not affect the per GPU profile.

@@ -376,7 +376,7 @@ with torch.cuda.device(0):
#### In Model Training Workflow
To profile model forward in a training workflow, use the `FlopsProfiler`class.
The `FlopsProfiler`class provides the follwing methods:
The `FlopsProfiler`class provides the following methods:
* `start_profile()` - starts profiling
* `get_total_flops(as_string=False)` - returns the total number of MACs in the model
* `get_total_params(as_string=False)` - returns the total number of parameters in the model
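The methods listed above belong to the Flops Profiler's manual API. A hedged sketch of the workflow they imply, using only the calls named on this page; the import path is an assumption, and the full docs list further methods for printing and ending a profile.

```python
import torch
from deepspeed.profiling.flops_profiler import FlopsProfiler  # assumed import path

model = torch.nn.Linear(1024, 1024)
x = torch.randn(8, 1024)

prof = FlopsProfiler(model)
prof.start_profile()                             # begin counting, as listed above

model(x)                                         # profile one forward pass

flops = prof.get_total_flops(as_string=True)     # MACs of the forward pass
params = prof.get_total_params(as_string=True)   # total parameter count
print(flops, params)
# Per the measurement note above, the backward pass is estimated as 2x the forward flops.
```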
@@ -7,7 +7,7 @@ This tutorial describes how to use [PyTorch Profiler](https://pytorch.org/blog/i
PyTorch Profiler is an open-source tool that enables accurate and efficient performance analysis and troubleshooting for large-scale deep learning models. The profiling results can be outputted as a `.json` trace file and viewed in Google Chrome's trace viewer (chrome://tracing).
Microsoft Visual Studio Code's Python extension integrates TensorBoard into the code editor, including the support for the PyTorch Profiler.
For more detais, refer to [PYTORCH PROFILER](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html#pytorch-profiler).
For more details, refer to [PYTORCH PROFILER](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html#pytorch-profiler).
## Profile the model training loop
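Since the tutorial above is about wrapping a training loop with PyTorch Profiler, here is a short generic sketch of that pattern using the standard `torch.profiler` API; it is not DeepSpeed-specific, and the trace directory is a placeholder.

```python
import torch
from torch.profiler import (profile, schedule, ProfilerActivity,
                             tensorboard_trace_handler)

model = torch.nn.Linear(512, 512)
opt = torch.optim.SGD(model.parameters(), lr=0.01)

# Emits a trace viewable in TensorBoard / chrome://tracing, as described above.
with profile(activities=[ProfilerActivity.CPU],
             schedule=schedule(wait=1, warmup=1, active=3),
             on_trace_ready=tensorboard_trace_handler("./log")) as prof:
    for _ in range(5):
        loss = model(torch.randn(32, 512)).sum()
        loss.backward()
        opt.step()
        opt.zero_grad()
        prof.step()   # mark a step boundary for the profiler schedule
```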
@@ -148,7 +148,7 @@ Please refer to the Docstrings for details of how to use each module separately.
## How to config sparsity structures
Following we describe supported sparsity structures, their parameter set and the flexibility of adding arbitrary sparsity pattern on the self-attention layer. You can update DeepSpeed config file using any of the supported sparsity structures and set the parameters accordingly.
* **SpasityConfig**:
* **SparsityConfig**:
This module, is the parent class for all sparsity structures and contains the shared features of all sparsity structures. It takes the following parameters:
* `num_heads`: an integer determining number of attention heads of the layer.
* `block`: an integer determining the block size. Current implementation of sparse self-attention is based on blocked sparse matrices. In which this parameter defines size of such square blocks; `Block X Block`.

@@ -164,7 +164,7 @@ This structure is based on [Generative Modeling with Sparse Transformers](https:
![Fixed sparsity structure](/assets/images/sa_fixed_sparsity_structure.png)
* **BSLongformer** (BSLongformerSparistyConfig):
* **BSLongformer** (BSLongformerSparsityConfig):
This structure is an edited version of [Longformer: The Long-Document Transformer](https://arxiv.org/pdf/2004.05150.pdf), in which instead of single token-wise sparsity, we offer block of tokens sparsity. Parameters that define this patters are:
* `num_sliding_window_blocks`: an integer determining the number of blocks in sliding local attention window.
* `global_block_indices`: a list of integers determining which blocks are considered as global attention. Given indices, determine the blocks that all other token blocks attend to and they attend to all other token blocks. Notice that if `global_block_end_indices` parameter is set, this parameter is used as starting index of each global window.
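To make the parameter lists above concrete, here is a hedged sketch that instantiates a BSLongformer sparsity config using only the parameters described on this page; the import path is an assumption and the numbers are arbitrary examples.

```python
# Illustrative only; check deepspeed.ops.sparse_attention for the exact class names.
from deepspeed.ops.sparse_attention import BSLongformerSparsityConfig

sparsity_config = BSLongformerSparsityConfig(
    num_heads=16,                  # attention heads of the layer (SparsityConfig)
    block=16,                      # sparsity works on Block x Block tiles
    num_sliding_window_blocks=3,   # local sliding-window width, in blocks
    global_block_indices=[0],      # block 0 attends to, and is attended by, everything
)
```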
@@ -25,7 +25,7 @@ to add DeepSpeed's builtin arguments to your application's parser.
Training Initialization
-----------------------
The entrypoint for all training with DeepSpeed is ``deepspeed.initialize()``. Will initialize distributed backend if it is not intialized already.
The entrypoint for all training with DeepSpeed is ``deepspeed.initialize()``. Will initialize distributed backend if it is not initialized already.
Example usage:

@@ -39,6 +39,6 @@ Example usage:
Distributed Initialization
-----------------------
Optional distributed backend initializating separate from ``deepspeed.initialize()``. Useful in scenarios where the user wants to use torch distributed calls before calling ``deepspeed.initialize()``, such as when using model parallelism, pipeline parallelism, or certain data loader scenarios.
Optional distributed backend initialization separate from ``deepspeed.initialize()``. Useful in scenarios where the user wants to use torch distributed calls before calling ``deepspeed.initialize()``, such as when using model parallelism, pipeline parallelism, or certain data loader scenarios.
.. autofunction:: deepspeed.init_distributed
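Putting the two sections above together, a minimal hedged sketch of the call order they describe; the argument names follow the `deepspeed.initialize()` usage that appears elsewhere in this commit, and `args` and `model` are placeholders.

```python
import deepspeed

# Optional: set up the distributed backend early, e.g. before building
# model-parallel groups, as the Distributed Initialization section describes.
deepspeed.init_distributed()

# Then hand the model to DeepSpeed; initialize() skips backend setup if it
# was already done above.
model_engine, optimizer, _, lr_scheduler = deepspeed.initialize(
    args=args,                            # placeholder: parsed CLI args with a DeepSpeed config
    model=model,                          # placeholder: your torch.nn.Module
    model_parameters=model.parameters(),
)
```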
@@ -9,7 +9,7 @@ from .transformer import TransformerBuilder
from .stochastic_transformer import StochasticTransformerBuilder
from .utils import UtilsBuilder
from .async_io import AsyncIOBuilder
from .builder import get_default_compute_capatabilities
from .builder import get_default_compute_capabilities
from .transformer_inference import InferenceBuilder
from .quantizer import QuantizerBuilder

@@ -48,7 +48,7 @@ def installed_cuda_version():
return int(cuda_major), int(cuda_minor)
def get_default_compute_capatabilities():
def get_default_compute_capabilities():
compute_caps = DEFAULT_COMPUTE_CAPABILITIES
import torch.utils.cpp_extension
if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version(

@@ -172,7 +172,7 @@ class OpBuilder(ABC):
'''
Test for existence of a function within a tuple of libraries.
This is used as a smoke test to check whether a certain library is avaiable.
This is used as a smoke test to check whether a certain library is available.
As a test, this creates a simple C program that calls the specified function,
and then distutils is used to compile that program and link it with the specified libraries.
Returns True if both the compile and link are successful, False otherwise.

@@ -440,7 +440,7 @@ class CUDAOpBuilder(OpBuilder):
cross_compile_archs = cross_compile_archs_env.replace(' ', ';')
else:
if cross_compile_archs is None:
cross_compile_archs = get_default_compute_capatabilities()
cross_compile_archs = get_default_compute_capabilities()
ccs = cross_compile_archs.split(';')
args = []

setup.py

@@ -31,7 +31,7 @@ except ImportError:
print('[WARNING] Unable to import torch, pre-compiling ops will be disabled. ' \
'Please visit https://pytorch.org/ to see how to properly install torch on your system.')
from op_builder import ALL_OPS, get_default_compute_capatabilities
from op_builder import ALL_OPS, get_default_compute_capabilities
RED_START = '\033[31m'
RED_END = '\033[0m'

@@ -88,7 +88,7 @@ if torch_available and not torch.cuda.is_available():
"you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "
"(compute capabilities 6.0, 6.1, 6.2)")
if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capatabilities()
os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capabilities()
ext_modules = []
@@ -105,7 +105,7 @@ def timeme():
def line_profileme():
print("--------------- line_profier -----------------")
print("--------------- line_profiler -----------------")
print("py")
profile(py)()
gc.collect()

@@ -3,7 +3,7 @@
helpFunction()
{
echo ""
echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s stpes -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers -p [-d]"
echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s steps -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers -p [-d]"
echo -e "\t-m model parallelism"
echo -e "\t-g gpus per node"
echo -e "\t-n node count"

@@ -2,7 +2,7 @@
# https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/modeling.py
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

@@ -295,7 +295,7 @@ class BertConfig(object):
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
hidden_dropout_prob: The dropout probabilitiy for all fully connected
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.

@@ -800,7 +800,7 @@ class BertPreTrainingHeads(nn.Module):
class BertPreTrainedModel(nn.Module):
""" An abstract class to handle weights initialization and
a simple interface for dowloading and loading pretrained models.
a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super(BertPreTrainedModel, self).__init__()

@@ -856,7 +856,7 @@ class BertPreTrainedModel(nn.Module):
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""

@@ -977,7 +977,7 @@ class BertModel(BertPreTrainedModel):
`output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
Outputs: Tuple of (encoded_layers, pooled_output)
`encoded_layers`: controled by `output_all_encoded_layers` argument:
`encoded_layers`: controlled by `output_all_encoded_layers` argument:
- `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
@@ -2,7 +2,7 @@
# https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/modeling.py
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

@@ -295,7 +295,7 @@ class BertConfig(object):
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
hidden_dropout_prob: The dropout probabilitiy for all fully connected
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.

@@ -895,7 +895,7 @@ class BertPreTrainingHeads(nn.Module):
class BertPreTrainedModel(nn.Module):
""" An abstract class to handle weights initialization and
a simple interface for dowloading and loading pretrained models.
a simple interface for downloading and loading pretrained models.
"""
def __init__(self, config, *inputs, **kwargs):
super(BertPreTrainedModel, self).__init__()

@@ -951,7 +951,7 @@ class BertPreTrainedModel(nn.Module):
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""

@@ -1072,7 +1072,7 @@ class BertModel(BertPreTrainedModel):
`output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
Outputs: Tuple of (encoded_layers, pooled_output)
`encoded_layers`: controled by `output_all_encoded_layers` argument:
`encoded_layers`: controlled by `output_all_encoded_layers` argument:
- `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
@@ -165,7 +165,7 @@ def test_ckpt_inputs1_outputs1():
_test_activation_checkpoint(module, inputs)
# both bool and float are important, as bool is not diffentiable
# both bool and float are important, as bool is not differentiable
@pytest.mark.parametrize('mask',
[
_mixed_mask(),

@@ -30,7 +30,7 @@ def test_missing_amp_autocast(tmpdir, half_op):
@pytest.mark.parametrize('half_op', [False, True])
def test_disable_autocast_linear(tmpdir, half_op):
if _skip_autocast_test():
pytest.skip("amp autocast is not availalbe")
pytest.skip("amp autocast is not available")
hidden_dim = 4
if half_op:

@@ -56,7 +56,7 @@ def test_disable_autocast_linear(tmpdir, half_op):
True)])
def test_autocast_linear(tmpdir, half_input, half_weight):
if _skip_autocast_test():
pytest.skip("amp autocast is not availalbe")
pytest.skip("amp autocast is not available")
hidden_dim = 4
input = torch.randn(hidden_dim).cuda()

@@ -20,7 +20,7 @@ def check_equal(first, second, atol=1e-2, verbose=False):
print("x = {}".format(x.flatten()))
print("y = {}".format(y.flatten()))
print('-' * 80)
np.testing.assert_allclose(x, y, err_msg="param-update dismatch!", atol=atol)
np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol)
@pytest.mark.parametrize('model_size',
[

@@ -61,16 +61,16 @@ def check_equal(first, second, atol=1e-2, verbose=False):
for i in range(len(x.shape) - 1):
countx *= x.shape[i + 1]
avgx = np.sum(avgx)
tollerance = 1
tolerance = 1
if avgx != float('inf') and avgx != -float('inf'):
avgx = avgx / countx
tollerance = avgx * atol
tolerance = avgx * atol
if verbose:
print("tollerance is ", tollerance)
print("tolerance is ", tolerance)
print("x = {}".format(x.flatten()))
print("y = {}".format(y.flatten()))
print('-' * 80)
np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=tollerance)
np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=tolerance)
def zero_grad(variables):

@@ -13,7 +13,7 @@ def test_init():
assert dist.get_rank() < 3
# Demonstration of pytest's paramaterization
# Demonstration of pytest's parameterization
@pytest.mark.parametrize('number,color', [(1138, 'purple')])
def test_dist_args(number, color):
"""Outer test function with inputs from pytest.mark.parametrize(). Uses a distributed

@@ -6,7 +6,7 @@ from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_f
from common import distributed_test
def test_flops_profiler_in_ds_trainning(tmpdir):
def test_flops_profiler_in_ds_training(tmpdir):
config_dict = {
"train_batch_size": 1,
"steps_per_print": 1,

@@ -34,7 +34,7 @@ def test_flops_profiler_in_ds_trainning(tmpdir):
model = SimpleModel(hidden_dim, empty_grad=False)
@distributed_test(world_size=[1])
def _test_flops_profiler_in_ds_trainning(args, model, hidden_dim):
def _test_flops_profiler_in_ds_training(args, model, hidden_dim):
model, _, _, _ = deepspeed.initialize(args=args,
model=model,
model_parameters=model.parameters())

@@ -52,7 +52,7 @@ def test_flops_profiler_in_ds_trainning(tmpdir):
assert model.flops_profiler.flops == 100
assert model.flops_profiler.params == 110
_test_flops_profiler_in_ds_trainning(args, model, hidden_dim)
_test_flops_profiler_in_ds_training(args, model, hidden_dim)

class LeNet5(torch.nn.Module):