From 4cf970e6bb3c2ff29b2f03fcddb6f2cf26245a23 Mon Sep 17 00:00:00 2001 From: Alex Hedges Date: Sat, 22 Jan 2022 17:45:58 -0500 Subject: [PATCH] Add codespell to pre-commit checks (#1717) --- .pre-commit-config.yaml | 13 +++++++++++++ .pylintrc | 2 +- csrc/aio/py_lib/py_ds_aio.cpp | 4 ++-- csrc/lamb/fused_lamb_cuda.cpp | 4 ++-- deepspeed/autotuning/README.md | 2 +- deepspeed/autotuning/autotuner.py | 10 +++++----- deepspeed/autotuning/constants.py | 2 +- deepspeed/autotuning/scheduler.py | 2 +- deepspeed/autotuning/tuner/README.md | 2 +- deepspeed/autotuning/tuner/model_based_tuner.py | 2 +- deepspeed/autotuning/utils.py | 2 +- deepspeed/env_report.py | 2 +- deepspeed/profiling/flops_profiler/README.md | 10 +++++----- deepspeed/runtime/comm/coalesced_collectives.py | 2 +- deepspeed/runtime/engine.py | 6 +++--- deepspeed/runtime/fp16/unfused_optimizer.py | 2 +- deepspeed/runtime/lr_schedules.py | 4 ++-- deepspeed/runtime/pipe/engine.py | 2 +- deepspeed/runtime/pipe/p2p.py | 4 ++-- deepspeed/runtime/pipe/topology.py | 2 +- deepspeed/runtime/zero/constants.py | 2 +- .../runtime/zero/contiguous_memory_allocator.py | 4 ++-- deepspeed/runtime/zero/partition_parameters.py | 2 +- deepspeed/runtime/zero/stage3.py | 12 ++++++------ deepspeed/runtime/zero/stage_1_and_2.py | 4 ++-- docs/_pages/config-json.md | 6 +++--- docs/_posts/2021-03-08-zero3-offload.md | 2 +- docs/_tutorials/MoQ-tutorial.md | 6 +++--- docs/_tutorials/flops-profiler.md | 10 +++++----- docs/_tutorials/mixture-of-experts.md | 2 +- docs/_tutorials/moe-inference-tutorial.md | 2 +- docs/_tutorials/sparse-attention.md | 2 +- tests/unit/test_onebit.py | 10 +++++----- tests/unit/test_run.py | 4 ++-- tests/unit/test_zero.py | 6 +++--- 35 files changed, 83 insertions(+), 70 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 61fd67b0b18a..f2fa818101c1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,3 +32,16 @@ repos: hooks: - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available args: [] + +- repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + args: [ + # Do not check files that are automatically generated + '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json', + '--ignore-regex=\\n', # Do not count the 'n' in an escaped newline as part of a word + '--ignore-words-list=unsupport', # Word used in error messages that need rewording + --check-filenames, + --check-hidden + ] diff --git a/.pylintrc b/.pylintrc index 41f18600b6f2..4fe39ed126e5 100644 --- a/.pylintrc +++ b/.pylintrc @@ -55,7 +55,7 @@ confidence= # can either give multiple identifiers separated by comma (,) or put this # option multiple times (only on the command line, not in the configuration # file where it should appear only once). You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if +# disable everything first and then re-enable specific checks. For example, if # you want to run only the similarities checker, you can use "--disable=all # --enable=similarities". 
If you want to run only the classes checker, but have # no Warning level messages displayed, use "--disable=all --enable=classes diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp index 61f95cd99c0e..68590581ce2d 100755 --- a/csrc/aio/py_lib/py_ds_aio.cpp +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -11,9 +11,9 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchornous I/O Read"); + m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchronous I/O Read"); - m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchornous I/O Write"); + m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchronous I/O Write"); m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy"); diff --git a/csrc/lamb/fused_lamb_cuda.cpp b/csrc/lamb/fused_lamb_cuda.cpp index efa4f18d4d94..7a142b13b00c 100644 --- a/csrc/lamb/fused_lamb_cuda.cpp +++ b/csrc/lamb/fused_lamb_cuda.cpp @@ -61,7 +61,7 @@ at::Tensor lamb(at::Tensor& p, // intermediate for weight L2 reduction // make sure that the threads per block is at least 512 during the kernel launch otherwise the - // behavious is unexpected + // behaviour is unexpected at::Tensor w_l2_i = at::empty( {512}, p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float @@ -69,7 +69,7 @@ at::Tensor lamb(at::Tensor& p, // intermediate for update L2 reduction // make sure that the threads per block is at least 512 during the kernel launch otherwise the - // behavious is unexpected + // behaviour is unexpected at::Tensor u_l2_i = at::empty( {512}, p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float diff --git a/deepspeed/autotuning/README.md b/deepspeed/autotuning/README.md index 3278483a0a32..2cb73b01318a 100755 --- a/deepspeed/autotuning/README.md +++ b/deepspeed/autotuning/README.md @@ -167,7 +167,7 @@ For example, the following section in the DeepSpeed configuration file limits th } ``` -The entry bellow asks the Autotuner to use `4` as the micro-batch size per GPU in tuning (micro-batch size per GPU is fixed as 4). Note that it's different from using ` "train_micro_batch_size_per_gpu": [4]` which asks the Autotuner to tune micro-batch size per GPU starting from `4`. +The entry below asks the Autotuner to use `4` as the micro-batch size per GPU in tuning (micro-batch size per GPU is fixed as 4). Note that it's different from using ` "train_micro_batch_size_per_gpu": [4]` which asks the Autotuner to tune micro-batch size per GPU starting from `4`. 
```json { "train_micro_batch_size_per_gpu": [4], diff --git a/deepspeed/autotuning/autotuner.py b/deepspeed/autotuning/autotuner.py index adc955ede6e1..5371d560ffa4 100755 --- a/deepspeed/autotuning/autotuner.py +++ b/deepspeed/autotuning/autotuner.py @@ -341,7 +341,7 @@ def _generate_experiments(self, tuning_space, max_train_batch_size_per_gpu): else: return exps - # replace the corresponding parameter values if the user specfies them in the DeepSpeed configuration file + # replace the corresponding parameter values if the user specifies them in the DeepSpeed configuration file replace_dict(tuning_space, self.user_config, [ZERO_OPTIMIZATION, @@ -511,7 +511,7 @@ def tune_space(self, max_train_batch_size_per_gpu = 0 tuning_micro_batch_sizes_overwritten = False - # calcuate max micro batch size using gpu memory, model instatiation memory and activation memory + # calculate max micro batch size using gpu memory, model instantiation memory and activation memory # calculated_max_micro_batch_size = (memory_per_gpu - instantiation_memory) // activation_memory_micro_batch_size_1 calculated_max_micro_batch_size = int( self.gpu_mem - @@ -584,11 +584,11 @@ def tune_space(self, logger.info(f"End tuning for space: {tuning_space_name}") return max_micro_batch_size, fast_best_mbs, fast_best_metric_val - # if the best metric or the micro batch size for that best metric in the current Zero stage after tuning micro batch size is less than the corrresponding value in the prevous Zero stage, return, do not tune other Zero configuration paramerts + # if the best metric or the micro batch size for that best metric in the current Zero stage after tuning micro batch size is less than the corresponding value in the previous Zero stage, return, do not tune other Zero configuration parameters if stage > 0: if fast_best_mbs <= prev_best_mbs or fast_best_metric_val < prev_best_metric_val: logger.info( - f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration paramerts." + f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration parameters." ) return max_micro_batch_size, fast_best_mbs, fast_best_metric_val @@ -665,7 +665,7 @@ def model_info_profile_run(self): """Does a model information profling experiment that collects the number of model parameters and activation memory.\ The experiment produces a "profile_model_info" folder under self.results_dir. 
Returns: - [dict]: a model inforation dictionary, e.g., {"num_params": 335144976, "trainable_num_params": 335144976, "activation_mem_per_gpu": 324358144, "rank": 0} + [dict]: a model information dictionary, e.g., {"num_params": 335144976, "trainable_num_params": 335144976, "activation_mem_per_gpu": 324358144, "rank": 0} """ logger.info("Starting model info profile run.") model_info = self.autotuning_config.model_info diff --git a/deepspeed/autotuning/constants.py b/deepspeed/autotuning/constants.py index 3cde418edec1..3bfcd2725f90 100644 --- a/deepspeed/autotuning/constants.py +++ b/deepspeed/autotuning/constants.py @@ -137,7 +137,7 @@ } ######################################### -# autotunner serach space constants +# autotunner search space constants ######################################### DEFAULT_HF_CONFIG = { diff --git a/deepspeed/autotuning/scheduler.py b/deepspeed/autotuning/scheduler.py index 0bfa9d19d67f..c42548a61748 100755 --- a/deepspeed/autotuning/scheduler.py +++ b/deepspeed/autotuning/scheduler.py @@ -241,7 +241,7 @@ def parse_results(self, metric): for exp_id, (exp, err) in self.finished_experiments.items(): if err: logger.info( - f"The experiment exp_id = {exp_id}, exp_name = {exp['name']}, did not run succesfully with error = {err}, thus a metrics.txt does not exist for it. Check the stderr.log in {exp['result_dir']}" + f"The experiment exp_id = {exp_id}, exp_name = {exp['name']}, did not run successfully with error = {err}, thus a metrics.txt does not exist for it. Check the stderr.log in {exp['result_dir']}" ) continue diff --git a/deepspeed/autotuning/tuner/README.md b/deepspeed/autotuning/tuner/README.md index e7c860741a37..4218140a29e8 100644 --- a/deepspeed/autotuning/tuner/README.md +++ b/deepspeed/autotuning/tuner/README.md @@ -1,7 +1,7 @@ # Tuner -`exps` is a list of experiment descriptions (dictionarys). +`exps` is a list of experiment descriptions (dictionaries). An experimentation description has a `ds_config` field that stores the DeepSpeed configuration to be used in the experiment. A tuner is based on BaseTuner and at least implements the `next_batch` method. It can implement a different `tune` method from the BaseTuner's. diff --git a/deepspeed/autotuning/tuner/model_based_tuner.py b/deepspeed/autotuning/tuner/model_based_tuner.py index 63c19c56c954..d8bc2b499f3d 100755 --- a/deepspeed/autotuning/tuner/model_based_tuner.py +++ b/deepspeed/autotuning/tuner/model_based_tuner.py @@ -120,7 +120,7 @@ def update(self): feature_val = [] if err: logger.info( - f"Skipping exp_id = {exp_id}, exp_name = {exp['name']}, the experiment did not run succesfully with error = {err}, thus a metrics.txt does not exist for it. Please check the stderr.log in {exp['result_dir']}" + f"Skipping exp_id = {exp_id}, exp_name = {exp['name']}, the experiment did not run successfully with error = {err}, thus a metrics.txt does not exist for it. Please check the stderr.log in {exp['result_dir']}" ) ds_config = exp["ds_config"] flattened_ds_config = flatten(ds_config) diff --git a/deepspeed/autotuning/utils.py b/deepspeed/autotuning/utils.py index 81bbac2cfe93..8cde6b867e8d 100644 --- a/deepspeed/autotuning/utils.py +++ b/deepspeed/autotuning/utils.py @@ -317,7 +317,7 @@ def canonical_name(config: dict, tuning_keys=None, prefix="", omit_val=False): Args: config (dict): the config dict used to generate the name tuning_keys (list, optional): the tuning keys used to generate the name. Defaults to None. - prefix (str, optional): a string added to the begining of the name. Defaults to None. 
+ prefix (str, optional): a string added to the beginning of the name. Defaults to None. """ if TRAIN_MICRO_BATCH_SIZE_PER_GPU not in tuning_keys: tuning_keys.append(TRAIN_MICRO_BATCH_SIZE_PER_GPU) diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py index 5ede10a678c4..4e873e4bf209 100644 --- a/deepspeed/env_report.py +++ b/deepspeed/env_report.py @@ -107,7 +107,7 @@ def parse_arguments(): '--hide_operator_status', action='store_true', help= - 'Suppress display of installation and compatiblity statuses of DeepSpeed operators. ' + 'Suppress display of installation and compatibility statuses of DeepSpeed operators. ' ) parser.add_argument('--hide_errors_and_warnings', action='store_true', diff --git a/deepspeed/profiling/flops_profiler/README.md b/deepspeed/profiling/flops_profiler/README.md index cfbab2f1416a..4c6cd6b12c2b 100644 --- a/deepspeed/profiling/flops_profiler/README.md +++ b/deepspeed/profiling/flops_profiler/README.md @@ -16,7 +16,7 @@ Below is an example output for BERT-Large(NVIDIA) on an A100 GPU with batch size -------------------------- DeepSpeed Flops Profiler -------------------------- Profile Summary at step 10: Notations: -data parallel size (dp_size), model paralel size(mp_size), +data parallel size (dp_size), model parallel size(mp_size), number of parameters (params), number of multiply-accumulate operations(MACs), number of floating-point operations (flops), floating-point operations per second (FLOPS), fwd latency (forward propagation latency), bwd latency (backward propagation latency), @@ -24,7 +24,7 @@ step (weights update latency), iter latency (sum of fwd, bwd and step latency) world size: 1 data parallel size: 1 -model paralel size: 1 +model parallel size: 1 batch size per GPU: 80 params per gpu: 336.23 M params of model = params per GPU * mp_size: 336.23 M @@ -160,7 +160,7 @@ The DeepSpeed Flops Profiler can be used with the DeepSpeed runtime or as a stan - [Example Training Workflow](#example-training-workflow) ### Usage With the DeepSpeed Runtime -When using DeepSpeed for model training, the profiler can be configured in the deepspeed configuration file. No explict API calls are needed to use the profiler. The profiler can be enabled by adding the following field to the `deepspeed_config` json file. Refer to [flops profiler](https://www.deepspeed.ai/docs/config-json/#flops-profiler) for details. +When using DeepSpeed for model training, the profiler can be configured in the deepspeed configuration file. No explicit API calls are needed to use the profiler. The profiler can be enabled by adding the following field to the `deepspeed_config` json file. Refer to [flops profiler](https://www.deepspeed.ai/docs/config-json/#flops-profiler) for details. 
```json { @@ -185,7 +185,7 @@ An example output of 12-layer Megatron-LM model (`hidden_size = 8192, num_attent -------------------------- DeepSpeed Flops Profiler -------------------------- Profile Summary at step 10: Notations: -data parallel size (dp_size), model paralel size(mp_size), +data parallel size (dp_size), model parallel size(mp_size), number of parameters (params), number of multiply-accumulate operations(MACs), number of floating-point operations (flops), floating-point operations per second (FLOPS), fwd latency (forward propagation latency), bwd latency (backward propagation latency), @@ -193,7 +193,7 @@ step (weights update latency), iter latency (sum of fwd, bwd and step latency) world size: 1 data parallel size: 1 -model paralel size: 1 +model parallel size: 1 batch size per GPU: 1024 params per gpu: 1.29 M params of model = params per GPU * mp_size: 1.29 M diff --git a/deepspeed/runtime/comm/coalesced_collectives.py b/deepspeed/runtime/comm/coalesced_collectives.py index 880a3cc46b89..3b42d223182d 100644 --- a/deepspeed/runtime/comm/coalesced_collectives.py +++ b/deepspeed/runtime/comm/coalesced_collectives.py @@ -47,7 +47,7 @@ def reduce_scatter_coalesced( """simultaneously reduce-scatter a list of tensors - this can be done more efficiently than individual reduce scatter calls - TODO. see if PyTorch team wants a c++ verson of this for ProcessGroupNCCL + TODO. see if PyTorch team wants a c++ version of this for ProcessGroupNCCL """ this_rank = torch.distributed.get_rank(group) world_sz = torch.distributed.get_world_size(group) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 6c1aadee3a98..620adfc94d35 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -861,7 +861,7 @@ def _set_distributed_vars(self, args): def _configure_with_arguments(self, args, mpu): # After the distributed backend is initialized we are guaranteed the LOCAL_RANK # environment variable is set. We must align args.local_rank to this value for - # backwards compatability with scripts relying on [args|self].local_rank containing + # backwards compatibility with scripts relying on [args|self].local_rank containing # the correct local rank info. _do_args_sanity_check will ensure this is the case. 
if "OMPI_COMM_WORLD_LOCAL_RANK" in os.environ: @@ -2005,7 +2005,7 @@ def _autotuning_exit(self): msg["latency"] print_json_dist(msg, [0], path=self.autotuning_metric_path()) import atexit - atexit.register(print, "Autotuning: done with runing current ds config.") + atexit.register(print, "Autotuning: done with running current ds config.") exit() def _write_tensorboard(self): @@ -2290,7 +2290,7 @@ def load_moe_state_dict(self, checkpoint_path, tag, state_dict, old_moe_load): global_expert_id = expp_rank * num_local_experts + local_expert_id expert_state_dict = torch.load(self._get_expert_ckpt_name( checkpoint_path, - -1, # -1 means ingore layer_id + -1, # -1 means ignore layer_id global_expert_id, tag), map_location=torch.device('cpu')) diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py index cd1c20ef4235..1837fb10b034 100755 --- a/deepspeed/runtime/fp16/unfused_optimizer.py +++ b/deepspeed/runtime/fp16/unfused_optimizer.py @@ -55,7 +55,7 @@ def __init__(self, #copied to fp16 weights fp32_group = [p.clone().float().detach() for p in param_group['params']] - #incase the internal optimizer needs it + #in case the internal optimizer needs it for p in fp32_group: p.requires_grad = True diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py index 21bc2f4661b6..b4dc749ae193 100755 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -314,7 +314,7 @@ class LRRangeTest(object): the paper `A disciplined approach to neural network hyper-parameters: Part1`_. LRRT policy is used for finding maximum LR that trains a model without divergence, and can be used to - configure the LR boundaries for Cylic LR schedules. + configure the LR boundaries for Cyclic LR schedules. LRRT changes the learning rate after every batch. `step` should be called after a batch has been used for training. @@ -325,7 +325,7 @@ class LRRangeTest(object): lower boundary in the range test for each parameter group. lr_range_test_step_size (int): Interval of training steps to increase learning rate. Default: 2000 lr_range_test_step_rate (float): Scaling rate for range test. Default: 1.0 - lr_range_test_staircase (bool): Scale in staircase fashion, rather than continous. Default: False. + lr_range_test_staircase (bool): Scale in staircase fashion, rather than continuous. Default: False. last_batch_iteration (int): The index of the last batch. This parameter is used when resuming a training job. 
Since `step()` should be invoked after each batch instead of after each epoch, this number represents the total diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 18394605812d..2ed0279a4e3e 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -156,7 +156,7 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs): f'TOTAL_PARAMS={total_params} ({total_params/1e6:0.3f}M) ' f'UNIQUE_PARAMS={unique_params} ({unique_params/1e6:0.3f}M)') - #intialize peer-2-peer communication and allreduce groups + #initialize peer-2-peer communication and allreduce groups if self.is_pipe_parallel: p2p.init_process_groups(self.grid) diff --git a/deepspeed/runtime/pipe/p2p.py b/deepspeed/runtime/pipe/p2p.py index fa3654eaf8ee..8a1b71926006 100644 --- a/deepspeed/runtime/pipe/p2p.py +++ b/deepspeed/runtime/pipe/p2p.py @@ -47,7 +47,7 @@ def _is_valid_send_recv(src_stage, dest_stage): def send(tensor, dest_stage, async_op=False): global _groups - assert async_op == False, "Doesnt support async_op true" + assert async_op == False, "Doesn't support async_op true" src_stage = _grid.get_stage_id() _is_valid_send_recv(src_stage, dest_stage) @@ -68,7 +68,7 @@ def send(tensor, dest_stage, async_op=False): def recv(tensor, src_stage, async_op=False): global _groups - assert async_op == False, "Doesnt support async_op true" + assert async_op == False, "Doesn't support async_op true" dest_stage = _grid.get_stage_id() _is_valid_send_recv(src_stage, dest_stage) diff --git a/deepspeed/runtime/pipe/topology.py b/deepspeed/runtime/pipe/topology.py index 610d23b6faea..240c973a3fc1 100644 --- a/deepspeed/runtime/pipe/topology.py +++ b/deepspeed/runtime/pipe/topology.py @@ -191,7 +191,7 @@ def _filter_helper(x): return True coords = filter(_filter_helper, self.mapping.keys()) - return [self.mapping[coo] for coo in coords] + return [self.mapping[coord] for coord in coords] def get_axis_list(self, axis, idx): """Returns the list of global ranks whose coordinate in an axis is idx. diff --git a/deepspeed/runtime/zero/constants.py b/deepspeed/runtime/zero/constants.py index 804f15ba19e6..7f7c10e9d4af 100755 --- a/deepspeed/runtime/zero/constants.py +++ b/deepspeed/runtime/zero/constants.py @@ -125,7 +125,7 @@ ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS = 'ignore_unused_parameters' ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS_DEFAULT = True -# Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatability reasons +# Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatibility reasons ZERO_OPTIMIZATION_LEGACY_STAGE1 = "legacy_stage1" ZERO_OPTIMIZATION_LEGACY_STAGE1_DEFAULT = False diff --git a/deepspeed/runtime/zero/contiguous_memory_allocator.py b/deepspeed/runtime/zero/contiguous_memory_allocator.py index cf1adebece86..a5b14ae13342 100644 --- a/deepspeed/runtime/zero/contiguous_memory_allocator.py +++ b/deepspeed/runtime/zero/contiguous_memory_allocator.py @@ -96,7 +96,7 @@ def release_tensor(self, tensor): print_rank_0( f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}." ) - assert self.total_free - tensor_size == free_before, "Release bookeeping error" + assert self.total_free - tensor_size == free_before, "Release bookkeeping error" def release_tensor_with_id(self, tensor_id): free_before = self.total_free @@ -109,7 +109,7 @@ def release_tensor_with_id(self, tensor_id): print_rank_0( f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}." 
) - assert self.total_free - tensor_size == free_before, "Release bookeeping error" + assert self.total_free - tensor_size == free_before, "Release bookkeeping error" #shows the current memory allocation at specified resolution def print_allocation(self, resolution=200): diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 42d5622704a9..0acc397138eb 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -691,7 +691,7 @@ def get_model(): self._validate_remote_device(remote_device, _ds_config) - # Remote device is the device where parameter partiitons are stored + # Remote device is the device where parameter partitions are stored # It can be same as local_device or it could be CPU or NVMe. self.remote_device = self.local_device if remote_device is None else remote_device self.pin_memory = pin_memory if (self.remote_device diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 359bb273196c..ad945b745989 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -232,7 +232,7 @@ def __init__( # this is a much less elegant way of fixing this vs something like using # cudaMallocAsync/cudaFreeAsync. Choosing to not expose this to the user now # because ideally in the future its replaced by an async allocation - # mechanism which doesnt require any configuration by the user. + # mechanism which doesn't require any configuration by the user. self.__ongoing_fetch_events: Deque[Event] = collections.deque() # TODO. make this configurable via JSON self.__max_ongoing_fetch_events: int = 2 @@ -250,7 +250,7 @@ def record_trace(self, sub_module: Module) -> None: """adds sub module to trace""" if self.trace_complete: raise RuntimeError( - "attemted to record trace when trace was already complete") + "attempted to record trace when trace was already complete") self.__submodule_order.append(sub_module) for param in sorted(set(iter_params(sub_module)), key=lambda p: p.ds_id): @@ -597,7 +597,7 @@ def __init__(self, # - assume all model params in fp16 # - assume all params requires grad # - flat by groups, not keeping state. TODO: remove state explicitly? - # - master gard and unflat master weight never exist. TODO: a way to save out unflat master? + # - master grad and unflat master weight never exist. TODO: a way to save out unflat master? if not torch.cuda.is_available: raise SystemError("Cannot use fp16 without CUDA.") self.optimizer = init_optimizer @@ -867,7 +867,7 @@ def __init__(self, # stores if a grad in a partition has been computed or not self.is_grad_computed = {} - # will store the averaged gradients required by this paritition + # will store the averaged gradients required by this partition self.averaged_gradients = {} #creates backward hooks for gradient partitioning @@ -1011,8 +1011,8 @@ def elements_in_ipg_bucket(self): def _move_to_flat_buffer(self, param_list, flat_buffer, avoid_copy=False): '''If flat buffer is None then the parameters in the param_list are - not copied to the flat buffer. This is because they excede the number of max_params_in_cpu - Some of these parameters may aready be in CPU in unflattened buffers + not copied to the flat buffer. This is because they exceed the number of max_params_in_cpu + Some of these parameters may already be in CPU in unflattened buffers or they maybe in GPU, or they maybe in NVME. 
If they are in NVME, then they will be marked as NOT_AVAILABLE, and will be moved to CPU when they are needed during training.''' diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index 9d9f051e728c..89f1b34f1cb6 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -132,7 +132,7 @@ def __init__(self, # - assume all model params in fp16 # - assume all params requires grad # - flat by groups, not keeping state. TODO: remove state explicitly? - # - master gard and unflat master weight never exist. TODO: a way to save out unflat master? + # - master grad and unflat master weight never exist. TODO: a way to save out unflat master? if not torch.cuda.is_available: raise SystemError("Cannot use fp16 without CUDA.") self.optimizer = init_optimizer @@ -391,7 +391,7 @@ def __init__(self, # simplified param id self.param_id = {} - #interesting code: unique ids being assigned to individual paramters + #interesting code: unique ids being assigned to individual parameters largest_param_numel = 0 count = 0 for i, params_group in enumerate(self.bit16_groups): diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index 5f81f20867e3..202f4c817d1f 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -653,9 +653,9 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s **overwrite**: [boolean] -| Description | Default | -| ----------------------------------------------------------------------------------------- | ------- | -| Whether to run autotuing experiments whose results alreay exsit. Setting it to true would overwrite the existing result. | `false` | +| Description | Default | +|---------------------------------------------------------------------------------------------------------------------------| ------- | +| Whether to run autotuing experiments whose results already exist. Setting it to true would overwrite the existing result. | `false` | **metric**: [string] diff --git a/docs/_posts/2021-03-08-zero3-offload.md b/docs/_posts/2021-03-08-zero3-offload.md index 6c4f5ab2e4f4..8e5778afa0fc 100644 --- a/docs/_posts/2021-03-08-zero3-offload.md +++ b/docs/_posts/2021-03-08-zero3-offload.md @@ -28,7 +28,7 @@ There are three stages in ZeRO corresponding to three model states, as shown in Figure 1. Overview of ZeRO memory savings -In addition to these three stages, ZeRO family of technology also consists of ZeRO-2 Offload. ZeRO-2 Offload is a heterogenous DL training technology that works in conjunction with ZeRO-2 to offload partitioned optimizer states and gradients to CPU memory. ZeRO-2 Offload offers the full memory advantage of ZeRO-2 even on a single GPU, while at the same time offering great scalability of ZeRO-2 on multi-GPU setup. DeepSpeed library has been offering ZeRO-2 Offload since Sept 2020. For details, please see below: +In addition to these three stages, ZeRO family of technology also consists of ZeRO-2 Offload. ZeRO-2 Offload is a heterogeneous DL training technology that works in conjunction with ZeRO-2 to offload partitioned optimizer states and gradients to CPU memory. ZeRO-2 Offload offers the full memory advantage of ZeRO-2 even on a single GPU, while at the same time offering great scalability of ZeRO-2 on multi-GPU setup. DeepSpeed library has been offering ZeRO-2 Offload since Sept 2020. 
For details, please see below: * ZeRO: [Stage 1 blog](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/), [Stage 2 blog](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/), [Tutorial](/tutorials/zero) * ZeRO-Offload: [Blog](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/#toc-heading-3), [Tutorials](/tutorials/zero-offload), [Paper link](https://arxiv.org/abs/2101.06840) diff --git a/docs/_tutorials/MoQ-tutorial.md b/docs/_tutorials/MoQ-tutorial.md index 726492c8ca9d..d2abb3f11619 100644 --- a/docs/_tutorials/MoQ-tutorial.md +++ b/docs/_tutorials/MoQ-tutorial.md @@ -27,20 +27,20 @@ MoQ quantization schedule is defined by a number of parameters which allow users `quantize_groups`: Quantization groups, which shows the number of scales used to quantize a model, default is 1. -`quantize_bits`, The numer of bits to control the data-precision transition from a start-bit to the final target-bits (e.g. starting from 16-bit down to 8-bit). +`quantize_bits`, The number of bits to control the data-precision transition from a start-bit to the final target-bits (e.g. starting from 16-bit down to 8-bit). `start_bits`: The start bits in quantization training. Default is set to 16. `target_bits`: The target bits in quantization training. Default is set to 16. `quantize_schedule`, This determines how to schedule the training steps at each precision level. - `quantize_period`: indicates the period by which we reduce down the precison (number of bits) for quantization. By default, we use a period of 100 training steps, that will be doubled every time the precision reduces by 1 bit. + `quantize_period`: indicates the period by which we reduce down the precision (number of bits) for quantization. By default, we use a period of 100 training steps, that will be doubled every time the precision reduces by 1 bit. `schedule_offset`: indicates when the quantization starts to happen (before this offset, we just use the normal training precision which can be either FP32/FP16). Default is set to 100 steps. `quantize_algo`, The algorithm used to quantize the model. `q_type`: we currently support symmetric and asymmetric quantization that result in signed and unsigned integer values, respectively. Default is set to symmetric - `rounding`: for the rounding of the quantized values, we can either round to the nearest value or use stocahstic rounding. Default is set to nearest. + `rounding`: for the rounding of the quantized values, we can either round to the nearest value or use stochastic rounding. Default is set to nearest. 
### Eigenvalue Parameters diff --git a/docs/_tutorials/flops-profiler.md b/docs/_tutorials/flops-profiler.md index 1333932755d2..f5dfd05a6964 100644 --- a/docs/_tutorials/flops-profiler.md +++ b/docs/_tutorials/flops-profiler.md @@ -20,7 +20,7 @@ Below is an example output for BERT-Large(NVIDIA) on an A100 GPU with batch size -------------------------- DeepSpeed Flops Profiler -------------------------- Profile Summary at step 10: Notations: -data parallel size (dp_size), model paralel size(mp_size), +data parallel size (dp_size), model parallel size(mp_size), number of parameters (params), number of multiply-accumulate operations(MACs), number of floating-point operations (flops), floating-point operations per second (FLOPS), fwd latency (forward propagation latency), bwd latency (backward propagation latency), @@ -28,7 +28,7 @@ step (weights update latency), iter latency (sum of fwd, bwd and step latency) world size: 1 data parallel size: 1 -model paralel size: 1 +model parallel size: 1 batch size per GPU: 80 params per gpu: 336.23 M params of model = params per GPU * mp_size: 336.23 M @@ -166,7 +166,7 @@ The DeepSpeed Flops Profiler can be used with the DeepSpeed runtime or as a stan ### Usage With the DeepSpeed Runtime -When using DeepSpeed for model training, the profiler can be configured in the deepspeed [configuration file](/docs/config-json/#flops-profiler). No explict API calls are needed to use the profiler. The profiler can be enabled by adding the following field to deepspeed's configuration json file. Refer to [flops profiler](/docs/config-json/#flops-profiler) for details. +When using DeepSpeed for model training, the profiler can be configured in the deepspeed [configuration file](/docs/config-json/#flops-profiler). No explicit API calls are needed to use the profiler. The profiler can be enabled by adding the following field to deepspeed's configuration json file. Refer to [flops profiler](/docs/config-json/#flops-profiler) for details. ```json { @@ -191,7 +191,7 @@ An example output of 12-layer Megatron-LM model (`hidden_size = 8192, num_attent -------------------------- DeepSpeed Flops Profiler -------------------------- Profile Summary at step 10: Notations: -data parallel size (dp_size), model paralel size(mp_size), +data parallel size (dp_size), model parallel size(mp_size), number of parameters (params), number of multiply-accumulate operations(MACs), number of floating-point operations (flops), floating-point operations per second (FLOPS), fwd latency (forward propagation latency), bwd latency (backward propagation latency), @@ -199,7 +199,7 @@ step (weights update latency), iter latency (sum of fwd, bwd and step latency) world size: 1 data parallel size: 1 -model paralel size: 1 +model parallel size: 1 batch size per GPU: 1024 params per gpu: 1.29 M params of model = params per GPU * mp_size: 1.29 M diff --git a/docs/_tutorials/mixture-of-experts.md b/docs/_tutorials/mixture-of-experts.md index 91d7d87864fa..4d025a6cacf6 100644 --- a/docs/_tutorials/mixture-of-experts.md +++ b/docs/_tutorials/mixture-of-experts.md @@ -80,7 +80,7 @@ self.experts = deepspeed.moe.layer.MoE(hidden_size=input_dim, expert=ExpertModul ``` With the above two commands, the DeepSpeed runtime will be set to train an MoE model with a total of 8 experts on 4 GPUs in 4 experts/GPU mode. We call this the E + D mode as described earlier in the table. -For more advanced use case of the groups API including the inter-operability with Megatron style mpu object, watch this space! 
+For more advanced use case of the groups API including the interoperability with Megatron style mpu object, watch this space! ```python diff --git a/docs/_tutorials/moe-inference-tutorial.md b/docs/_tutorials/moe-inference-tutorial.md index c9d2826171c1..b3abf1a531b5 100644 --- a/docs/_tutorials/moe-inference-tutorial.md +++ b/docs/_tutorials/moe-inference-tutorial.md @@ -24,7 +24,7 @@ In this part, we elaborate the usage of MoE inference support in the DeepSpeed l First step to use DeepSpeed-MoE inferenece is to initialize the expert-parallel groups. To do so, one can use the group utility from DeepSpeed to initialize the group (`deepspeed.utils.groups.initialize`). This function creates the groups based on minimum of the world\_size (total number of GPUs) and expert size. By using this group, we can partition the experts among the expert-parallel GPUs. If number of experts is lower than total number of GPUs, DeepSpeed-MoE leverages expert-slicing for partitioning the expert parameters between the expert-parallel GPUs. -For inference with DeepSpeed-MoE, use `init_inference` API to load the MoE model for inference. Here, you can specify the Model-parallelism/tensor-slicing (MP) degree, number of experts, and if the model has not been loaded with the appropriate checkpoint, you can also provide the checkpoint description using a `json` file or simply pass the `'checkpoint'` path to load the moddel. To inject the high-performance inference kernels, you can pass int the `replace_method` as `'auto'` and set the `replace_with_kernel_inject` to True. +For inference with DeepSpeed-MoE, use `init_inference` API to load the MoE model for inference. Here, you can specify the Model-parallelism/tensor-slicing (MP) degree, number of experts, and if the model has not been loaded with the appropriate checkpoint, you can also provide the checkpoint description using a `json` file or simply pass the `'checkpoint'` path to load the model. To inject the high-performance inference kernels, you can pass int the `replace_method` as `'auto'` and set the `replace_with_kernel_inject` to True. ```python diff --git a/docs/_tutorials/sparse-attention.md b/docs/_tutorials/sparse-attention.md index eaa8bca0df80..8905b38debbc 100644 --- a/docs/_tutorials/sparse-attention.md +++ b/docs/_tutorials/sparse-attention.md @@ -184,7 +184,7 @@ This structure also combines the idea of local, global and random attention. Fur * `global_block_end_indices`: a list of integers determining end indices of global window blocks. By default this is not used. But if it is set, it must have the same size as `global_block_indices` parameter, and combining this two parameters, for each index `i`, blocks from `global_block_indices[i]` to `global_block_end_indices[i]` (exclusive) are considered as global attention block. * `attention`: a string determining attention type. Attention can be `unidirectional`, such as autoregressive models, in which tokens attend only to tokens appear before them in the context. Considering that, the upper triangular of attention matrix is empty as above figure. Or it can be `bidirectional`, such as BERT, in which tokens can attend to any other tokens before or after them. Then, the upper triangular part of the attention matrix is mirror of the lower triangular in the above figure. * `horizontal_global_attention`: a boolean determining if blocks that are global representative of a local window, also attend to all other blocks. This is valid only if attention type is `bidirectional`. 
Looking at the attention matrix, that means global attention not only includes the vertical blocks, but also horizontal blocks -Figure bellow illustrates an example of `variable` sparsity, in which blue, orange and green blocks illustrate local, global, and random attention blocks respectively. +Figure below illustrates an example of `variable` sparsity, in which blue, orange and green blocks illustrate local, global, and random attention blocks respectively. ![Variable sparsity structure](/assets/images/sa_variable_sparsity_structure.png) diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py index 6f113c837e62..c6450fb81d62 100644 --- a/tests/unit/test_onebit.py +++ b/tests/unit/test_onebit.py @@ -265,7 +265,7 @@ def _test_onebitadam_checkpointing(mask1, mask2, args, model, hidden_dim): load_optimizer_states=True, load_lr_scheduler_states=True) assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Momentum mask should not change after loading checkpoint" - # Test whether worker&server error is resetted + # Test whether worker&server error is reset for v in optimizer_2.state.values(): assert 'worker_error' not in v, f"Incorrect worker error" assert 'server_error' not in v, f"Incorrect server error" @@ -291,7 +291,7 @@ def _test_onebitadam_checkpointing(mask1, mask2, args, model, hidden_dim): load_optimizer_states=True, load_lr_scheduler_states=True) assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Momentum mask should not change after loading checkpoint" - # Test whether worker&server error is resetted + # Test whether worker&server error is reset for v in optimizer_3.state.values(): assert 'worker_error' not in v, f"Incorrect worker error" assert 'server_error' not in v, f"Incorrect server error" @@ -682,7 +682,7 @@ def _test_onebitlamb_checkpointing(mask1, mask2, args, model, hidden_dim): load_optimizer_states=True, load_lr_scheduler_states=True) assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Momentum mask should not change after loading checkpoint" - # Test whether worker&server error is resetted + # Test whether worker&server error is reset assert len(optimizer_2.optimizer.worker_errors) == 0, f"Incorrect worker error" assert len(optimizer_2.optimizer.server_errors) == 0, f"Incorrect server error" # Test whether scaling_coeffs is loaded correctly @@ -713,10 +713,10 @@ def _test_onebitlamb_checkpointing(mask1, mask2, args, model, hidden_dim): load_optimizer_states=True, load_lr_scheduler_states=True) assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Momentum mask should not change after loading checkpoint" - # Test whether worker&server error is resetted + # Test whether worker&server error is reset assert len(optimizer_3.optimizer.worker_errors) == 0, f"Incorrect worker error" assert len(optimizer_3.optimizer.server_errors) == 0, f"Incorrect server error" - # Test whether scaling_coeffs, lamb_coeff_freeze, last_factor are resetted + # Test whether scaling_coeffs, lamb_coeff_freeze, last_factor are reset for v in optimizer_3.state.values(): assert v['lamb_coeff_freeze'] == 0.0, f"Incorrect lamb_coeff_freeze" assert v['last_factor'] == 1.0, f"Incorrect last_factor" diff --git a/tests/unit/test_run.py b/tests/unit/test_run.py index ed069c335fe6..f2b0a8b2018a 100644 --- a/tests/unit/test_run.py +++ b/tests/unit/test_run.py @@ -13,7 +13,7 @@ def test_parser_mutual_exclusive(): def test_parser_local(): ''' Test cases with only one node. 
''' - # First try no incude/exclude + # First try no include/exclude hosts = {'worker-0': [0, 1, 2, 3]} ret = dsrun.parse_resource_filter(hosts) assert (ret == hosts) @@ -49,7 +49,7 @@ def test_parser_local(): def test_parser_multinode(): - # First try no incude/exclude + # First try no include/exclude hosts = {'worker-0': [0, 1, 2, 3], 'worker-1': [0, 1, 2, 3]} ret = dsrun.parse_resource_filter(hosts) assert (ret == hosts) diff --git a/tests/unit/test_zero.py b/tests/unit/test_zero.py index c2ff33a14042..82042eee36df 100755 --- a/tests/unit/test_zero.py +++ b/tests/unit/test_zero.py @@ -736,7 +736,7 @@ def create_tensor(vals, dtype: torch.dtype = None) -> Tensor: ds_engine.optimizer.zero_grad() # TODO. add testing for this - for now we just call it to make sure it - # doesnt throw + # doesn't throw ds_engine.optimizer.step() # taking an optimizer step invalidates all parameters, make sure everything # has been partitioned afterwards @@ -978,7 +978,7 @@ def _distributed_test(): @pytest.mark.skip( - reason="depends on upgraded pytorch and nccl that isnt always available") + reason="depends on upgraded pytorch and nccl that isn't always available") @pytest.mark.parametrize("param_persistence_threshold", [0, 10]) @pytest.mark.parametrize("contiguous_gradients", [True, False]) @pytest.mark.parametrize("offload_optimizer", [True, False]) @@ -1174,7 +1174,7 @@ def create_tensor(vals): ds_engine.optimizer.zero_grad() # TODO. add testing for this - for now we just call it to make sure it - # doesnt throw + # doesn't throw ds_engine.optimizer.step() _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE})
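
---

For reviewers who want to exercise the new hook locally, below is a minimal sketch — not part of the patch itself — of how the `codespell` check configured in `.pre-commit-config.yaml` above could be run. It assumes `pre-commit` and `codespell` (v2.1.0, matching the pinned `rev`) are already installed, and it is written as a small Python wrapper around their command-line interfaces purely for illustration, not as anything DeepSpeed ships.

```python
"""Sketch: run the codespell pre-commit hook added in this patch.

Assumes `pip install pre-commit codespell==2.1.0` has been run and that the
working directory is the repository root containing .pre-commit-config.yaml.
"""
import subprocess

# Run only the new hook (hook id "codespell") against every tracked file,
# the same way pre-commit would at commit time. check=True raises
# CalledProcessError if any misspelling is reported.
subprocess.run(["pre-commit", "run", "codespell", "--all-files"], check=True)

# Equivalent direct invocation, mirroring the args from the hook entry above;
# useful when iterating on --ignore-words-list before committing.
subprocess.run(
    [
        "codespell",
        # Skip files that are automatically generated
        "--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json",
        # Do not count the 'n' in an escaped newline as part of a word
        r"--ignore-regex=\\n",
        "--ignore-words-list=unsupport",
        "--check-filenames",
        "--check-hidden",
    ],
    check=True,
)
```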