Add codespell to pre-commit checks (microsoft#1717)
aphedges authored Jan 22, 2022
1 parent 09c065b commit 4cf970e
Showing 35 changed files with 83 additions and 70 deletions.
13 changes: 13 additions & 0 deletions .pre-commit-config.yaml
@@ -32,3 +32,16 @@ repos:
  hooks:
    - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
      args: []

+- repo: https://github.com/codespell-project/codespell
+  rev: v2.1.0
+  hooks:
+    - id: codespell
+      args: [
+          # Do not check files that are automatically generated
+          '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json',
+          '--ignore-regex=\\n', # Do not count the 'n' in an escaped newline as part of a word
+          '--ignore-words-list=unsupport', # Word used in error messages that need rewording
+          --check-filenames,
+          --check-hidden
+      ]
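For reference, a hedged sketch of running the same check by hand (the arguments mirror the hook configuration above; the equivalent one-liner is `pre-commit run codespell --all-files`):

```python
import subprocess

# Invoke codespell (v2.1.0, `pip install codespell`) with the hook's arguments.
result = subprocess.run(
    [
        "codespell",
        "--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json",
        r"--ignore-regex=\\n",  # regex matching a literal '\n' escape in source text
        "--ignore-words-list=unsupport",
        "--check-filenames",
        "--check-hidden",
    ],
    capture_output=True,
    text=True,
)
# codespell prints one "path:line: word ==> correction" entry per finding
# and exits non-zero when anything was found.
print(result.stdout, end="")
print("clean" if result.returncode == 0 else "misspellings found")
```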
2 changes: 1 addition & 1 deletion .pylintrc
@@ -55,7 +55,7 @@ confidence=
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
-# disable everything first and then reenable specific checks. For example, if
+# disable everything first and then re-enable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
4 changes: 2 additions & 2 deletions csrc/aio/py_lib/py_ds_aio.cpp
@@ -11,9 +11,9 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchornous I/O Read");
m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchronous I/O Read");

m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchornous I/O Write");
m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchronous I/O Write");

m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy");

4 changes: 2 additions & 2 deletions csrc/lamb/fused_lamb_cuda.cpp
@@ -61,15 +61,15 @@ at::Tensor lamb(at::Tensor& p,

// intermediate for weight L2 reduction
// make sure that the threads per block is at least 512 during the kernel launch otherwise the
-// behavious is unexpected
+// behaviour is unexpected
at::Tensor w_l2_i = at::empty(
{512},
p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float
: p.type().scalarType()));

// intermediate for update L2 reduction
// make sure that the threads per block is at least 512 during the kernel launch otherwise the
-// behavious is unexpected
+// behaviour is unexpected
at::Tensor u_l2_i = at::empty(
{512},
p.options().dtype(p.type().scalarType() == at::ScalarType::Half ? at::ScalarType::Float
2 changes: 1 addition & 1 deletion deepspeed/autotuning/README.md
@@ -167,7 +167,7 @@ For example, the following section in the DeepSpeed configuration file limits th
}
```

-The entry bellow asks the Autotuner to use `4` as the micro-batch size per GPU in tuning (micro-batch size per GPU is fixed as 4). Note that it's different from using ` "train_micro_batch_size_per_gpu": [4]` which asks the Autotuner to tune micro-batch size per GPU starting from `4`.
+The entry below asks the Autotuner to use `4` as the micro-batch size per GPU in tuning (micro-batch size per GPU is fixed as 4). Note that it's different from using ` "train_micro_batch_size_per_gpu": [4]` which asks the Autotuner to tune micro-batch size per GPU starting from `4`.
```json
{
"train_micro_batch_size_per_gpu": [4],
```
10 changes: 5 additions & 5 deletions deepspeed/autotuning/autotuner.py
@@ -341,7 +341,7 @@ def _generate_experiments(self, tuning_space, max_train_batch_size_per_gpu):
else:
return exps

-# replace the corresponding parameter values if the user specfies them in the DeepSpeed configuration file
+# replace the corresponding parameter values if the user specifies them in the DeepSpeed configuration file
replace_dict(tuning_space,
self.user_config,
[ZERO_OPTIMIZATION,
@@ -511,7 +511,7 @@ def tune_space(self,
max_train_batch_size_per_gpu = 0
tuning_micro_batch_sizes_overwritten = False

-# calcuate max micro batch size using gpu memory, model instatiation memory and activation memory
+# calculate max micro batch size using gpu memory, model instantiation memory and activation memory
# calculated_max_micro_batch_size = (memory_per_gpu - instantiation_memory) // activation_memory_micro_batch_size_1
calculated_max_micro_batch_size = int(
self.gpu_mem -
@@ -584,11 +584,11 @@ def tune_space(self,
logger.info(f"End tuning for space: {tuning_space_name}")
return max_micro_batch_size, fast_best_mbs, fast_best_metric_val

-# if the best metric or the micro batch size for that best metric in the current Zero stage after tuning micro batch size is less than the corrresponding value in the prevous Zero stage, return, do not tune other Zero configuration paramerts
+# if the best metric or the micro batch size for that best metric in the current Zero stage after tuning micro batch size is less than the corresponding value in the previous Zero stage, return, do not tune other Zero configuration parameters
if stage > 0:
if fast_best_mbs <= prev_best_mbs or fast_best_metric_val < prev_best_metric_val:
logger.info(
f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration paramerts."
f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration parameters."
)
return max_micro_batch_size, fast_best_mbs, fast_best_metric_val

@@ -665,7 +665,7 @@ def model_info_profile_run(self):
"""Does a model information profling experiment that collects the number of model parameters and activation memory.\
The experiment produces a "profile_model_info" folder under self.results_dir.
Returns:
-[dict]: a model inforation dictionary, e.g., {"num_params": 335144976, "trainable_num_params": 335144976, "activation_mem_per_gpu": 324358144, "rank": 0}
+[dict]: a model information dictionary, e.g., {"num_params": 335144976, "trainable_num_params": 335144976, "activation_mem_per_gpu": 324358144, "rank": 0}
"""
logger.info("Starting model info profile run.")
model_info = self.autotuning_config.model_info
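The comment corrected in the second hunk states the bound the Autotuner uses: `calculated_max_micro_batch_size = (memory_per_gpu - instantiation_memory) // activation_memory_micro_batch_size_1`. A hedged sketch of that arithmetic with made-up numbers (the variable names and figures are illustrative, not the Autotuner's actual attributes):

```python
# Illustrative only: plug assumed figures into the Autotuner's bound.
gpu_mem = 40 * 2**30                  # assumed: 40 GiB of memory per GPU
instantiation_memory = 6 * 2**30      # assumed: model instantiation footprint
activation_mem_batch_1 = 2 * 2**30    # assumed: activation memory at micro-batch size 1

max_micro_batch_size = (gpu_mem - instantiation_memory) // activation_mem_batch_1
print(max_micro_batch_size)  # 17 samples per GPU under these assumptions
```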
2 changes: 1 addition & 1 deletion deepspeed/autotuning/constants.py
@@ -137,7 +137,7 @@
}

#########################################
-# autotunner serach space constants
+# autotunner search space constants
#########################################

DEFAULT_HF_CONFIG = {
2 changes: 1 addition & 1 deletion deepspeed/autotuning/scheduler.py
@@ -241,7 +241,7 @@ def parse_results(self, metric):
for exp_id, (exp, err) in self.finished_experiments.items():
if err:
logger.info(
f"The experiment exp_id = {exp_id}, exp_name = {exp['name']}, did not run succesfully with error = {err}, thus a metrics.txt does not exist for it. Check the stderr.log in {exp['result_dir']}"
f"The experiment exp_id = {exp_id}, exp_name = {exp['name']}, did not run successfully with error = {err}, thus a metrics.txt does not exist for it. Check the stderr.log in {exp['result_dir']}"
)
continue

2 changes: 1 addition & 1 deletion deepspeed/autotuning/tuner/README.md
@@ -1,7 +1,7 @@
# Tuner


-`exps` is a list of experiment descriptions (dictionarys).
+`exps` is a list of experiment descriptions (dictionaries).
An experimentation description has a `ds_config` field that stores the DeepSpeed configuration to be used in the experiment.

A tuner is based on BaseTuner and at least implements the `next_batch` method. It can implement a different `tune` method from the BaseTuner's.
2 changes: 1 addition & 1 deletion deepspeed/autotuning/tuner/model_based_tuner.py
@@ -120,7 +120,7 @@ def update(self):
feature_val = []
if err:
logger.info(
f"Skipping exp_id = {exp_id}, exp_name = {exp['name']}, the experiment did not run succesfully with error = {err}, thus a metrics.txt does not exist for it. Please check the stderr.log in {exp['result_dir']}"
f"Skipping exp_id = {exp_id}, exp_name = {exp['name']}, the experiment did not run successfully with error = {err}, thus a metrics.txt does not exist for it. Please check the stderr.log in {exp['result_dir']}"
)
ds_config = exp["ds_config"]
flattened_ds_config = flatten(ds_config)
2 changes: 1 addition & 1 deletion deepspeed/autotuning/utils.py
@@ -317,7 +317,7 @@ def canonical_name(config: dict, tuning_keys=None, prefix="", omit_val=False):
Args:
config (dict): the config dict used to generate the name
tuning_keys (list, optional): the tuning keys used to generate the name. Defaults to None.
-prefix (str, optional): a string added to the begining of the name. Defaults to None.
+prefix (str, optional): a string added to the beginning of the name. Defaults to None.
"""
if TRAIN_MICRO_BATCH_SIZE_PER_GPU not in tuning_keys:
tuning_keys.append(TRAIN_MICRO_BATCH_SIZE_PER_GPU)
2 changes: 1 addition & 1 deletion deepspeed/env_report.py
@@ -107,7 +107,7 @@ def parse_arguments():
'--hide_operator_status',
action='store_true',
help=
-'Suppress display of installation and compatiblity statuses of DeepSpeed operators. '
+'Suppress display of installation and compatibility statuses of DeepSpeed operators. '
)
parser.add_argument('--hide_errors_and_warnings',
action='store_true',
10 changes: 5 additions & 5 deletions deepspeed/profiling/flops_profiler/README.md
@@ -16,15 +16,15 @@ Below is an example output for BERT-Large(NVIDIA) on an A100 GPU with batch size
-------------------------- DeepSpeed Flops Profiler --------------------------
Profile Summary at step 10:
Notations:
-data parallel size (dp_size), model paralel size(mp_size),
+data parallel size (dp_size), model parallel size(mp_size),
number of parameters (params), number of multiply-accumulate operations(MACs),
number of floating-point operations (flops), floating-point operations per second (FLOPS),
fwd latency (forward propagation latency), bwd latency (backward propagation latency),
step (weights update latency), iter latency (sum of fwd, bwd and step latency)

world size: 1
data parallel size: 1
-model paralel size: 1
+model parallel size: 1
batch size per GPU: 80
params per gpu: 336.23 M
params of model = params per GPU * mp_size: 336.23 M
@@ -160,7 +160,7 @@ The DeepSpeed Flops Profiler can be used with the DeepSpeed runtime or as a stan
- [Example Training Workflow](#example-training-workflow)
### Usage With the DeepSpeed Runtime

-When using DeepSpeed for model training, the profiler can be configured in the deepspeed configuration file. No explict API calls are needed to use the profiler. The profiler can be enabled by adding the following field to the `deepspeed_config` json file. Refer to [flops profiler](https://www.deepspeed.ai/docs/config-json/#flops-profiler) for details.
+When using DeepSpeed for model training, the profiler can be configured in the deepspeed configuration file. No explicit API calls are needed to use the profiler. The profiler can be enabled by adding the following field to the `deepspeed_config` json file. Refer to [flops profiler](https://www.deepspeed.ai/docs/config-json/#flops-profiler) for details.

```json
{
```

@@ -185,15 +185,15 @@ An example output of 12-layer Megatron-LM model (`hidden_size = 8192, num_attent
-------------------------- DeepSpeed Flops Profiler --------------------------
Profile Summary at step 10:
Notations:
-data parallel size (dp_size), model paralel size(mp_size),
+data parallel size (dp_size), model parallel size(mp_size),
number of parameters (params), number of multiply-accumulate operations(MACs),
number of floating-point operations (flops), floating-point operations per second (FLOPS),
fwd latency (forward propagation latency), bwd latency (backward propagation latency),
step (weights update latency), iter latency (sum of fwd, bwd and step latency)

world size: 1
data parallel size: 1
-model paralel size: 1
+model parallel size: 1
batch size per GPU: 1024
params per gpu: 1.29 M
params of model = params per GPU * mp_size: 1.29 M
2 changes: 1 addition & 1 deletion deepspeed/runtime/comm/coalesced_collectives.py
@@ -47,7 +47,7 @@ def reduce_scatter_coalesced(
"""simultaneously reduce-scatter a list of tensors - this can be done more
efficiently than individual reduce scatter calls
-TODO. see if PyTorch team wants a c++ verson of this for ProcessGroupNCCL
+TODO. see if PyTorch team wants a c++ version of this for ProcessGroupNCCL
"""
this_rank = torch.distributed.get_rank(group)
world_sz = torch.distributed.get_world_size(group)
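For readers unfamiliar with the collective documented in this hunk, a plain-Python sketch of reduce-scatter semantics (what each rank ends up holding), not the NCCL implementation:

```python
# Two ranks each contribute a 4-element vector; after reduce-scatter,
# rank r holds the elementwise sum of chunk r from every rank.
inputs = [
    [1, 2, 3, 4],      # rank 0's input
    [10, 20, 30, 40],  # rank 1's input
]
world_sz = len(inputs)
chunk = len(inputs[0]) // world_sz
outputs = [
    [sum(v[r * chunk + i] for v in inputs) for i in range(chunk)]
    for r in range(world_sz)
]
print(outputs)  # [[11, 22], [33, 44]]: rank 0 keeps [11, 22], rank 1 keeps [33, 44]
```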
6 changes: 3 additions & 3 deletions deepspeed/runtime/engine.py
@@ -861,7 +861,7 @@ def _set_distributed_vars(self, args):
def _configure_with_arguments(self, args, mpu):
# After the distributed backend is initialized we are guaranteed the LOCAL_RANK
# environment variable is set. We must align args.local_rank to this value for
-# backwards compatability with scripts relying on [args|self].local_rank containing
+# backwards compatibility with scripts relying on [args|self].local_rank containing
# the correct local rank info. _do_args_sanity_check will ensure this is the case.

if "OMPI_COMM_WORLD_LOCAL_RANK" in os.environ:
@@ -2005,7 +2005,7 @@ def _autotuning_exit(self):
msg["latency"]
print_json_dist(msg, [0], path=self.autotuning_metric_path())
import atexit
-atexit.register(print, "Autotuning: done with runing current ds config.")
+atexit.register(print, "Autotuning: done with running current ds config.")
exit()

def _write_tensorboard(self):
@@ -2290,7 +2290,7 @@ def load_moe_state_dict(self, checkpoint_path, tag, state_dict, old_moe_load):
global_expert_id = expp_rank * num_local_experts + local_expert_id
expert_state_dict = torch.load(self._get_expert_ckpt_name(
checkpoint_path,
--1, # -1 means ingore layer_id
+-1, # -1 means ignore layer_id
global_expert_id,
tag),
map_location=torch.device('cpu'))
2 changes: 1 addition & 1 deletion deepspeed/runtime/fp16/unfused_optimizer.py
@@ -55,7 +55,7 @@ def __init__(self,
#copied to fp16 weights
fp32_group = [p.clone().float().detach() for p in param_group['params']]

-#incase the internal optimizer needs it
+#in case the internal optimizer needs it
for p in fp32_group:
p.requires_grad = True

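The fixed comment sits in the fp32 master-weight setup shown in this hunk. A small standalone sketch of that pattern (not the optimizer's actual code path):

```python
import torch

# Keep an fp32 master copy of an fp16 parameter, as in the hunk above.
fp16_param = torch.ones(4, dtype=torch.float16)
fp32_copy = fp16_param.clone().float().detach()
fp32_copy.requires_grad = True  # in case the internal optimizer needs it
print(fp32_copy.dtype, fp32_copy.requires_grad)  # torch.float32 True
```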
4 changes: 2 additions & 2 deletions deepspeed/runtime/lr_schedules.py
@@ -314,7 +314,7 @@ class LRRangeTest(object):
the paper `A disciplined approach to neural network hyper-parameters: Part1`_.
LRRT policy is used for finding maximum LR that trains a model without divergence, and can be used to
-configure the LR boundaries for Cylic LR schedules.
+configure the LR boundaries for Cyclic LR schedules.
LRRT changes the learning rate after every batch.
`step` should be called after a batch has been used for training.
@@ -325,7 +325,7 @@ class LRRangeTest(object):
lower boundary in the range test for each parameter group.
lr_range_test_step_size (int): Interval of training steps to increase learning rate. Default: 2000
lr_range_test_step_rate (float): Scaling rate for range test. Default: 1.0
-lr_range_test_staircase (bool): Scale in staircase fashion, rather than continous. Default: False.
+lr_range_test_staircase (bool): Scale in staircase fashion, rather than continuous. Default: False.
last_batch_iteration (int): The index of the last batch. This parameter is used when
resuming a training job. Since `step()` should be invoked after each
batch instead of after each epoch, this number represents the total
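The docstring fixes above touch LRRangeTest's continuous-versus-staircase option. An illustrative schedule showing the distinction (an assumed formula for illustration, not the class's actual implementation):

```python
import math

def range_test_lr(min_lr, step, step_size=2000, step_rate=1.0, staircase=False):
    # Grow the LR as steps pass; staircase holds it constant within each interval.
    interval = step / step_size
    if staircase:
        interval = math.floor(interval)
    return min_lr * (1 + step_rate * interval)

print(range_test_lr(1e-4, 3000))                  # continuous: 2.5e-04
print(range_test_lr(1e-4, 3000, staircase=True))  # staircase:  2.0e-04
```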
2 changes: 1 addition & 1 deletion deepspeed/runtime/pipe/engine.py
@@ -156,7 +156,7 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs):
f'TOTAL_PARAMS={total_params} ({total_params/1e6:0.3f}M) '
f'UNIQUE_PARAMS={unique_params} ({unique_params/1e6:0.3f}M)')

-#intialize peer-2-peer communication and allreduce groups
+#initialize peer-2-peer communication and allreduce groups
if self.is_pipe_parallel:
p2p.init_process_groups(self.grid)

4 changes: 2 additions & 2 deletions deepspeed/runtime/pipe/p2p.py
@@ -47,7 +47,7 @@ def _is_valid_send_recv(src_stage, dest_stage):

def send(tensor, dest_stage, async_op=False):
global _groups
-assert async_op == False, "Doesnt support async_op true"
+assert async_op == False, "Doesn't support async_op true"
src_stage = _grid.get_stage_id()
_is_valid_send_recv(src_stage, dest_stage)

@@ -68,7 +68,7 @@ def send(tensor, dest_stage, async_op=False):

def recv(tensor, src_stage, async_op=False):
global _groups
-assert async_op == False, "Doesnt support async_op true"
+assert async_op == False, "Doesn't support async_op true"
dest_stage = _grid.get_stage_id()
_is_valid_send_recv(src_stage, dest_stage)

2 changes: 1 addition & 1 deletion deepspeed/runtime/pipe/topology.py
@@ -191,7 +191,7 @@ def _filter_helper(x):
return True

coords = filter(_filter_helper, self.mapping.keys())
-return [self.mapping[coo] for coo in coords]
+return [self.mapping[coord] for coord in coords]

def get_axis_list(self, axis, idx):
"""Returns the list of global ranks whose coordinate in an axis is idx.
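The list comprehension renamed here filters a coordinate-to-rank mapping. A self-contained sketch of the same idea on a hypothetical 2 x 2 pipe/data grid (not DeepSpeed's actual `ProcessTopology` class):

```python
from collections import namedtuple

Coord = namedtuple("Coord", ["pipe", "data"])
# Hypothetical grid: four ranks laid out over pipe and data axes.
mapping = {Coord(0, 0): 0, Coord(0, 1): 1, Coord(1, 0): 2, Coord(1, 1): 3}

def get_axis_list(axis, idx):
    # Global ranks whose coordinate along `axis` equals `idx`.
    coords = filter(lambda c: getattr(c, axis) == idx, mapping.keys())
    return [mapping[coord] for coord in coords]

print(get_axis_list("pipe", 0))  # [0, 1]
print(get_axis_list("data", 1))  # [1, 3]
```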
2 changes: 1 addition & 1 deletion deepspeed/runtime/zero/constants.py
@@ -125,7 +125,7 @@
ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS = 'ignore_unused_parameters'
ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS_DEFAULT = True

-# Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatability reasons
+# Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatibility reasons
ZERO_OPTIMIZATION_LEGACY_STAGE1 = "legacy_stage1"
ZERO_OPTIMIZATION_LEGACY_STAGE1_DEFAULT = False

4 changes: 2 additions & 2 deletions deepspeed/runtime/zero/contiguous_memory_allocator.py
@@ -96,7 +96,7 @@ def release_tensor(self, tensor):
print_rank_0(
f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}."
)
-assert self.total_free - tensor_size == free_before, "Release bookeeping error"
+assert self.total_free - tensor_size == free_before, "Release bookkeeping error"

def release_tensor_with_id(self, tensor_id):
free_before = self.total_free
@@ -109,7 +109,7 @@ def release_tensor_with_id(self, tensor_id):
print_rank_0(
f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}."
)
-assert self.total_free - tensor_size == free_before, "Release bookeeping error"
+assert self.total_free - tensor_size == free_before, "Release bookkeeping error"

#shows the current memory allocation at specified resolution
def print_allocation(self, resolution=200):
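The assertion being fixed guards a simple invariant: releasing a tensor must grow the free counter by exactly the tensor's size. A toy sketch of that bookkeeping (a hypothetical class, not the real allocator):

```python
class ToyAllocator:
    # Minimal free-count bookkeeping, mirroring the invariant in the asserts above.
    def __init__(self, numel):
        self.total_free = numel

    def allocate(self, numel):
        assert numel <= self.total_free, "Out of contiguous memory"
        self.total_free -= numel

    def release(self, numel):
        free_before = self.total_free
        self.total_free += numel
        assert self.total_free - numel == free_before, "Release bookkeeping error"

alloc = ToyAllocator(1024)
alloc.allocate(256)
alloc.release(256)
print(alloc.total_free)  # 1024
```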
2 changes: 1 addition & 1 deletion deepspeed/runtime/zero/partition_parameters.py
@@ -691,7 +691,7 @@ def get_model():

self._validate_remote_device(remote_device, _ds_config)

-# Remote device is the device where parameter partiitons are stored
+# Remote device is the device where parameter partitions are stored
# It can be same as local_device or it could be CPU or NVMe.
self.remote_device = self.local_device if remote_device is None else remote_device
self.pin_memory = pin_memory if (self.remote_device