
Commit

Fix many typos (#1423)
* Fix typos in docs/

* Fix typos in code comments and output strings

* Fix typos in the code itself

* Fix typos in tests/

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
aphedges and tjruwase authored Oct 2, 2021
1 parent 30965ea commit be789b1
Showing 70 changed files with 227 additions and 227 deletions.
2 changes: 1 addition & 1 deletion csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -196,7 +196,7 @@ bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
{
const auto op_string = read_op ? "Read" : "Write";
if (num_bytes % get_thread_count()) {
-        std::cout << "deepseed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
+        std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
<< " not divisible by thread count = " << get_thread_count() << std::endl;
return false;
}
4 changes: 2 additions & 2 deletions csrc/aio/py_test/ds_aio_basic.py
@@ -130,15 +130,15 @@ def _aio_handle_tasklet(pool_params):
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops


-def _init_takslet(b):
+def _init_tasklet(b):
global aio_barrier
aio_barrier = b


def aio_basic_multiprocessing(args, read_op):
b = Barrier(args.threads)
pool_params = [(args, p, read_op) for p in range(args.threads)]
-    with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p:
+    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)

report_results(args, read_op, pool_results)
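
The `_init_takslet` → `_init_tasklet` rename above (and its twin in `ds_aio_handle.py` below) touches a standard multiprocessing idiom: synchronization primitives such as `Barrier` generally cannot be shipped to pool workers through `map` task arguments, so the benchmark hands the barrier to each worker via the pool initializer, which stores it in a module-level global. A minimal, runnable sketch of that idiom — names other than `Barrier` and `Pool` are illustrative, not taken from the benchmark:

```python
from multiprocessing import Barrier, Pool

# Module-level slot; each worker's initializer fills it in.
worker_barrier = None


def _init_worker(barrier):
    # Runs once in every worker process, before any tasks execute.
    global worker_barrier
    worker_barrier = barrier


def _task(task_id):
    # All workers rendezvous here, so the timed work starts together.
    worker_barrier.wait()
    return task_id * task_id


if __name__ == "__main__":
    num_workers = 4
    b = Barrier(num_workers)
    with Pool(processes=num_workers, initializer=_init_worker, initargs=(b, )) as pool:
        print(pool.map(_task, range(num_workers)))
```

With one task per worker, every process blocks in `wait()` until all have arrived, which is what lets the benchmark measure the parallel phase fairly.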
4 changes: 2 additions & 2 deletions csrc/aio/py_test/ds_aio_handle.py
@@ -162,15 +162,15 @@ def _aio_handle_tasklet(pool_params):
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops


-def _init_takslet(b):
+def _init_tasklet(b):
global aio_barrier
aio_barrier = b


def aio_handle_multiprocessing(args, read_op):
b = Barrier(args.threads)
pool_params = [(args, p, read_op) for p in range(args.threads)]
-    with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p:
+    with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)

report_results(args, read_op, pool_results)
4 changes: 2 additions & 2 deletions csrc/aio/py_test/run_read_sweep.sh
@@ -5,7 +5,7 @@ if [[ $# -ne 2 ]]; then
fi


-function validate_enviroment()
+function validate_environment()
{
validate_cmd="python ./validate_async_io.py"
eval ${validate_cmd}
@@ -18,7 +18,7 @@ function validate_enviroment()
}


-validate_enviroment
+validate_environment

INPUT_FILE=$1
if [[ ! -f ${INPUT_FILE} ]]; then
4 changes: 2 additions & 2 deletions csrc/aio/py_test/run_write_sweep.sh
@@ -9,7 +9,7 @@ function prep_folder()
fi
}

-function validate_enviroment()
+function validate_environment()
{
validate_cmd="python ./validate_async_io.py"
eval ${validate_cmd}
@@ -23,7 +23,7 @@ function validate_enviroment()



-validate_enviroment
+validate_environment

if [[ $# -ne 3 ]]; then
echo "Usage: $0 <write size in MB> <write dir ><output log dir>"
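
Both sweep scripts use the renamed `validate_environment` as a fail-fast guard: run a small probe command and abort before starting a long sweep if the prerequisites are missing. A rough Python equivalent of the same guard — the `validate_async_io.py` script name comes from the hunks above; everything else is illustrative:

```python
import subprocess
import sys


def validate_environment():
    # Run the probe script; a nonzero exit code means the host lacks
    # async I/O prerequisites, so abort the sweep early.
    result = subprocess.run([sys.executable, "./validate_async_io.py"])
    if result.returncode != 0:
        sys.exit("Environment validation failed; aborting sweep.")


validate_environment()
```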
8 changes: 4 additions & 4 deletions csrc/includes/custom_cuda_layers.h
@@ -27,25 +27,25 @@
#define MAX_REG 256

template <typename T>
-void launch_qunatize_kernel(T* vals,
+void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
-void launch_sr_qunatize_kernel(T* vals,
+void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
-void launch_qunatize_kernel_asym(T* vals,
+void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
-void launch_sr_qunatize_kernel_asym(T* vals,
+void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
2 changes: 1 addition & 1 deletion csrc/includes/ds_transformer_cuda.h
@@ -179,6 +179,6 @@ class BertTransformerLayer {
bool _normalize_invertible;
bool _gelu_checkpoint;

-    // High Performace flags
+    // High Performance flags
bool _stochastic_mode;
};
4 changes: 2 additions & 2 deletions csrc/includes/softmax.h
@@ -17,14 +17,14 @@ class Softmax {
size_t heads;
size_t seq_length;
size_t prob_depth;
-        float temprature;
+        float temperature;
bool mem_alloc;
Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false)
: batchSize(batch),
heads(h),
seq_length(seq),
prob_depth(prob_size),
-              temprature(1.0),
+              temperature(1.0),
mem_alloc(mem_alloc)
{
}
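
The corrected `temperature` field is the standard softmax temperature: logits are divided by `T` before exponentiation, and the `temperature(1.0)` default leaves the distribution unchanged. A small NumPy sketch of the math (not DeepSpeed's kernel):

```python
import numpy as np


def softmax(logits, temperature=1.0):
    # Scale by 1/T, then apply a numerically stable softmax.
    # T > 1 flattens the distribution; T < 1 sharpens it.
    z = np.asarray(logits, dtype=np.float64) / temperature
    z -= z.max(axis=-1, keepdims=True)  # guard against overflow in exp
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)


print(softmax([2.0, 1.0, 0.1]))                   # default T = 1.0
print(softmax([2.0, 1.0, 0.1], temperature=2.0))  # flatter
```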
2 changes: 1 addition & 1 deletion csrc/includes/type_shim.h
@@ -1,7 +1,7 @@
/* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */
#include <ATen/ATen.h>

-// Forward/backward compatiblity hack around
+// Forward/backward compatibility hack around
// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
// pending more future-proof guidance from upstream.
// struct TypeShim
8 changes: 4 additions & 4 deletions csrc/quantization/pt_binding.cpp
@@ -11,7 +11,7 @@ at::Tensor ds_quantize(at::Tensor& vals, int groups, int bits)
for (auto dim : t_size) size *= dim;

if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) {
-        launch_qunatize_kernel(
+        launch_quantize_kernel(
(T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
}
return vals;
@@ -25,7 +25,7 @@ at::Tensor ds_sr_quantize(at::Tensor& vals, int groups, int bits)
for (auto dim : t_size) size *= dim;

if (((size / groups) / 4 / 1024) <= 256) {
-        launch_sr_qunatize_kernel(
+        launch_sr_quantize_kernel(
(T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
}
return vals;
@@ -39,7 +39,7 @@ at::Tensor ds_quantize_asym(at::Tensor& vals, int groups, int bits)
for (auto dim : t_size) size *= dim;

if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) {
-        launch_qunatize_kernel_asym(
+        launch_quantize_kernel_asym(
(T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
}
return vals;
@@ -53,7 +53,7 @@ at::Tensor ds_sr_quantize_asym(at::Tensor& vals, int groups, int bits)
for (auto dim : t_size) size *= dim;

if (((size / groups) / 4 / 1024) <= 256) {
-        launch_sr_qunatize_kernel_asym(
+        launch_sr_quantize_kernel_asym(
(T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
}
return vals;
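
As an aside, the guard `(((size / groups) - 1) / 4096 + 1) <= MAX_REG` in these bindings is integer ceiling division: for positive `n`, `(n - 1) / d + 1` equals ⌈n / d⌉, here counting how many 4096-element chunks each group needs. A quick check of the identity:

```python
import math

d = 4096
for n in (1, 4095, 4096, 4097, 8192, 12289):
    # (n - 1) // d + 1 is the classic integer form of ceil(n / d).
    assert (n - 1) // d + 1 == math.ceil(n / d)
    print(f"ceil({n}/{d}) = {(n - 1) // d + 1}")
```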
48 changes: 24 additions & 24 deletions csrc/quantization/quantizer.cu
@@ -3,7 +3,7 @@

namespace cg = cooperative_groups;

-__global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
+__global__ void quantize_kernel(__half* vals, int group_size, int num_bits)
{
#if __CUDA_ARCH__ >= 700

@@ -93,7 +93,7 @@ __global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
#endif
}

-__global__ void qunatize_kernel(float* vals, int group_size, int num_bits)
+__global__ void quantize_kernel(float* vals, int group_size, int num_bits)
{
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -176,7 +176,7 @@ __global__ void qunatize_kernel(float* vals, int group_size, int num_bits)
}

template <typename T>
-void launch_qunatize_kernel(T* vals,
+void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
@@ -185,22 +185,22 @@ void launch_qunatize_kernel(T* vals,
dim3 grid_dim(group_num);
dim3 block_dim(1024);

-    qunatize_kernel<<<grid_dim, block_dim, 0, stream>>>(
+    quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
vals, (total_count / group_num) / 4, num_bits);
}

-template void launch_qunatize_kernel(float* vals,
+template void launch_quantize_kernel(float* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
-template void launch_qunatize_kernel(__half* vals,
+template void launch_quantize_kernel(__half* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);

-__global__ void sr_qunatize_kernel(__half* vals,
+__global__ void sr_quantize_kernel(__half* vals,
int token_size,
int token_num,
int num_bits,
@@ -336,7 +336,7 @@ __global__ void sr_qunatize_kernel(__half* vals,
#endif
}

-__global__ void sr_qunatize_kernel(float* vals,
+__global__ void sr_quantize_kernel(float* vals,
int token_size,
int token_num,
int num_bits,
@@ -456,7 +456,7 @@ __global__ void sr_qunatize_kernel(float* vals,
}

template <typename T>
-void launch_sr_qunatize_kernel(T* vals,
+void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
@@ -468,21 +468,21 @@ void launch_sr_qunatize_kernel(T* vals,
uint64_t inc = total_count / grid_dim.x / block_dim.x;
std::pair<uint64_t, uint64_t> seed = Context::Instance().IncrementOffset(inc);

-    sr_qunatize_kernel<<<grid_dim, block_dim, 0, stream>>>(
+    sr_quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
vals, (total_count / group_num) / 4, group_num, num_bits, seed);
}
-template void launch_sr_qunatize_kernel(float* vals,
+template void launch_sr_quantize_kernel(float* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
-template void launch_sr_qunatize_kernel(__half* vals,
+template void launch_sr_quantize_kernel(__half* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);

-__global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
+__global__ void quantize_kernel_asym(__half* vals, int group_size, int num_bits)
{
#if __CUDA_ARCH__ >= 700

@@ -595,7 +595,7 @@ __global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
#endif
}

-__global__ void qunatize_kernel_asym(float* vals, int group_size, int num_bits)
+__global__ void quantize_kernel_asym(float* vals, int group_size, int num_bits)
{
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
@@ -699,7 +699,7 @@ __global__ void qunatize_kernel_asym(float* vals, int group_size, int num_bits)
}

template <typename T>
-void launch_qunatize_kernel_asym(T* vals,
+void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
@@ -708,22 +708,22 @@ void launch_qunatize_kernel_asym(T* vals,
dim3 grid_dim(group_num);
dim3 block_dim(1024);

-    qunatize_kernel_asym<<<grid_dim, block_dim, 0, stream>>>(
+    quantize_kernel_asym<<<grid_dim, block_dim, 0, stream>>>(
vals, (total_count / group_num) / 4, num_bits);
}

-template void launch_qunatize_kernel_asym(float* vals,
+template void launch_quantize_kernel_asym(float* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
-template void launch_qunatize_kernel_asym(__half* vals,
+template void launch_quantize_kernel_asym(__half* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);

-__global__ void sr_qunatize_kernel_asym(__half* vals,
+__global__ void sr_quantize_kernel_asym(__half* vals,
int token_size,
int token_num,
int num_bits,
@@ -879,7 +879,7 @@ __global__ void sr_qunatize_kernel_asym(__half* vals,
#endif
}

-__global__ void sr_qunatize_kernel_asym(float* vals,
+__global__ void sr_quantize_kernel_asym(float* vals,
int token_size,
int token_num,
int num_bits,
@@ -1010,7 +1010,7 @@ __global__ void sr_qunatize_kernel_asym(float* vals,
}
}
template <typename T>
-void launch_sr_qunatize_kernel_asym(T* vals,
+void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
@@ -1022,15 +1022,15 @@ void launch_sr_qunatize_kernel_asym(T* vals,
uint64_t inc = total_count / grid_dim.x / block_dim.x;
std::pair<uint64_t, uint64_t> seed = Context::Instance().IncrementOffset(inc);

-    sr_qunatize_kernel<<<grid_dim, block_dim, 0, stream>>>(
+    sr_quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
vals, (total_count / group_num) / 4, group_num, num_bits, seed);
}
-template void launch_sr_qunatize_kernel_asym(float* vals,
+template void launch_sr_quantize_kernel_asym(float* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
-template void launch_sr_qunatize_kernel_asym(__half* vals,
+template void launch_sr_quantize_kernel_asym(__half* vals,
int total_count,
int group_num,
int num_bits,
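
For readers following the renames: the `quantize_kernel*` functions implement per-group quantization, and the `sr_` variants use stochastic rounding, where a value rounds up with probability equal to its fractional part so the rounding error is zero in expectation. The sketch below illustrates the general technique in NumPy — a symmetric per-group fake-quantize under stated assumptions, not a line-for-line port of the CUDA kernels:

```python
import numpy as np


def quantize_dequantize(vals, num_bits, num_groups, stochastic=False, rng=None):
    """Symmetric per-group quantize-then-dequantize (illustrative only)."""
    rng = rng or np.random.default_rng()
    groups = np.asarray(vals, dtype=np.float32).reshape(num_groups, -1)
    q_max = (1 << (num_bits - 1)) - 1                 # e.g. 127 for 8 bits
    scale = np.abs(groups).max(axis=1, keepdims=True) / q_max
    scale[scale == 0] = 1.0                           # avoid divide-by-zero
    scaled = groups / scale
    if stochastic:
        # Round up with probability equal to the fractional part.
        floor = np.floor(scaled)
        q = floor + (rng.random(scaled.shape) < (scaled - floor))
    else:
        q = np.round(scaled)
    q = np.clip(q, -q_max - 1, q_max)
    return (q * scale).reshape(np.shape(vals))


x = np.random.randn(4, 256).astype(np.float32)
err = np.abs(quantize_dequantize(x, num_bits=8, num_groups=4) - x).max()
print(f"max abs error at 8 bits: {err:.4f}")
```

The `_asym` kernels differ by adding a per-group zero point to the scale, which helps when a group's values are not centered on zero.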
4 changes: 2 additions & 2 deletions deepspeed/inference/engine.py
@@ -102,7 +102,7 @@ def _create_model_parallel_group(self):
self.mp_group = InferenceEngine.inference_mp_group

def _check_quantize_setting(self, quantization_setting):
-        self.quatize_bits = 8
+        self.quantize_bits = 8
self.mlp_extra_grouping = False
self.quantize_groups = 1
if quantization_setting is None:
@@ -177,7 +177,7 @@ def _convert_to_dtype(self):
quantizer = WeightQuantization(mlp_extra_grouping=self.mlp_extra_grouping)
model, self.quantization_scales = quantizer.model_quantize(self.module,
self.injection_dict,
-                                                                       self.quatize_bits,
+                                                                       self.quantize_bits,
self.quantize_groups)
elif self.dtype == torch.half:
self.module.half()
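
The renamed `quantize_bits` attribute is set in a defaults-then-override pattern: `_check_quantize_setting` first establishes defaults (8 bits, one group, no extra MLP grouping, per the hunk above) and then overrides them when a `quantization_setting` is supplied. A hypothetical sketch of that pattern — only the default values are taken from the hunk; the accepted setting shapes are an assumption:

```python
def check_quantize_setting(quantization_setting):
    # Defaults mirroring the hunk above.
    quantize_bits = 8
    quantize_groups = 1
    mlp_extra_grouping = False
    if quantization_setting is not None:
        if isinstance(quantization_setting, tuple):
            # Assumed shape: (mlp_extra_grouping, quantize_groups).
            mlp_extra_grouping, quantize_groups = quantization_setting
        else:
            quantize_groups = quantization_setting
    return quantize_bits, quantize_groups, mlp_extra_grouping
```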
2 changes: 1 addition & 1 deletion deepspeed/launcher/launch.py
@@ -1,7 +1,7 @@
# Copyright 2020 The Microsoft DeepSpeed Team
"""
DeepSpeed launcher, this is similar to torch.distributed.launch but supports
-additional features such as abitrary gpu exclusion.
+additional features such as arbitrary gpu exclusion.
deepspeed.launcher.launch is intended to be run on a single worker node and
will spawn several worker sub-processes depending on how many devices/ranks
2 changes: 1 addition & 1 deletion deepspeed/launcher/runner.py
@@ -2,7 +2,7 @@
"""
DeepSpeed runner is the main front-end to launching multi-worker
training jobs with DeepSpeed. By default this uses pdsh to parallel
-ssh into multiple worker nodes and launch all the neccisary processes
+ssh into multiple worker nodes and launch all the necessary processes
per rank for training.
"""

(Diff truncated: 14 of the 70 changed files are shown above; the remaining files did not load.)