From be789b1665b8cafbd1b865534da501b3b287b17f Mon Sep 17 00:00:00 2001 From: Alex Hedges Date: Fri, 1 Oct 2021 22:56:32 -0400 Subject: [PATCH] Fix many typos (#1423) * Fix typos in docs/ * Fix typos in code comments and output strings * Fix typos in the code itself * Fix typos in tests/ Co-authored-by: Olatunji Ruwase --- csrc/aio/py_lib/deepspeed_py_aio_handle.cpp | 2 +- csrc/aio/py_test/ds_aio_basic.py | 4 +- csrc/aio/py_test/ds_aio_handle.py | 4 +- csrc/aio/py_test/run_read_sweep.sh | 4 +- csrc/aio/py_test/run_write_sweep.sh | 4 +- csrc/includes/custom_cuda_layers.h | 8 +-- csrc/includes/ds_transformer_cuda.h | 2 +- csrc/includes/softmax.h | 4 +- csrc/includes/type_shim.h | 2 +- csrc/quantization/pt_binding.cpp | 8 +-- csrc/quantization/quantizer.cu | 48 +++++++-------- deepspeed/inference/engine.py | 4 +- deepspeed/launcher/launch.py | 2 +- deepspeed/launcher/runner.py | 2 +- deepspeed/module_inject/replace_module.py | 8 +-- deepspeed/moe/sharded_moe.py | 4 +- deepspeed/ops/adam/cpu_adam.py | 4 +- .../bert_sparse_self_attention.py | 6 +- .../sparse_attention_utils.py | 10 ++-- .../sparse_attention/sparse_self_attention.py | 4 +- .../ops/sparse_attention/sparsity_config.py | 60 +++++++++---------- .../inference/transformer_inference.py | 2 +- deepspeed/ops/transformer/transformer.py | 2 +- deepspeed/profiling/flops_profiler/README.md | 8 +-- .../profiling/flops_profiler/profiler.py | 4 +- .../activation_checkpointing/checkpointing.py | 8 +-- deepspeed/runtime/config.py | 6 +- deepspeed/runtime/eigenvalue.py | 2 +- deepspeed/runtime/engine.py | 14 ++--- deepspeed/runtime/fp16/onebit/adam.py | 2 +- deepspeed/runtime/lr_schedules.py | 6 +- deepspeed/runtime/pipe/module.py | 2 +- deepspeed/runtime/pipe/p2p.py | 2 +- deepspeed/runtime/pipe/schedule.py | 4 +- deepspeed/runtime/pipe/topology.py | 2 +- deepspeed/runtime/quantize.py | 2 +- deepspeed/runtime/state_dict_factory.py | 4 +- .../runtime/swap_tensor/optimizer_utils.py | 4 +- .../partitioned_optimizer_swapper.py | 4 +- .../swap_tensor/partitioned_param_swapper.py | 2 +- deepspeed/runtime/zero/constants.py | 4 +- .../zero/contiguous_memory_allocator.py | 4 +- deepspeed/runtime/zero/linear.py | 6 +- .../runtime/zero/partition_parameters.py | 4 +- deepspeed/runtime/zero/stage2.py | 20 +++---- deepspeed/runtime/zero/stage3.py | 38 ++++++------ deepspeed/utils/groups.py | 4 +- deepspeed/utils/zero_to_fp32.py | 4 +- docs/_pages/config-json.md | 4 +- docs/_posts/2021-05-05-MoQ.md | 2 +- ...021-05-05-inference-kernel-optimization.md | 2 +- docs/_tutorials/MoQ-tutorial.md | 8 +-- docs/_tutorials/advanced-install.md | 2 +- docs/_tutorials/flops-profiler.md | 8 +-- docs/_tutorials/pytorch-profiler.md | 2 +- docs/_tutorials/sparse-attention.md | 4 +- docs/code-docs/source/initialize.rst | 4 +- op_builder/__init__.py | 2 +- op_builder/builder.py | 6 +- setup.py | 4 +- tests/benchmarks/flatten_bench.py | 2 +- tests/model/Megatron_GPT2/ds_gpt2_test.sh | 2 +- tests/unit/modeling.py | 10 ++-- tests/unit/modelingpreln.py | 10 ++-- tests/unit/test_activation_checkpointing.py | 2 +- tests/unit/test_autocast.py | 4 +- tests/unit/test_cpu_adam.py | 2 +- tests/unit/test_cuda_backward.py | 8 +-- tests/unit/test_dist.py | 2 +- tests/unit/test_flops_profiler.py | 6 +- 70 files changed, 227 insertions(+), 227 deletions(-) diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index 80f1a8785dcd..4635e751d6d8 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ 
-196,7 +196,7 @@ bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op, { const auto op_string = read_op ? "Read" : "Write"; if (num_bytes % get_thread_count()) { - std::cout << "deepseed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes + std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes << " not divisible by thread count = " << get_thread_count() << std::endl; return false; } diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py index 99a48d503c0c..e9f399d502a8 100755 --- a/csrc/aio/py_test/ds_aio_basic.py +++ b/csrc/aio/py_test/ds_aio_basic.py @@ -130,7 +130,7 @@ def _aio_handle_tasklet(pool_params): return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops -def _init_takslet(b): +def _init_tasklet(b): global aio_barrier aio_barrier = b @@ -138,7 +138,7 @@ def _init_takslet(b): def aio_basic_multiprocessing(args, read_op): b = Barrier(args.threads) pool_params = [(args, p, read_op) for p in range(args.threads)] - with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p: + with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p: pool_results = p.map(_aio_handle_tasklet, pool_params) report_results(args, read_op, pool_results) diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index c018ca417418..68abbe80261b 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -162,7 +162,7 @@ def _aio_handle_tasklet(pool_params): return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops -def _init_takslet(b): +def _init_tasklet(b): global aio_barrier aio_barrier = b @@ -170,7 +170,7 @@ def _init_takslet(b): def aio_handle_multiprocessing(args, read_op): b = Barrier(args.threads) pool_params = [(args, p, read_op) for p in range(args.threads)] - with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p: + with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p: pool_results = p.map(_aio_handle_tasklet, pool_params) report_results(args, read_op, pool_results) diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh index 2f814837c296..b9d7e050454a 100755 --- a/csrc/aio/py_test/run_read_sweep.sh +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -5,7 +5,7 @@ if [[ $# -ne 2 ]]; then fi -function validate_enviroment() +function validate_environment() { validate_cmd="python ./validate_async_io.py" eval ${validate_cmd} @@ -18,7 +18,7 @@ function validate_enviroment() } -validate_enviroment +validate_environment INPUT_FILE=$1 if [[ ! 
-f ${INPUT_FILE} ]]; then diff --git a/csrc/aio/py_test/run_write_sweep.sh b/csrc/aio/py_test/run_write_sweep.sh index 942c43efda2f..99f2113dda6f 100755 --- a/csrc/aio/py_test/run_write_sweep.sh +++ b/csrc/aio/py_test/run_write_sweep.sh @@ -9,7 +9,7 @@ function prep_folder() fi } -function validate_enviroment() +function validate_environment() { validate_cmd="python ./validate_async_io.py" eval ${validate_cmd} @@ -23,7 +23,7 @@ function validate_enviroment() -validate_enviroment +validate_environment if [[ $# -ne 3 ]]; then echo "Usage: $0 " diff --git a/csrc/includes/custom_cuda_layers.h b/csrc/includes/custom_cuda_layers.h index 263ceeaab9dd..17301dc20be8 100644 --- a/csrc/includes/custom_cuda_layers.h +++ b/csrc/includes/custom_cuda_layers.h @@ -27,25 +27,25 @@ #define MAX_REG 256 template -void launch_qunatize_kernel(T* vals, +void launch_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, cudaStream_t stream); template -void launch_sr_qunatize_kernel(T* vals, +void launch_sr_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, cudaStream_t stream); template -void launch_qunatize_kernel_asym(T* vals, +void launch_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, cudaStream_t stream); template -void launch_sr_qunatize_kernel_asym(T* vals, +void launch_sr_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, diff --git a/csrc/includes/ds_transformer_cuda.h b/csrc/includes/ds_transformer_cuda.h index cdd65b4a7da7..e5aaf8411f99 100755 --- a/csrc/includes/ds_transformer_cuda.h +++ b/csrc/includes/ds_transformer_cuda.h @@ -179,6 +179,6 @@ class BertTransformerLayer { bool _normalize_invertible; bool _gelu_checkpoint; - // High Performace flags + // High Performance flags bool _stochastic_mode; }; diff --git a/csrc/includes/softmax.h b/csrc/includes/softmax.h index 2bc2f67059cf..ca3cf8a8ee6a 100755 --- a/csrc/includes/softmax.h +++ b/csrc/includes/softmax.h @@ -17,14 +17,14 @@ class Softmax { size_t heads; size_t seq_length; size_t prob_depth; - float temprature; + float temperature; bool mem_alloc; Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false) : batchSize(batch), heads(h), seq_length(seq), prob_depth(prob_size), - temprature(1.0), + temperature(1.0), mem_alloc(mem_alloc) { } diff --git a/csrc/includes/type_shim.h b/csrc/includes/type_shim.h index 3c6b9c5ae98e..ba1e188f3e1c 100644 --- a/csrc/includes/type_shim.h +++ b/csrc/includes/type_shim.h @@ -1,7 +1,7 @@ /* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */ #include -// Forward/backward compatiblity hack around +// Forward/backward compatibility hack around // https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288 // pending more future-proof guidance from upstream. 
// struct TypeShim diff --git a/csrc/quantization/pt_binding.cpp b/csrc/quantization/pt_binding.cpp index f8cba122e71d..f76c4368a200 100644 --- a/csrc/quantization/pt_binding.cpp +++ b/csrc/quantization/pt_binding.cpp @@ -11,7 +11,7 @@ at::Tensor ds_quantize(at::Tensor& vals, int groups, int bits) for (auto dim : t_size) size *= dim; if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) { - launch_qunatize_kernel( + launch_quantize_kernel( (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream()); } return vals; @@ -25,7 +25,7 @@ at::Tensor ds_sr_quantize(at::Tensor& vals, int groups, int bits) for (auto dim : t_size) size *= dim; if (((size / groups) / 4 / 1024) <= 256) { - launch_sr_qunatize_kernel( + launch_sr_quantize_kernel( (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream()); } return vals; @@ -39,7 +39,7 @@ at::Tensor ds_quantize_asym(at::Tensor& vals, int groups, int bits) for (auto dim : t_size) size *= dim; if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) { - launch_qunatize_kernel_asym( + launch_quantize_kernel_asym( (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream()); } return vals; @@ -53,7 +53,7 @@ at::Tensor ds_sr_quantize_asym(at::Tensor& vals, int groups, int bits) for (auto dim : t_size) size *= dim; if (((size / groups) / 4 / 1024) <= 256) { - launch_sr_qunatize_kernel_asym( + launch_sr_quantize_kernel_asym( (T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream()); } return vals; diff --git a/csrc/quantization/quantizer.cu b/csrc/quantization/quantizer.cu index c48ae38969e3..f0573076e3d6 100644 --- a/csrc/quantization/quantizer.cu +++ b/csrc/quantization/quantizer.cu @@ -3,7 +3,7 @@ namespace cg = cooperative_groups; -__global__ void qunatize_kernel(__half* vals, int group_size, int num_bits) +__global__ void quantize_kernel(__half* vals, int group_size, int num_bits) { #if __CUDA_ARCH__ >= 700 @@ -93,7 +93,7 @@ __global__ void qunatize_kernel(__half* vals, int group_size, int num_bits) #endif } -__global__ void qunatize_kernel(float* vals, int group_size, int num_bits) +__global__ void quantize_kernel(float* vals, int group_size, int num_bits) { cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -176,7 +176,7 @@ __global__ void qunatize_kernel(float* vals, int group_size, int num_bits) } template -void launch_qunatize_kernel(T* vals, +void launch_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, @@ -185,22 +185,22 @@ void launch_qunatize_kernel(T* vals, dim3 grid_dim(group_num); dim3 block_dim(1024); - qunatize_kernel<<>>( + quantize_kernel<<>>( vals, (total_count / group_num) / 4, num_bits); } -template void launch_qunatize_kernel(float* vals, +template void launch_quantize_kernel(float* vals, int total_count, int group_num, int num_bits, cudaStream_t stream); -template void launch_qunatize_kernel(__half* vals, +template void launch_quantize_kernel(__half* vals, int total_count, int group_num, int num_bits, cudaStream_t stream); -__global__ void sr_qunatize_kernel(__half* vals, +__global__ void sr_quantize_kernel(__half* vals, int token_size, int token_num, int num_bits, @@ -336,7 +336,7 @@ __global__ void sr_qunatize_kernel(__half* vals, #endif } -__global__ void sr_qunatize_kernel(float* vals, +__global__ void sr_quantize_kernel(float* vals, int token_size, int token_num, int num_bits, @@ -456,7 +456,7 @@ __global__ void sr_qunatize_kernel(float* vals, } template -void launch_sr_qunatize_kernel(T* vals, +void 
launch_sr_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, @@ -468,21 +468,21 @@ void launch_sr_qunatize_kernel(T* vals, uint64_t inc = total_count / grid_dim.x / block_dim.x; std::pair seed = Context::Instance().IncrementOffset(inc); - sr_qunatize_kernel<<>>( + sr_quantize_kernel<<>>( vals, (total_count / group_num) / 4, group_num, num_bits, seed); } -template void launch_sr_qunatize_kernel(float* vals, +template void launch_sr_quantize_kernel(float* vals, int total_count, int group_num, int num_bits, cudaStream_t stream); -template void launch_sr_qunatize_kernel(__half* vals, +template void launch_sr_quantize_kernel(__half* vals, int total_count, int group_num, int num_bits, cudaStream_t stream); -__global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits) +__global__ void quantize_kernel_asym(__half* vals, int group_size, int num_bits) { #if __CUDA_ARCH__ >= 700 @@ -595,7 +595,7 @@ __global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits) #endif } -__global__ void qunatize_kernel_asym(float* vals, int group_size, int num_bits) +__global__ void quantize_kernel_asym(float* vals, int group_size, int num_bits) { cg::thread_block b = cg::this_thread_block(); cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); @@ -699,7 +699,7 @@ __global__ void qunatize_kernel_asym(float* vals, int group_size, int num_bits) } template -void launch_qunatize_kernel_asym(T* vals, +void launch_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, @@ -708,22 +708,22 @@ void launch_qunatize_kernel_asym(T* vals, dim3 grid_dim(group_num); dim3 block_dim(1024); - qunatize_kernel_asym<<>>( + quantize_kernel_asym<<>>( vals, (total_count / group_num) / 4, num_bits); } -template void launch_qunatize_kernel_asym(float* vals, +template void launch_quantize_kernel_asym(float* vals, int total_count, int group_num, int num_bits, cudaStream_t stream); -template void launch_qunatize_kernel_asym(__half* vals, +template void launch_quantize_kernel_asym(__half* vals, int total_count, int group_num, int num_bits, cudaStream_t stream); -__global__ void sr_qunatize_kernel_asym(__half* vals, +__global__ void sr_quantize_kernel_asym(__half* vals, int token_size, int token_num, int num_bits, @@ -879,7 +879,7 @@ __global__ void sr_qunatize_kernel_asym(__half* vals, #endif } -__global__ void sr_qunatize_kernel_asym(float* vals, +__global__ void sr_quantize_kernel_asym(float* vals, int token_size, int token_num, int num_bits, @@ -1010,7 +1010,7 @@ __global__ void sr_qunatize_kernel_asym(float* vals, } } template -void launch_sr_qunatize_kernel_asym(T* vals, +void launch_sr_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, @@ -1022,15 +1022,15 @@ void launch_sr_qunatize_kernel_asym(T* vals, uint64_t inc = total_count / grid_dim.x / block_dim.x; std::pair seed = Context::Instance().IncrementOffset(inc); - sr_qunatize_kernel<<>>( + sr_quantize_kernel<<>>( vals, (total_count / group_num) / 4, group_num, num_bits, seed); } -template void launch_sr_qunatize_kernel_asym(float* vals, +template void launch_sr_quantize_kernel_asym(float* vals, int total_count, int group_num, int num_bits, cudaStream_t stream); -template void launch_sr_qunatize_kernel_asym(__half* vals, +template void launch_sr_quantize_kernel_asym(__half* vals, int total_count, int group_num, int num_bits, diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 67b656219978..ac91eb859c6f 100644 --- a/deepspeed/inference/engine.py +++ 
b/deepspeed/inference/engine.py @@ -102,7 +102,7 @@ def _create_model_parallel_group(self): self.mp_group = InferenceEngine.inference_mp_group def _check_quantize_setting(self, quantization_setting): - self.quatize_bits = 8 + self.quantize_bits = 8 self.mlp_extra_grouping = False self.quantize_groups = 1 if quantization_setting is None: @@ -177,7 +177,7 @@ def _convert_to_dtype(self): quantizer = WeightQuantization(mlp_extra_grouping=self.mlp_extra_grouping) model, self.quantization_scales = quantizer.model_quantize(self.module, self.injection_dict, - self.quatize_bits, + self.quantize_bits, self.quantize_groups) elif self.dtype == torch.half: self.module.half() diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index 2ad98ea358a5..edcd8013493c 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -1,7 +1,7 @@ # Copyright 2020 The Microsoft DeepSpeed Team """ DeepSpeed launcher, this is similar to torch.distributed.launch but supports -additional features such as abitrary gpu exclusion. +additional features such as arbitrary gpu exclusion. deepspeed.launcher.launch is intended to be run on a single worker node and will spawn several worker sub-processes depending on how many devices/ranks diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index 0bb0b57e9e58..3d93363ffd3d 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -2,7 +2,7 @@ """ DeepSpeed runner is the main front-end to launching multi-worker training jobs with DeepSpeed. By default this uses pdsh to parallel -ssh into multiple worker nodes and launch all the neccisary processes +ssh into multiple worker nodes and launch all the necessary processes per rank for training. """ diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 09eb90a584f3..101b40e71cc0 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -19,7 +19,7 @@ def merge_assert(self, dim1, dim2): assert dim1 > dim2, \ 'Merging tensors is not allowed here! Please use deepspeed load_checkpoint\ for merging your checkpoints before replacing the transformer layer with\ - inference-kerenls' + inference-kernels' def qkv_copy(self, dst, src): if src is None: @@ -114,9 +114,9 @@ def replace_transformer_layer(orig_layer_impl, seed (int): random seed value max_seq_length (int): max sequence length for training hidden_size (int): hidden dimension - num_attention_heads (int): numebr of attention heads + num_attention_heads (int): number of attention heads mp_size (int): model_parallelism degree - mp_group : model_parallel gropu initialized on the modeling side + mp_group : model_parallel group initialized on the modeling side preln (bool): does the original layer implementation do pre or post layer norm? fp16 (bool): fp16 or fp32 local_rank (int): GPU rank (optional), @@ -401,7 +401,7 @@ def replace_module(model, orig_class, replace_fn, _replace_policy): if plcy._orig_layer_class is not None: policy.update({plcy._orig_layer_class: (replace_fn, plcy)}) assert len(policy.items()) > 0,\ - "No default policy found! Please specifiy your policy injection_policy (like {BertLayer:HFBEertLayerPolicy})." +\ + "No default policy found! Please specify your policy injection_policy (like {BertLayer:HFBEertLayerPolicy})." 
+\ "You can find some samples here: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py" replaced_module, _ = _replace_module(model, policy) diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py index 59028a455037..a3b6e53244d0 100644 --- a/deepspeed/moe/sharded_moe.py +++ b/deepspeed/moe/sharded_moe.py @@ -33,7 +33,7 @@ def multiplicative_jitter(x, device: torch.device, epsilon=1e-2): """ - Modified from swtich transformer paper. mesh transformers + Modified from switch transformer paper. mesh transformers Multiply values by a random number between 1-epsilon and 1+epsilon. Makes models more resilient to rounding errors introduced by bfloat16. This seems particularly important for logits. @@ -147,7 +147,7 @@ def top1gating(logits: torch.Tensor, mask1_rand = mask1 * uniform(mask1.shape) - assert logits.shape[0] >= min_capacity, "No. of tokens (batch-size) should be greater than min_capacity. Either set min_capacity to 0 or inrease your batch size." + assert logits.shape[0] >= min_capacity, "No. of tokens (batch-size) should be greater than min_capacity. Either set min_capacity to 0 or increase your batch size." _, top_idx = torch.topk(mask1_rand, k=capacity, dim=0) diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py index 770c9fb24784..5d6b597141c3 100755 --- a/deepspeed/ops/adam/cpu_adam.py +++ b/deepspeed/ops/adam/cpu_adam.py @@ -33,9 +33,9 @@ def __init__(self, In order to apply this optimizer, the model requires to have its master parameter (in FP32) reside on the CPU memory. - To train on a hetrogeneous system, such as coordinating CPU and GPU, DeepSpeed offers + To train on a heterogeneous system, such as coordinating CPU and GPU, DeepSpeed offers the ZeRO-Offload technology which efficiently offloads the optimizer states into CPU memory, - with minimal impact on training througput. DeepSpeedCPUAdam plays an important role to minimize + with minimal impact on training throughput. DeepSpeedCPUAdam plays an important role to minimize the overhead of the optimizer's latency on CPU. Please refer to ZeRO-Offload tutorial (https://www.deepspeed.ai/tutorials/zero-offload/) for more information on how to enable this technology. diff --git a/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py b/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py index 40dc697e11b1..7a0bd4f4c0eb 100755 --- a/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py +++ b/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py @@ -24,7 +24,7 @@ def __init__( Arguments: config: required: Bert model config - sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on FixedSparsityConfig class. + sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on FixedSparsityConfig class. 
""" super(BertSparseSelfAttention, self).__init__() @@ -53,11 +53,11 @@ def forward(self, hidden_states, attention_mask): """Applies forward phase of bert sparse self attention Arguments: - hidden_states: required: hidde_states tensor of the bert model + hidden_states: required: hidden_states tensor of the bert model attn_mask: required: a mask tensor of size (SequenceLength X SequenceLength); currently only 2D is supported Return: - context_layer: a dense tensor containing attnetion context + context_layer: a dense tensor containing attention context """ mixed_query_layer = self.query(hidden_states) mixed_key_layer = self.key(hidden_states) diff --git a/deepspeed/ops/sparse_attention/sparse_attention_utils.py b/deepspeed/ops/sparse_attention/sparse_attention_utils.py index 32999f62323f..700363b204af 100644 --- a/deepspeed/ops/sparse_attention/sparse_attention_utils.py +++ b/deepspeed/ops/sparse_attention/sparse_attention_utils.py @@ -94,10 +94,10 @@ def replace_model_self_attention_with_sparse_self_attention( Arguments: model: required: a transformer model max_position: required: an integer determining new position embedding size - sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class + sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on SparsityConfig class Return: - model: updated model; in which self attention layer has been repleaced with DeepSpeed Sparse Self Attention layer. + model: updated model; in which self attention layer has been replaced with DeepSpeed Sparse Self Attention layer. """ if hasattr(model, 'bert'): @@ -131,10 +131,10 @@ def replace_self_attention_layer_with_sparse_self_attention_layer( Arguments: config: required: transformer model config layers: required: transformer model attention layers - sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class + sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on SparsityConfig class Return: - layers: updated attention layers; in which self attention layers have been repleaced with DeepSpeed Sparse Self Attention layer. + layers: updated attention layers; in which self attention layers have been replaced with DeepSpeed Sparse Self Attention layer. """ for layer in layers: @@ -161,7 +161,7 @@ def pad_to_block_size(block_size, It needs to be called in your model, such as BertModel, right before you calculate the embedding outputs. Note) 1- instead of passing your embedding layer to this function, you can simply add this function to your model. It can be more simplified if given attention_mask and/or token_type_ids are none. - 2- you need to call unpdad function before returning your model output to unpad the encoder sequence output. + 2- you need to call unpad function before returning your model output to unpad the encoder sequence output. Arguments: block_size: required: an integer determining the block size of sparsity config. diff --git a/deepspeed/ops/sparse_attention/sparse_self_attention.py b/deepspeed/ops/sparse_attention/sparse_self_attention.py index 287e926d4608..3ca5bdd4c034 100644 --- a/deepspeed/ops/sparse_attention/sparse_self_attention.py +++ b/deepspeed/ops/sparse_attention/sparse_self_attention.py @@ -26,7 +26,7 @@ def __init__( max_seq_length=2048): """Initialize the sparse self attention layer. 
Arguments: - sparsity_config: optional: this parameter determins sparsity pattern configuration; it is based on SparsityConfig class. + sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on SparsityConfig class. key_padding_mask_mode: optional: a string determining if key padding mask needs to be added, `add`, or be multiplied, `mul`. attn_mask_mode: optional: a string determining if attention mask needs to be added, `add`, or be multiplied, `mul`. max_seq_length: optional: the maximum sequence length this sparse attention module will be applied to; it controls the size of the master_layout. @@ -121,7 +121,7 @@ def forward(self, attn_mask_mode: optional: a boolean determining if attn_mask needs to be added or multiplied Return: - attn_output: a dense tensor containing attnetion context + attn_output: a dense tensor containing attention context """ assert query.dtype == torch.half, "sparse attention only supports training in fp16 currently, please file a github issue if you need fp32 support" bsz, num_heads, tgt_len, head_dim = query.size() diff --git a/deepspeed/ops/sparse_attention/sparsity_config.py b/deepspeed/ops/sparse_attention/sparsity_config.py index 5d8e2fa5fa46..3b9224f136cb 100644 --- a/deepspeed/ops/sparse_attention/sparsity_config.py +++ b/deepspeed/ops/sparse_attention/sparsity_config.py @@ -49,7 +49,7 @@ def check_and_propagate_first_head_layout(self, layout): """If all heads require same sparsity layout, it propagate first head layout to all heads Arguments: - layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step Return: layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head @@ -152,11 +152,11 @@ def __init__(self, self.num_different_global_patterns = num_different_global_patterns def set_local_layout(self, h, layout): - """Sets local attantion layout used by the given head in the sparse attention. + """Sets local attention layout used by the given head in the sparse attention. Arguments: h: required: an integer determining head index - layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step Return: layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local layout is set @@ -173,14 +173,14 @@ def set_local_layout(self, h, layout): return layout def set_global_layout(self, h, layout): - """Sets global attantion layout used by the given head in the sparse attention. + """Sets global attention layout used by the given head in the sparse attention. Currently we set global blocks starting from the last block of a local window to the first one. That means if a local window consists of 4 blocks and global attention size is one block, we use block #4 in each local window as global. If we have different layout per head, then other heads will get #3, #2, and #1. And if we have more heads (and different layout has set) than num of global attentions, multiple head may have same global attentions. 
Note) if horizontal_global_attention is set, global blocks will be set both horizontally and vertically. Arguments: h: required: an integer determining head index - layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step Return: layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set @@ -307,12 +307,12 @@ def __init__(self, self.horizontal_global_attention = horizontal_global_attention def set_random_layout(self, h, layout): - """Sets random attantion layout used by the given head in the sparse attention. + """Sets random attention layout used by the given head in the sparse attention. Note) By default, it assumes there will be a unique random block layout for all heads; unless `different_layout_per_head` parameter is set in which each head can have a different random layout. Arguments: h: required: an integer determining head index - layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step Return: layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which random layout is set @@ -321,7 +321,7 @@ def set_random_layout(self, h, layout): num_blocks = layout.shape[1] if (num_blocks < self.num_random_blocks): raise ValueError( - f'Number of random blocks, {self.num_random_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!' + f'Number of random blocks, {self.num_random_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!' ) for row in range(0, num_blocks): rnd_cols = random.sample(range(0, num_blocks), self.num_random_blocks) @@ -329,10 +329,10 @@ def set_random_layout(self, h, layout): return layout def set_local_layout(self, h, layout): - """Sets local attantion layout used by the given head in the sparse attention. + """Sets local attention layout used by the given head in the sparse attention. Arguments: h: required: an integer determining head index - layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step Return: layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local layout is set @@ -362,11 +362,11 @@ def set_local_layout(self, h, layout): return layout def set_global_layout(self, h, layout): - """Sets global attantion layout used by the given head in the sparse attention. + """Sets global attention layout used by the given head in the sparse attention. 
Arguments: h: required: an integer determining head index - layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step Return: layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set @@ -375,7 +375,7 @@ def set_global_layout(self, h, layout): num_blocks = layout.shape[1] if (self.global_block_end_indices is None): for idx in self.global_block_indices: - # if global block idx is in the range of the sequnce blocks + # if global block idx is in the range of the sequence blocks if (idx < num_blocks): #global rows if (self.horizontal_global_attention): @@ -386,7 +386,7 @@ def set_global_layout(self, h, layout): layout[h, first_row:, idx] = 1 else: for _, (start_idx, end_idx) in enumerate(zip(self.global_block_indices, self.global_block_end_indices)): - # if global block idx is in the range of the sequnce blocks + # if global block idx is in the range of the sequence blocks if (start_idx < num_blocks): end_idx = min(end_idx, num_blocks) #global rows @@ -450,12 +450,12 @@ def __init__(self, self.num_global_blocks = num_global_blocks def set_random_layout(self, h, layout): - """Sets random attantion layout used by the given head in the sparse attention. + """Sets random attention layout used by the given head in the sparse attention. Note) By default, it assumes there will be a unique random block layout for all heads; unless `different_layout_per_head` parameter is set in which each head can have a different random layout. Arguments: h: required: an integer determining head index - layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step Return: layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which random layout is set @@ -464,7 +464,7 @@ def set_random_layout(self, h, layout): num_blocks = layout.shape[1] if (num_blocks < self.num_random_blocks): raise ValueError( - f'Number of random blocks, {self.num_random_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!' + f'Number of random blocks, {self.num_random_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!' ) for row in range(0, num_blocks): @@ -473,11 +473,11 @@ def set_random_layout(self, h, layout): return layout def set_sliding_window_layout(self, h, layout): - """Sets sliding local attantion layout used by the given head in the sparse attention. + """Sets sliding local attention layout used by the given head in the sparse attention. 
Arguments: h: required: an integer determining head index - layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step Return: layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local sliding window layout is set @@ -486,7 +486,7 @@ def set_sliding_window_layout(self, h, layout): num_blocks = layout.shape[1] if (num_blocks < self.num_sliding_window_blocks): raise ValueError( - f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!' + f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!' ) w = self.num_sliding_window_blocks // 2 @@ -497,11 +497,11 @@ def set_sliding_window_layout(self, h, layout): return layout def set_global_layout_itc(self, h, layout): - """Sets global attantion layout used by the given head in the sparse attention. + """Sets global attention layout used by the given head in the sparse attention. Arguments: h: required: an integer determining head index - layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step Return: layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set @@ -510,7 +510,7 @@ def set_global_layout_itc(self, h, layout): num_blocks = layout.shape[1] if (num_blocks < self.num_global_blocks): raise ValueError( - f'Number of global blocks, {self.num_global_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!' + f'Number of global blocks, {self.num_global_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!' ) #global rows @@ -588,11 +588,11 @@ def __init__(self, self.global_block_end_indices = global_block_end_indices def set_sliding_window_layout(self, h, layout): - """Sets sliding local attantion layout used by the given head in the sparse attention. + """Sets sliding local attention layout used by the given head in the sparse attention. Arguments: h: required: an integer determining head index - layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step Return: layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which local sliding window layout is set @@ -601,7 +601,7 @@ def set_sliding_window_layout(self, h, layout): num_blocks = layout.shape[1] if (num_blocks < self.num_sliding_window_blocks): raise ValueError( - f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overal number of blocks in a row, {num_blocks}!' + f'Number of sliding window blocks, {self.num_sliding_window_blocks}, must be smaller than overall number of blocks in a row, {num_blocks}!' 
) w = self.num_sliding_window_blocks // 2 @@ -612,11 +612,11 @@ def set_sliding_window_layout(self, h, layout): return layout def set_global_layout(self, h, layout): - """Sets global attantion layout used by the given head in the sparse attention. + """Sets global attention layout used by the given head in the sparse attention. Arguments: h: required: an integer determining head index - layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completly set at this step + layout: required: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head; may not be completely set at this step Return: layout: a tensor of dimension (num_heads, num_blocks, num_blocks) containing sparsity layout of all head in which global layout is set @@ -625,7 +625,7 @@ def set_global_layout(self, h, layout): num_blocks = layout.shape[1] if (self.global_block_end_indices is None): for idx in self.global_block_indices: - # if global block idx is in the range of the sequnce blocks + # if global block idx is in the range of the sequence blocks if (idx < num_blocks): #global rows layout[h, idx, :] = 1 @@ -634,7 +634,7 @@ def set_global_layout(self, h, layout): layout[h, :, idx] = 1 else: for _, (start_idx, end_idx) in enumerate(zip(self.global_block_indices, self.global_block_end_indices)): - # if global block idx is in the range of the sequnce blocks + # if global block idx is in the range of the sequence blocks if (start_idx < num_blocks): end_idx = min(end_idx, num_blocks) #global rows diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index fadd4ccb2c41..fbc3790e87d9 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -525,7 +525,7 @@ class DeepSpeedTransformerInference(nn.Module): config: An object of DeepSpeedInferenceConfig mp_group: Model parallelism group initialized on the modeling side. - quantize_scales: This arguement groups all the layers' scales used for quantization + quantize_scales: This argument groups all the layers' scales used for quantization quantize_groups: Number of groups used for quantizing the model merge_count: Shows the number of model-parallel checkpoints merged before running inference. We use this argument to control the quantization scale for the model parameters if a bigger diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index c0a125ce851c..b49ea6ef96d5 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -88,7 +88,7 @@ class DeepSpeedTransformerConfig(TransformerConfig): a high accuracy level. On the other hand, for the downstream tasks, such as fine-tuning, we recommend to turn it off in order to be able to reproduce the same result through the regular kernel execution. - huggingface: Enbale if using the HuggingFace interface style for sending out the forward results. + huggingface: Enable if using the HuggingFace interface style for sending out the forward results. training: Enable for training rather than inference. 
""" diff --git a/deepspeed/profiling/flops_profiler/README.md b/deepspeed/profiling/flops_profiler/README.md index 4c796595cd7e..e63b8a310a1c 100644 --- a/deepspeed/profiling/flops_profiler/README.md +++ b/deepspeed/profiling/flops_profiler/README.md @@ -135,14 +135,14 @@ The DeepSpeed Flops Profiler can be used with the DeepSpeed runtime without any ## Flops Measurement -Similar to exsiting flops calculation tools or methods, the DeepSpeed Flops Profiler measures the flops of the forward pass of a module and the flops of the backward pass is estimated as `2` times of that of the forward pass. -Different from the PyTorch profiler which calculates the flops of PyTorch operators, the DeepSpeed Flops Profiler measures the flops witin modules in a model and provides more insights to the users about the model execution. +Similar to existing flops calculation tools or methods, the DeepSpeed Flops Profiler measures the flops of the forward pass of a module and the flops of the backward pass is estimated as `2` times of that of the forward pass. +Different from the PyTorch profiler which calculates the flops of PyTorch operators, the DeepSpeed Flops Profiler measures the flops within modules in a model and provides more insights to the users about the model execution. The flops estimation is partly inspired by [ptflops](https://github.com/sovrasov/flops-counter.pytorch) with the major difference being that the DeepSpeed Flops Profiler not only supports flops computation directly at module level, but can also capture ```torch.nn.functional``` invoked in a module to estimate the flops. Thus the DeepSpeed Flops Profiler allows for customized modules in the model, e.g., ```ParallelTransformerLayerworks, ParallelSelfAttention, RowParallelLinear, etc.``` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM). This is in contrast to ptflops which requires users to write customized flops calculation functions for each customized module. ## Multi-GPU, Multi-node, Data Parallelism, and Model Parallelism -The DeepSpeed Flops Profiler outputs the per GPU profile as well as the world size, data parallel size, and model paralel size. 1 +The DeepSpeed Flops Profiler outputs the per GPU profile as well as the world size, data parallel size, and model parallel size. 1 For models running on multi-GPU or multi-node, only change of the model parallelism (e.g. ```--model-parallel-size``` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)) affects the number of flops and parameters profiled, i.e., `model_parallel_size * flops = total_flops` and `model_parallel_size * parameters = total_parameters`. The data parallel size or world size (related to the number of GPUs or nodes) does not affect the per GPU profile. @@ -372,7 +372,7 @@ with torch.cuda.device(0): #### In Model Training Workflow To profile model forward in a training workflow, use the `FlopsProfiler`class. 
-The `FlopsProfiler`class provides the follwing methods: +The `FlopsProfiler`class provides the following methods: * `start_profile()` - starts profiling * `get_total_flops(as_string=False)` - returns the total number of MACs in the model * `get_total_params(as_string=False)` - returns the total number of parameters in the model diff --git a/deepspeed/profiling/flops_profiler/profiler.py b/deepspeed/profiling/flops_profiler/profiler.py index 5df7a5a110af..ea27df38dc33 100644 --- a/deepspeed/profiling/flops_profiler/profiler.py +++ b/deepspeed/profiling/flops_profiler/profiler.py @@ -241,13 +241,13 @@ def print_model_profile(self, ) print(f'Profile Summary at step {profile_step}:') print( - "Notations:\ndata parallel size (dp_size), model paralel size(mp_size),\nnumber of parameters (params), number of multiply-accumulate operations(MACs),\number of floating point operations (flops), floating point operations per second (FLOPS),\nfwd latency (forward propagation latency), bwd latency (backward propagation latency),\nstep (weights update latency), iter latency (sum of fwd, bwd and step latency)\n" + "Notations:\ndata parallel size (dp_size), model parallel size(mp_size),\nnumber of parameters (params), number of multiply-accumulate operations(MACs),\number of floating point operations (flops), floating point operations per second (FLOPS),\nfwd latency (forward propagation latency), bwd latency (backward propagation latency),\nstep (weights update latency), iter latency (sum of fwd, bwd and step latency)\n" ) if self.ds_engine: print('{:<60} {:<8}'.format('world size: ', self.ds_engine.world_size)) print('{:<60} {:<8}'.format('data parallel size: ', self.ds_engine.dp_world_size)) - print('{:<60} {:<8}'.format('model paralel size: ', + print('{:<60} {:<8}'.format('model parallel size: ', self.ds_engine.mp_world_size)) print('{:<60} {:<8}'.format( 'batch size per GPU: ', diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index 9a20c87017d7..f48b79b20521 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -205,7 +205,7 @@ def model_parallel_cuda_manual_seed(seed): Two set of RNG states are tracked: default state: This is for data parallelism and is the same among a set of model parallel GPUs but different across - different model paralle groups. This is used for + different model parallel groups. This is used for example for dropout in the non-model-parallel regions. model-parallel state: This state is different among a set of model parallel GPUs, but the same across data parallel @@ -219,7 +219,7 @@ def model_parallel_cuda_manual_seed(seed): # 2718 is just for fun and any POSITIVE value will work. offset = seed + 2718 model_parallel_seed = offset + tp_rank - # Data parallel gets the original sedd. + # Data parallel gets the original seed. 
data_parallel_seed = seed if torch.distributed.get_rank() == 0: @@ -534,7 +534,7 @@ def save_args_for_backward(*all_args): global cuda_device, transport_stream, PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset if cuda_device is None: - see_memory_usage("First Forward Begining", force=False) + see_memory_usage("First Forward Beginning", force=False) if dist.get_rank() == 0: logger.info(f"Activation Checkpointing Information") logger.info( @@ -886,7 +886,7 @@ def configure( PROFILE_TIME = profile if CONTIGUOUS_CHECKPOINTING: - assert PARTITION_ACTIVATIONS, "Contiguous Checkpointing is only availble with partitioned activations. Set partitioned activations to true in deepspeed config" + assert PARTITION_ACTIVATIONS, "Contiguous Checkpointing is only available with partitioned activations. Set partitioned activations to true in deepspeed config" if CONTIGUOUS_CHECKPOINTING: assert num_layers is not None, "Must specify the number of layers with contiguous memory checkpointing" diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index efcfb4d317d2..933e2f02c2f7 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -46,7 +46,7 @@ # extra optimizer parameters for adam/adamw TORCH_ADAM_PARAM = "torch_adam" -# default to adamw logic for adam/adamw optimizers unless user explictly opts out +# default to adamw logic for adam/adamw optimizers unless user explicitly opts out ADAM_W_MODE = "adam_w_mode" ADAM_W_MODE_DEFAULT = True @@ -665,7 +665,7 @@ def get_dataloader_drop_last(param_dict): '''Write deepspeed config files by modifying basic templates. -Can be used for quicly changing parameters via command line parameters.''' +Can be used for quickly changing parameters via command line parameters.''' class DeepSpeedConfigWriter: @@ -738,7 +738,7 @@ def __init__(self, config: Union[str, dict], mpu=None): f"ds_config ({TRAIN_BATCH_SIZE}, {TRAIN_MICRO_BATCH_SIZE_PER_GPU}, and/or " \ f"{GRADIENT_ACCUMULATION_STEPS}). These parameters *will not be used* since " \ "elastic training is enabled, which takes control of these parameters. " \ - "If you want to supress this error (the parameters will be silently ignored) " \ + "If you want to suppress this error (the parameters will be silently ignored) " \ f"please set {IGNORE_NON_ELASTIC_BATCH_INFO}':true in your elasticity config.") # micro_bsz * world_size * gas = total_batch_size diff --git a/deepspeed/runtime/eigenvalue.py b/deepspeed/runtime/eigenvalue.py index 2675cd7b99c7..b1974d975f9f 100755 --- a/deepspeed/runtime/eigenvalue.py +++ b/deepspeed/runtime/eigenvalue.py @@ -30,7 +30,7 @@ def __init__(self, ranks=[0]) # Replace all nan/pos-inf/neg-inf to zero - # TODO: Pytorch new verion may add this function, replace this one by then. + # TODO: Pytorch new version may add this function, replace this one by then. def nan_to_num(self, x): device = x.device x = x.cpu().numpy() diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 52bbc5164dcb..ead38583fd75 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -658,7 +658,7 @@ def _configure_with_arguments(self, args, mpu): ompi_local_rank = os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK") local_rank = os.environ.get('LOCAL_RANK', ompi_local_rank) assert ompi_local_rank == local_rank, f"LOCAL_RANK ({local_rank}) != OMPI_COMM_WORLD_LOCAL_RANK ({ompi_local_rank}), " \ - "not sure how to proceed as we're seeing conficting local rank info." 
+ "not sure how to proceed as we're seeing conflicting local rank info." os.environ['LOCAL_RANK'] = local_rank self.local_rank = int(os.environ['LOCAL_RANK']) @@ -822,7 +822,7 @@ def _configure_distributed_model(self, model): if not self.amp_enabled(): self._broadcast_model() - #check if parmaeters are duplicated in optimizer param_groups + #check if parameters are duplicated in optimizer param_groups def _check_for_duplicates(self, optimizer): for name, param in self.module.named_parameters(): param_id = id(param) @@ -830,12 +830,12 @@ def _check_for_duplicates(self, optimizer): def ids_list(group): return [id(param) for param in group] - occurance = sum([ + occurrence = sum([ ids_list(group['params']).count(param_id) if param_id in ids_list(group['params']) else 0 for group in optimizer.param_groups ]) - assert occurance <= 1, f"Parameter with name: {name} occurs multiple times in optimizer.param_groups. Make sure it only appears once to prevent undefined behaviour." + assert occurrence <= 1, f"Parameter with name: {name} occurs multiple times in optimizer.param_groups. Make sure it only appears once to prevent undefined behaviour." # Configure optimizer def _configure_optimizer(self, client_optimizer, model_parameters): @@ -918,7 +918,7 @@ def _configure_basic_optimizer(self, model_parameters): torch_adam = optimizer_parameters.pop(TORCH_ADAM_PARAM, False) adam_w_mode = optimizer_parameters.pop(ADAM_W_MODE, ADAM_W_MODE_DEFAULT) - # Optimizer name of Adam forces AdamW logic unless adam_w_mode is explictly set + # Optimizer name of Adam forces AdamW logic unless adam_w_mode is explicitly set effective_adam_w_mode = self.optimizer_name( ) == ADAMW_OPTIMIZER or adam_w_mode @@ -1225,7 +1225,7 @@ def deepspeed_io(self, if route == ROUTE_TRAIN: deepspeed_io_timer = self.tput_timer - # If mpu is provied, forward world size and parallel rank to sampler. + # If mpu is provided, forward world size and parallel rank to sampler. data_parallel_world_size = None data_parallel_rank = None if self.mpu is not None: @@ -1506,7 +1506,7 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}): report_progress = self.global_rank == 0 if self.global_rank else True - # Check overlow here since in DS fp16 optimizer, the overflow is updated in above step() function. + # Check overflow here since in DS fp16 optimizer, the overflow is updated in above step() function. overflow = False if hasattr(self.optimizer, 'overflow'): overflow = self.optimizer.overflow diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index ebf5a1d168e1..ca0129989297 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -115,7 +115,7 @@ def step(self, closure=None, grads=None): grads (list of tensors, optional): weight gradient to use for the optimizer update. If gradients have type torch.half, parameters are expected to be in type torch.float. (default: None) - output params (list of tensors, optional): A reduced recision copy + output params (list of tensors, optional): A reduced precision copy of the updated weights written out in addition to the regular updated weights. Have to be of same type as gradients. 
(default: None) scale (float, optional): factor to divide gradient tensor values diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py index 7846da12fdbd..b1ce33ffc928 100755 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -361,7 +361,7 @@ def __init__(self, self.step_rate = lr_range_test_step_rate self.last_batch_iteration = last_batch_iteration self.staircase = lr_range_test_staircase - self.interval_fn = self._staircase_interval if lr_range_test_staircase else self._continous_interval + self.interval_fn = self._staircase_interval if lr_range_test_staircase else self._continuous_interval if last_batch_iteration == -1: self._update_optimizer(self.min_lr) @@ -369,7 +369,7 @@ def __init__(self, def _staircase_interval(self): return math.floor(float(self.last_batch_iteration + 1) / self.step_size) - def _continous_interval(self): + def _continuous_interval(self): return float(self.last_batch_iteration + 1) / self.step_size def _get_increase(self): @@ -514,7 +514,7 @@ def __init__(self, decay_mom_rate, last_batch_iteration) - # Initalize batch iteration tracker + # Initialize batch iteration tracker self.last_batch_iteration = last_batch_iteration # Configure cycle shape diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py index e65c39fcbe94..1c30abf6f715 100644 --- a/deepspeed/runtime/pipe/module.py +++ b/deepspeed/runtime/pipe/module.py @@ -175,7 +175,7 @@ def forward(self, inputs): topology = PipeDataParallelTopology(num_pp=num_stages, num_dp=dp) self._topo = topology - # Contruct communicators for pipeline topology + # Construct communicators for pipeline topology self._grid = PipelineParallelGrid(process_group=self.world_group, topology=self._topo) diff --git a/deepspeed/runtime/pipe/p2p.py b/deepspeed/runtime/pipe/p2p.py index e129d3d5b177..fa3654eaf8ee 100644 --- a/deepspeed/runtime/pipe/p2p.py +++ b/deepspeed/runtime/pipe/p2p.py @@ -177,7 +177,7 @@ def _get_send_recv_group(src_stage, dest_stage): stage_id = src_stage '''group_id corresponds to group of [group_id, group_id+1] unless group_id is the rank of the last stage - in which case group_id correspods to group[group_id-num_stages+1, group_id] + in which case group_id corresponds to group[group_id-num_stages+1, group_id] ''' group_id = _grid.stage_to_global(stage_id=stage_id) diff --git a/deepspeed/runtime/pipe/schedule.py b/deepspeed/runtime/pipe/schedule.py index 1f6ac1d8dce8..181d43e8062a 100644 --- a/deepspeed/runtime/pipe/schedule.py +++ b/deepspeed/runtime/pipe/schedule.py @@ -392,7 +392,7 @@ class ForwardPass(BufferOpInstruction): .. code-block:: python - buffers['ouputs'][buffer_id] = forward(buffers['inputs'][buffer_id]) + buffers['outputs'][buffer_id] = forward(buffers['inputs'][buffer_id]) """ pass @@ -404,7 +404,7 @@ class BackwardPass(BufferOpInstruction): .. code-block:: python - outputs = buffers['ouputs'][buffer_id] + outputs = buffers['outputs'][buffer_id] gradients = buffers['gradients'][buffer_id] torch.autograd.backward(tensors=outputs, grad_tensors=gradients) diff --git a/deepspeed/runtime/pipe/topology.py b/deepspeed/runtime/pipe/topology.py index 58a5f4a8bc3a..610d23b6faea 100644 --- a/deepspeed/runtime/pipe/topology.py +++ b/deepspeed/runtime/pipe/topology.py @@ -233,7 +233,7 @@ def _prime_factors(N): class PipeDataParallelTopology(ProcessTopology): - """ A topology specialiation for hybrid data and pipeline parallelism. + """ A topology specialization for hybrid data and pipeline parallelism. 
Uses data parallelism on the last dimension to encourage gradient reductions to use high-bandwidth intra-node links and lower-volume diff --git a/deepspeed/runtime/quantize.py b/deepspeed/runtime/quantize.py index f10c5860a800..a23d189aaab8 100755 --- a/deepspeed/runtime/quantize.py +++ b/deepspeed/runtime/quantize.py @@ -154,7 +154,7 @@ def compute_quantization(self, input, index=0, factor=1): f'Quantization settings: current bit-precision = {self.q_start_bits[index]}, step = {self.qsteps}, quantization period = {self.q_period[index]}, index = {index}' ) assert (self.q_start_bits[index] >= self.q_target_bits), \ - 'Quantization bit is lower thab target precision bits!' + 'Quantization bit is lower than target precision bits!' # quantize the weights base on the selected bits and the value-range if not self.use_quantizer_kernel: diff --git a/deepspeed/runtime/state_dict_factory.py b/deepspeed/runtime/state_dict_factory.py index aa9e5c12828c..e5c4dafb638b 100755 --- a/deepspeed/runtime/state_dict_factory.py +++ b/deepspeed/runtime/state_dict_factory.py @@ -63,7 +63,7 @@ def load(self, a. if no mp_size resizing occurs, for both training & inference, loading the mp_rank related checkpoint directly. b. if has mp_size resizing, only Megatron model inference is supported, - checkpoint file(s) will be merged/splitted according to mp_rank, mp_world_size and + checkpoint file(s) will be merged/split according to mp_rank, mp_world_size and checkpoint file list. 3. Non-PipeModule loading mp_rank_*.pt files, is_pipe_parallel=False @@ -433,7 +433,7 @@ def sanity_check(self, ckpt_file_name): sd = torch.load(ckpt_file_name, map_location=lambda storage, loc: storage) - # partail_key is a sub-string of one key in the sd + # partial_key is a sub-string of one key in the sd def check_key_exist(partial_key, sd): keys = sd.keys() found = False diff --git a/deepspeed/runtime/swap_tensor/optimizer_utils.py b/deepspeed/runtime/swap_tensor/optimizer_utils.py index 81b7a1ff9b1b..a08af96f2a12 100644 --- a/deepspeed/runtime/swap_tensor/optimizer_utils.py +++ b/deepspeed/runtime/swap_tensor/optimizer_utils.py @@ -461,12 +461,12 @@ def _retrieve_unswapped_grad_partitions(self, swap_info, dest_buffer): self._stop_timer(UNSWAPPED_READ_GRADIENTS) self._log_timers([UNSWAPPED_READ_GRADIENTS]) - # It shoud be safe to discard unswapped gradient partitions + # It should be safe to discard unswapped gradient partitions swap_info.release_unswapped_gradients() if SWAPPER_DEBUG_MODE: logger.info( - f'optimizer_retreive_unswapped_radients: param={swap_info.param_id} tensor_count={tensor_count} elem_count={num_elem_count}' + f'optimizer_retrieve_unswapped_gradients: param={swap_info.param_id} tensor_count={tensor_count} elem_count={num_elem_count}' ) def _get_state_tensors(self, parameter): diff --git a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py index fb321f3b3937..cbcb136aa337 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py @@ -124,7 +124,7 @@ def swap_out_optimizer_state(self, parameter, async_swap=False): return self._start_timer(SWAP_OUT_PARAM_TIMER) - pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths = self._seperate_pinned_tensors(swap_info) + pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths = self._separate_pinned_tensors(swap_info) swap_bytes = sum([ self._io_aligned_numel(t.numel()) * t.element_size() for t in 
swap_info.tensors @@ -203,7 +203,7 @@ def _swap_in_parameter(self, aio_handle, parameter, dest_buffers): if DEBUG_MODE and torch.distributed.get_rank() == 0: logger.info(f'optimizer_param_swap_in: {(swap_bytes/(1024**3)):5.2f} GB') - def _seperate_pinned_tensors(self, swap_info): + def _separate_pinned_tensors(self, swap_info): pinned_tensors = [] pinned_paths = [] diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py index 5fb4de602c99..1188ed68fbc4 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py @@ -125,7 +125,7 @@ def _configure_aio(self, ds_config): self.swap_out_params = [] - #Check if partiitoned param or numel in a tensor is swappable or not + #Check if partitioned param or numel in a tensor is swappable or not def swappable_tensor(self, param=None, numel=None): if param is not None: assert numel is None, "Both parma and numel cannot be provided" diff --git a/deepspeed/runtime/zero/constants.py b/deepspeed/runtime/zero/constants.py index 6c1f249540cb..e3b2dfc0c68f 100755 --- a/deepspeed/runtime/zero/constants.py +++ b/deepspeed/runtime/zero/constants.py @@ -106,7 +106,7 @@ ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE_DEFAULT = 50000000 #parameters smaller than the threshold are only communicated once after the -#parameters are updated and are persisted thoughout the trainging +#parameters are updated and are persisted throughout the training #avoid tons of latency bound communication ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD = 'stage3_param_persistence_threshold' ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT = 100000 @@ -125,7 +125,7 @@ ZERO_OPTIMIZATION_LEGACY_STAGE1 = "legacy_stage1" ZERO_OPTIMIZATION_LEGACY_STAGE1_DEFAULT = False -# Stage 2 - partition gradients in a round robin fashsion to load-balance reduction and offload copying +# Stage 2 - partition gradients in a round robin fashion to load-balance reduction and offload copying ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS = 'round_robin_gradients' ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS_DEFAULT = False diff --git a/deepspeed/runtime/zero/contiguous_memory_allocator.py b/deepspeed/runtime/zero/contiguous_memory_allocator.py index 686f376cfce7..cf1adebece86 100644 --- a/deepspeed/runtime/zero/contiguous_memory_allocator.py +++ b/deepspeed/runtime/zero/contiguous_memory_allocator.py @@ -10,7 +10,7 @@ class ContiguousMemoryAllocator(object): def __init__(self, size, dtype, device): self.buffer = torch.zeros(size, dtype=dtype, device=device) - #address to contiguous size avaialble + #address to contiguous size available self.contiguous_sizes = {} self.contiguous_sizes[0] = size @@ -65,7 +65,7 @@ def allocate_tensor(self, size): print_rank_0( f"Free before allocation {free_before}. Allocating {size}. Free after allocation {self.total_free}. 
Max allocated {self.max_allocated}" ) - assert self.total_free + size == free_before, "Allcation bookeeping error" + assert self.total_free + size == free_before, "Allocation bookkeeping error" return ret_tensor diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py index 45e94a10d86d..458f4ff392be 100644 --- a/deepspeed/runtime/zero/linear.py +++ b/deepspeed/runtime/zero/linear.py @@ -1,14 +1,14 @@ #Linear Module to use with ZeRO Stage 3 to allow for parameter memory release #after the module execution during forward #Instead of saving variables using save_for_backward, we save variable ids -#Allowing us to retrive the variable without creating pointer to it +#Allowing us to retrieve the variable without creating pointer to it #Which allows for underlying tensor to be garbage collected #When partitioned as needed by the Zero Stage 3 optimizer #TODO instead of patching Linear module, we could patch the ctx.save_for_backward #ctx.saved_tensors so that this approach works for all nn modules that are built upon #torch.nn.function. However the issue is that many modules uses C++ implementations -#which does not have pytroch implementation. Eg torch.addmm which acts as a funcitonal -#when implemeted outside of torch.autograd.Function +#which does not have pytorch implementation. Eg torch.addmm which acts as a functional +#when implemented outside of torch.autograd.Function import math diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 50acacc7e70b..2c02b96e79fa 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -379,7 +379,7 @@ def __init__(self, effect. Defaults to ``True``. dtype (``dtype``, optional): Can be used to change the data type of the parameters. Supported options are ``torch.half`` and ``torch.float``. Defaults to ``None`` - mpu (``object``, optional): A model parallelism unit object that implements get_{model,data}_parallel_{rank,group,wolrd_size}. + mpu (``object``, optional): A model parallelism unit object that implements get_{model,data}_parallel_{rank,group,world_size}. This context accelerates model initialization and enables models that are too large to allocate in their entirety in CPU memory. It has the @@ -1059,7 +1059,7 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False): if not accumulate: dest_tensor.copy_(src_tensor) - # if source and destinatoin are on same device, + # if source and destination are on same device, # add to the provided buffer elif src_tensor.device == dest_tensor.device: dest_tensor.add_(src_tensor) diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index 603cad2f993c..f502a6807d7f 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -195,7 +195,7 @@ def __init__(self, self.fp16_master_weights_and_gradients = fp16_master_weights_and_gradients if self.fp16_master_weights_and_gradients: - assert self.cpu_offload and type(self.optimizer) in [DeepSpeedCPUAdam], f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32. Currenty only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}. 
Either disable fp16_master_weights_and_gradients or enable ZeRO-2 Offload with DeepSpeedCPUAdam" + assert self.cpu_offload and type(self.optimizer) in [DeepSpeedCPUAdam], f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32. Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}. Either disable fp16_master_weights_and_gradients or enable ZeRO-2 Offload with DeepSpeedCPUAdam" if self.reduce_scatter: assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled" @@ -223,7 +223,7 @@ def __init__(self, # These are the parameters that will be updated by this process directly self.params_in_partition = [] - # Offset from the first paramter in the the self.params_in_partition + # Offset from the first parameter in the the self.params_in_partition # the parameter boundaries may not align with partition boundaries # so we need to keep track of the offset self.first_offset = [] @@ -909,7 +909,7 @@ def average_tensor(self, tensor): partition_id, offset = partition_ids_w_offsets[idx] # if dist.get_rank() == 0 and count < 100: - # print(f"Rank {dist.get_rank()} rank offet id {idx} calculated dp size {dist.get_world_size(group=process_group)} real dp size {dist.get_world_size(self.real_dp_process_group[i])} and dst: {partition_id}") + # print(f"Rank {dist.get_rank()} rank offset id {idx} calculated dp size {dist.get_world_size(group=process_group)} real dp size {dist.get_world_size(self.real_dp_process_group[i])} and dst: {partition_id}") # count += 1 # Calculate numel for grad slice depending on partition location @@ -937,7 +937,7 @@ def average_tensor(self, tensor): for i, (dst, bucket_offset, numel) in enumerate(rank_and_offsets): grad_slice = tensor.narrow(0, int(bucket_offset), int(numel)) # if dist.get_rank() == 0: - # print(f"Rank {dist.get_rank()} rank offet id {i} real dp size {dist.get_world_size(group=real_dp_process_group[i])} and dst: {dst}") + # print(f"Rank {dist.get_rank()} rank offset id {i} real dp size {dist.get_world_size(group=real_dp_process_group[i])} and dst: {dst}") # dist.barrier() #dist.barrier() dst_rank = _get_global_rank(real_dp_process_group[i], dst) @@ -1008,7 +1008,7 @@ def buffer_to_accumulate_to_in_cpu(): dest_offset, num_elements) - #accumulate gradients into param.grad or parts of it that belongs to this parittion + #accumulate gradients into param.grad or parts of it that belongs to this partition def accumulate_gradients(): if not self.fp16_master_weights_and_gradients: dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1), @@ -1167,7 +1167,7 @@ def copy_grads_in_partition(self, param): device=torch.cuda.current_device()) see_memory_usage(f"after copying {total_size} gradients into partition") - # The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer + # The allreduce buffer will be rewritten. 
Copy the gradients in partition to a new buffer new_grad_tensor = self.grads_in_partition.view(-1).narrow( 0, self.grads_in_partition_offset, @@ -1257,7 +1257,7 @@ def print_func(): self.sequential_execution(print_func, message) def get_grads_to_reduce(self, i, partition_id): - def get_reducable_portion(key): + def get_reducible_portion(key): grad = self.param_dict[key].grad total_elements = grad.numel() start = self.grad_start_offset[i][partition_id][key] @@ -1283,7 +1283,7 @@ def get_reducable_portion(key): grads_to_reduce = [] for key in self.is_grad_computed[i][partition_id]: - grad = get_reducable_portion(key) + grad = get_reducible_portion(key) grads_to_reduce.append(grad) return grads_to_reduce @@ -1500,7 +1500,7 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2): else: total_norm = 0.0 # if dist.get_rank() == 0: - # logger.info(f"Total Norm begining {total_norm}") + # logger.info(f"Total Norm beginning {total_norm}") for g, p in zip(gradients, params): # Pipeline parallelism may replicate parameters. Avoid multi-counting. if hasattr(p, 'ds_pipe_replicated') and p.ds_pipe_replicated: @@ -1655,7 +1655,7 @@ def step(self, closure=None): self.get_grad_norm_direct(self.averaged_gradients[i], self.params_in_partition[i])) - # free gradients for all the prameters that are not updated by this process + # free gradients for all the parameters that are not updated by this process self.free_grad_in_param_list(self.params_not_in_partition[i]) # create a flat gradients for parameters updated by this process diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 5b6fd276d39e..0f19c4902183 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -100,7 +100,7 @@ def _apply_to_tensors_only(module, functional, backward_function, outputs): return outputs -#for each tensor in outputs run the forward_funciton and register backward_function as hook +#for each tensor in outputs run the forward_function and register backward_function as hook def _apply_forward_and_backward_to_tensors_only(module, forward_function, backward_function, @@ -178,7 +178,7 @@ def __init__(self): # maps sub_module id to submodule objects self.id_to_sub_module_map = {} - # stores the total number of parmeters in each sub_module + # stores the total number of parameters in each sub_module self.id_to_sub_module_size_map = {} self.trace_completed = False @@ -362,7 +362,7 @@ def prefetch_next_sub_modules(self, sub_module, numel=5000000, nvme=False): for param in params_to_prefetch: param.ds_status = ZeroParamStatus.INFLIGHT - # keeping track of number of elements consumed by available parmaeters + # keeping track of number of elements consumed by available parameters self._increment_available_parameter_numel(param.ds_numel) if nvme: @@ -432,7 +432,7 @@ def fetch_sub_module(self, sub_module): ) partitioned_params.append(param) - # keeping track of number of elements consumed by available parmaeters + # keeping track of number of elements consumed by available parameters self._increment_available_parameter_numel(param.ds_numel) print_rank_0(f"Incrementing with parameter id {param.ds_id}") @@ -504,7 +504,7 @@ def release_and_reset_parameter(self, param): param.ds_active_sub_modules = 0 if param.ds_status == ZeroParamStatus.AVAILABLE: print_rank_0( - f"Releasing unpartitioned param {debug_param2name_id_numel(param)} active sub-modules {param.ds_active_sub_modules} and persisitence {param.ds_persist}" + f"Releasing unpartitioned param 
{debug_param2name_id_numel(param)} active sub-modules {param.ds_active_sub_modules} and persistence {param.ds_persist}" ) self._decrement_available_parameter_numel(param.ds_numel) param.partition() @@ -810,7 +810,7 @@ def __init__(self, see_memory_usage(f"After creating fp16 partitions: {num_fp16_subgroups}", force=False) - # Optimizer ensor swapping + # Optimizer tensor swapping if self.swap_optimizer: self._configure_tensor_swapping(offload_optimizer_config, aio_config) @@ -902,7 +902,7 @@ def __init__(self, # stores if a grad in a partition has been computed or not self.is_grad_computed = {} - # will store the averaged gradients required by this parititon + # will store the averaged gradients required by this partition self.averaged_gradients = {} #creates backward hooks for gradient partitioning @@ -1046,7 +1046,7 @@ def _move_to_flat_buffer(self, param_list, flat_buffer, avoid_copy=False): src.data = dest.data src.status = PartitionedParamStatus.AVAILABLE else: - assert src.status == PartitionedParamStatus.AVAILABLE, "Partitioned Parm must be avialable here" + assert src.status == PartitionedParamStatus.AVAILABLE, "Partitioned Param must be available here" if not avoid_copy: dest.data.copy_(src.data) src.data = dest.data @@ -1175,7 +1175,7 @@ def _create_fp16_partitions_with_defragmentation(self): flat_offset, total_elements) print_rank_0( - f"Creating a flat buffer for subgroup {i} requiring {total_elements} elements, and cumulative CPU elemets {flat_offset + total_elements}", + f"Creating a flat buffer for subgroup {i} requiring {total_elements} elements, and cumulative CPU elements {flat_offset + total_elements}", force=False) #these parameters reside in NVME and elif self.params_in_nvme_and_cpu: @@ -1410,7 +1410,7 @@ def _end_of_forward_hook(module, *args): self.module.register_forward_hook(_end_of_forward_hook) self.module.register_forward_pre_hook(_pre_forward_hook) - # Add top todule to stack trace + # Add top module to stack trace global FWD_MODULE_STACK FWD_MODULE_STACK.append(self.module) @@ -1793,7 +1793,7 @@ def independent_gradient_partition_epilogue(self): see_memory_usage(f"End ipg_epilogue", force=False) # resets all partition to no reduced - # sets remianing grads to the total number of grads in each partition + # sets remaining grads to the total number of grads in each partition # set is grad computed to false for all grads in partition def reset_partition_gradient_structures(self): total_partitions = dist.get_world_size(group=self.dp_process_group) @@ -2140,7 +2140,7 @@ def partition_previous_reduced_grads(self): # self.reduction_stream.synchronize() if self.gradient_accumulation_steps > 1: - # The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer + # The allreduce buffer will be rewritten. Copy the gradients in partition to a new buffer fp16_grad_tensor = self.grads_in_partition[i].narrow( 0, dest_offset, @@ -2173,7 +2173,7 @@ def partition_previous_reduced_grads(self): param, fp32_grad_tensor) else: - # The allreduce buffer will be rewritted.
Copy the gradients in partition to a new buffer fp16_grad_tensor = self.grads_in_partition[i].narrow( 0, dest_offset, @@ -2248,7 +2248,7 @@ def print_func(): self.sequential_execution(print_func, message) def get_grads_to_reduce(self, i, partition_id): - def get_reducable_portion(key): + def get_reducible_portion(key): grad = self.param_dict[key].grad total_elements = grad.numel() start = self.grad_start_offset[i][partition_id][key] @@ -2274,7 +2274,7 @@ def get_reducable_portion(key): grads_to_reduce = [] for key in self.is_grad_computed[i][partition_id]: - grad = get_reducable_portion(key) + grad = get_reducible_portion(key) grads_to_reduce.append(grad) return grads_to_reduce @@ -2475,7 +2475,7 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2): else: total_norm = 0.0 # if dist.get_rank() == 0: - # logger.info(f"Total Norm begining {total_norm}") + # logger.info(f"Total Norm beginning {total_norm}") for g, p in zip(gradients, params): if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): param_norm = g.data.double().norm(2) @@ -2819,7 +2819,7 @@ def step(self, closure=None): return def dump_pre_step_gradients(self, debug_fp32_grads): - # Dump gradient norms for debbuging + # Dump gradient norms for debugging for i, _ in enumerate(self.fp16_groups): print(f'Pre-Step Dump Norms for Group {i} FP16P, FP16G, FP32G, FP32GUC') for fp16_param, fp32_grad in zip(self.fp16_groups[i], debug_fp32_grads[i]): @@ -2831,7 +2831,7 @@ def dump_pre_step_gradients(self, debug_fp32_grads): print(f'Pre-Step Norms {i} {param_id} = {norm_list}') def dump_post_step_gradients(self): - # Dump gradient norms for debbuging + # Dump gradient norms for debugging for i, group in enumerate(self.fp16_groups): print( f'Post-Step Dump Norms for Group {i} FP16P, FP16DS, FP16FLAT, FP32FLAT') @@ -3137,7 +3137,7 @@ def _restore_from_fp16_weights(self): def refresh_fp32_params(self): self._restore_from_fp16_weights() - # Extract flattened partion for current rank from all partitions + # Extract flattened partition for current rank from all partitions def _get_flattened_partition(self, all_partition_states): partition_id = dist.get_rank(group=self.dp_process_group) alignment = dist.get_world_size(group=self.dp_process_group) diff --git a/deepspeed/utils/groups.py b/deepspeed/utils/groups.py index dcd774d76501..28a370edd714 100644 --- a/deepspeed/utils/groups.py +++ b/deepspeed/utils/groups.py @@ -100,7 +100,7 @@ def initialize(ep_size=1, mpu=None): Arguments: ep_size (int, optional): default=1, expert parallel size mpu (module, optional): default=None, model parallel unit (e.g., from Megatron) - that descibes model/data parallel ranks. + that describes model/data parallel ranks. """ if mpu is not None: @@ -121,7 +121,7 @@ def initialize_model_parallel(model_parallel_size_): Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we use 2 GPUs to parallelize the model. 
The present function will - create 4 model parallel groups and 2 data parallel grous as: + create 4 model parallel groups and 2 data parallel groups as: 4 model parallel groups: [g0, g1], [g2, g3], [g4, g5], [g6, g7] 2 data parallel groups: diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py index f6f4fadc62a3..357328b10ed9 100755 --- a/deepspeed/utils/zero_to_fp32.py +++ b/deepspeed/utils/zero_to_fp32.py @@ -269,7 +269,7 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): model.load_state_dict(state_dict) # submit to model hub or save the model to share with others - In this example the ``model`` will no longer be useable in the deepspeed context of the same + In this example the ``model`` will no longer be usable in the deepspeed context of the same application. i.e. you will need to re-initialize the deepspeed engine, since ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. @@ -332,7 +332,7 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) # submit to model hub or save the model to share with others - Note, that once this was run, the ``model`` will no longer be useable in the deepspeed context + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context of the same application. i.e. you will need to re-initialize the deepspeed engine, since ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index af2d2cc8b83e..3a2da27e3ebd 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -4,7 +4,7 @@ title: "DeepSpeed Configuration JSON" ### Batch Size Related Parameters -**Note:** **train_batch_size** must be equal to **train_micro_batch_size_per_gpu** * **gradient_accumulation** * number of GPUs. For simplicty, you can choose to only specify two of the three parameters, the last one will be inferred automatically by DeepSpeed. +**Note:** **train_batch_size** must be equal to **train_micro_batch_size_per_gpu** * **gradient_accumulation** * number of GPUs. For simplicity, you can choose to only specify two of the three parameters, the last one will be inferred automatically by DeepSpeed. {: .notice--warning} **train_batch_size**: [integer] @@ -662,7 +662,7 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s | Description | Default | | -------------------------------------------------------------------------------------------------------- | ------- | -| Total number of activation checkpoints used to allocate memory buffer for contiguous_memoty_optimization | `None` | +| Total number of activation checkpoints used to allocate memory buffer for contiguous_memory_optimization | `None` | **synchronize_checkpoint_boundary**: [boolean] diff --git a/docs/_posts/2021-05-05-MoQ.md b/docs/_posts/2021-05-05-MoQ.md index 264aa9f4123d..a0ea351ca38b 100644 --- a/docs/_posts/2021-05-05-MoQ.md +++ b/docs/_posts/2021-05-05-MoQ.md @@ -37,7 +37,7 @@ Figure 2 shows another mixed-precision quantization that sets target bits as 4, ![MoQ (mixed-precision)](/assets/images/bingbert-mixedbit.png){: .align-center} Figure 3: Mixed-precision quantization with MoQ for Bert SQuAD plus. -As another example, we use eigenvalue-based MoQ to quantize Bert-Large for SQuAD finetuning. Figure 3 shows the number of bits we get to at the end of finetuning on each layer. 
Here, we see slightly different percision spectrum compared to BertBase on GLUE tasks. As the figure shows, we can reducethe precision on the first few layers more aggressively than the middle ones. Also, the last few layers can tollerate very low precision similar to the beginning layers. This way of quantization finally results in 90.56 F1 Score which is prettey similar to the baseline. +As another example, we use eigenvalue-based MoQ to quantize Bert-Large for SQuAD finetuning. Figure 3 shows the number of bits we get to at the end of finetuning on each layer. Here, we see slightly different precision spectrum compared to BertBase on GLUE tasks. As the figure shows, we can reduce the precision on the first few layers more aggressively than the middle ones. Also, the last few layers can tolerate very low precision similar to the beginning layers. This way of quantization finally results in 90.56 F1 Score which is pretty similar to the baseline. ## Quantized Inference Kernels diff --git a/docs/_posts/2021-05-05-inference-kernel-optimization.md b/docs/_posts/2021-05-05-inference-kernel-optimization.md index 32078e3bcd98..18ab7c32186d 100644 --- a/docs/_posts/2021-05-05-inference-kernel-optimization.md +++ b/docs/_posts/2021-05-05-inference-kernel-optimization.md @@ -1,6 +1,6 @@ --- layout: single -title: "DeepSpeed Inference: Multi-GPU inference with customized inference kerenls and quantization support" +title: "DeepSpeed Inference: Multi-GPU inference with customized inference kernels and quantization support" excerpt: "" categories: news new_post: false diff --git a/docs/_tutorials/MoQ-tutorial.md b/docs/_tutorials/MoQ-tutorial.md index 00a7024ef4a4..726492c8ca9d 100644 --- a/docs/_tutorials/MoQ-tutorial.md +++ b/docs/_tutorials/MoQ-tutorial.md @@ -27,7 +27,7 @@ MoQ quantization schedule is defined by a number of parameters which allow users `quantize_groups`: Quantization groups, which shows the number of scales used to quantize a model, default is 1. -`quantize_bits`, The numer of bits to control the data-precision transition from a start-bit to thhe final target-bits (e.g. starting from 16-bit down to 8-bit). +`quantize_bits`, The numer of bits to control the data-precision transition from a start-bit to the final target-bits (e.g. starting from 16-bit down to 8-bit). `start_bits`: The start bits in quantization training. Default is set to 16. `target_bits`: The target bits in quantization training. Default is set to 16. @@ -63,7 +63,7 @@ MoQ quantization schedule is defined by a number of parameters which allow users ## How to Use MoQ for GLUE Training Tasks -Before fine-tunning the GLUE tasks using DeepSpeed MoQ, you need: +Before fine-tuning the GLUE tasks using DeepSpeed MoQ, you need: 1. Install DeepSpeed. 2. Checkout Huggingface transformers branch, install it with all required packages. @@ -206,6 +206,6 @@ As we see in the following table, MoQ consistently preserve accuracy across diff ### Tips -When using the MoQ, one needs to consider the number of samples and training iterations before setting the correct quatization period or offset to make sure that the quantization reaches the desired level of precision before training finishes. +When using the MoQ, one needs to consider the number of samples and training iterations before setting the correct quantization period or offset to make sure that the quantization reaches the desired level of precision before training finishes. 
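To make the relationship between the schedule and the training length concrete, here is a minimal sketch. The helper name, the `schedule_offset` argument, and the assumption that precision drops by one bit every `quantize_period` steps are ours for illustration and are not taken from the MoQ implementation; the point is only to sanity-check a schedule against the number of optimizer steps available.

```python
def steps_to_reach_target(start_bits: int,
                          target_bits: int,
                          quantize_period: int,
                          schedule_offset: int = 0) -> int:
    """Steps needed to walk precision down from start_bits to target_bits."""
    bit_reductions = max(start_bits - target_bits, 0)
    return schedule_offset + bit_reductions * quantize_period

# Example numbers for a small GLUE-style run (made up for illustration).
num_samples, batch_size, epochs = 8_000, 32, 3
total_steps = (num_samples // batch_size) * epochs                 # 750 optimizer steps
needed_steps = steps_to_reach_target(16, 8, quantize_period=400)   # 3200 steps

if needed_steps > total_steps:
    print(f"Target bits are never reached: need {needed_steps} steps, have {total_steps}.")
```

If the estimate exceeds the available steps, either lower `quantize_period`, narrow the gap between `start_bits` and `target_bits`, or train for more iterations.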
-Enabling eigenvalues for quantization dynamically adjust the quantization period on the different parts of the network. This has two positive impact: 1) the quantized network can potentially produce higher accuracy than quantizing each layer with same `quantize_period` ; 2) it automatically identifies a good quantization schedule for each layer based on its senitivity. +Enabling eigenvalues for quantization dynamically adjust the quantization period on the different parts of the network. This has two positive impact: 1) the quantized network can potentially produce higher accuracy than quantizing each layer with same `quantize_period` ; 2) it automatically identifies a good quantization schedule for each layer based on its sensitivity. diff --git a/docs/_tutorials/advanced-install.md b/docs/_tutorials/advanced-install.md index a1493a9e9e24..fcb5eb64f38f 100755 --- a/docs/_tutorials/advanced-install.md +++ b/docs/_tutorials/advanced-install.md @@ -26,7 +26,7 @@ ds_report ## Pre-install DeepSpeed Ops -**Note:** [PyTorch](https://pytorch.org/) must be installed _before_ pre-compiling any DeepSpeed c++/cuda ops. However, this is not required if using the default mode of JIT compilition of ops. +**Note:** [PyTorch](https://pytorch.org/) must be installed _before_ pre-compiling any DeepSpeed c++/cuda ops. However, this is not required if using the default mode of JIT compilation of ops. {: .notice--info} Sometimes we have found it useful to pre-install either some or all DeepSpeed diff --git a/docs/_tutorials/flops-profiler.md b/docs/_tutorials/flops-profiler.md index 8a05557c9216..fdd5a0a92e1a 100644 --- a/docs/_tutorials/flops-profiler.md +++ b/docs/_tutorials/flops-profiler.md @@ -139,14 +139,14 @@ The DeepSpeed Flops Profiler can be used with the DeepSpeed runtime without any ## Flops Measurement -Similar to exsiting flops calculation tools or methods, the DeepSpeed Flops Profiler measures the flops of the forward pass of a module and the flops of the backward pass is estimated as `2` times of that of the forward pass. -Different from the PyTorch profiler which calculates the flops of PyTorch operators, the DeepSpeed Flops Profiler measures the flops witin modules in a model and provides more insights to the users about the model execution. +Similar to existing flops calculation tools or methods, the DeepSpeed Flops Profiler measures the flops of the forward pass of a module and the flops of the backward pass is estimated as `2` times of that of the forward pass. +Different from the PyTorch profiler which calculates the flops of PyTorch operators, the DeepSpeed Flops Profiler measures the flops within modules in a model and provides more insights to the users about the model execution. The flops estimation is partly inspired by [ptflops](https://github.com/sovrasov/flops-counter.pytorch) with the major difference being that the DeepSpeed Flops Profiler not only supports flops computation directly at module level, but can also capture ```torch.nn.functional``` invoked in a module to estimate the flops. Thus the DeepSpeed Flops Profiler allows for customized modules in the model, e.g., ```ParallelTransformerLayerworks, ParallelSelfAttention, RowParallelLinear, etc.``` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM). This is in contrast to ptflops which requires users to write customized flops calculation functions for each customized module. 
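To see why capturing `torch.nn.functional` calls matters, consider a toy module that does its work through `F.linear` instead of an `nn.Linear` child; a counter that only inspects module types would attribute no flops to it. The module and the hand-counted MAC figure below are our own illustration, not part of the profiler:

```python
import torch
import torch.nn.functional as F

class TinyProjection(torch.nn.Module):
    """Does its work through F.linear rather than an nn.Linear child module."""
    def __init__(self, d_in: int, d_out: int):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(d_out, d_in))

    def forward(self, x):
        return F.linear(x, self.weight)   # the flops live in this functional call

batch, d_in, d_out = 8, 1024, 4096
x = torch.randn(batch, d_in)
y = TinyProjection(d_in, d_out)(x)

manual_macs = batch * d_in * d_out        # expected forward MACs of the F.linear call
print(y.shape, f"forward MACs ~ {manual_macs:,}; backward estimated at ~2x this")
```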
## Multi-GPU, Multi-node, Data Parallelism, and Model Parallelism -The DeepSpeed Flops Profiler outputs the per GPU profile as well as the world size, data parallel size, and model paralel size. 1 +The DeepSpeed Flops Profiler outputs the per GPU profile as well as the world size, data parallel size, and model parallel size. 1 For models running on multi-GPU or multi-node, only change of the model parallelism (e.g. ```--model-parallel-size``` in [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)) affects the number of flops and parameters profiled, i.e., `model_parallel_size * flops = total_flops` and `model_parallel_size * parameters = total_parameters`. The data parallel size or world size (related to the number of GPUs or nodes) does not affect the per GPU profile. @@ -376,7 +376,7 @@ with torch.cuda.device(0): #### In Model Training Workflow To profile model forward in a training workflow, use the `FlopsProfiler`class. -The `FlopsProfiler`class provides the follwing methods: +The `FlopsProfiler`class provides the following methods: * `start_profile()` - starts profiling * `get_total_flops(as_string=False)` - returns the total number of MACs in the model * `get_total_params(as_string=False)` - returns the total number of parameters in the model diff --git a/docs/_tutorials/pytorch-profiler.md b/docs/_tutorials/pytorch-profiler.md index 86530b686450..7389f9b1e61f 100644 --- a/docs/_tutorials/pytorch-profiler.md +++ b/docs/_tutorials/pytorch-profiler.md @@ -7,7 +7,7 @@ This tutorial describes how to use [PyTorch Profiler](https://pytorch.org/blog/i PyTorch Profiler is an open-source tool that enables accurate and efficient performance analysis and troubleshooting for large-scale deep learning models. The profiling results can be outputted as a `.json` trace file and viewed in Google Chrome's trace viewer (chrome://tracing). Microsoft Visual Studio Code's Python extension integrates TensorBoard into the code editor, including the support for the PyTorch Profiler. -For more detais, refer to [PYTORCH PROFILER](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html#pytorch-profiler). +For more details, refer to [PYTORCH PROFILER](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html#pytorch-profiler). ## Profile the model training loop diff --git a/docs/_tutorials/sparse-attention.md b/docs/_tutorials/sparse-attention.md index 184d3e621e2d..7f7230178617 100644 --- a/docs/_tutorials/sparse-attention.md +++ b/docs/_tutorials/sparse-attention.md @@ -148,7 +148,7 @@ Please refer to the Docstrings for details of how to use each module separately. ## How to config sparsity structures Following we describe supported sparsity structures, their parameter set and the flexibility of adding arbitrary sparsity pattern on the self-attention layer. You can update DeepSpeed config file using any of the supported sparsity structures and set the parameters accordingly. -* **SpasityConfig**: +* **SparsityConfig**: This module, is the parent class for all sparsity structures and contains the shared features of all sparsity structures. It takes the following parameters: * `num_heads`: an integer determining number of attention heads of the layer. * `block`: an integer determining the block size. Current implementation of sparse self-attention is based on blocked sparse matrices. In which this parameter defines size of such square blocks; `Block X Block`. 
@@ -164,7 +164,7 @@ This structure is based on [Generative Modeling with Sparse Transformers](https: ![Fixed sparsity structure](/assets/images/sa_fixed_sparsity_structure.png) -* **BSLongformer** (BSLongformerSparistyConfig): +* **BSLongformer** (BSLongformerSparsityConfig): This structure is an edited version of [Longformer: The Long-Document Transformer](https://arxiv.org/pdf/2004.05150.pdf), in which instead of single token-wise sparsity, we offer block of tokens sparsity. Parameters that define this patters are: * `num_sliding_window_blocks`: an integer determining the number of blocks in sliding local attention window. * `global_block_indices`: a list of integers determining which blocks are considered as global attention. Given indices, determine the blocks that all other token blocks attend to and they attend to all other token blocks. Notice that if `global_block_end_indices` parameter is set, this parameter is used as starting index of each global window. diff --git a/docs/code-docs/source/initialize.rst b/docs/code-docs/source/initialize.rst index 938045de8fc8..492c42fe9ef6 100644 --- a/docs/code-docs/source/initialize.rst +++ b/docs/code-docs/source/initialize.rst @@ -25,7 +25,7 @@ to add DeepSpeed's builtin arguments to your application's parser. Training Initialization ----------------------- -The entrypoint for all training with DeepSpeed is ``deepspeed.initialize()``. Will initialize distributed backend if it is not intialized already. +The entrypoint for all training with DeepSpeed is ``deepspeed.initialize()``. Will initialize distributed backend if it is not initialized already. Example usage: @@ -39,6 +39,6 @@ Example usage: Distributed Initialization ----------------------- -Optional distributed backend initializating separate from ``deepspeed.initialize()``. Useful in scenarios where the user wants to use torch distributed calls before calling ``deepspeed.initialize()``, such as when using model parallelism, pipeline parallelism, or certain data loader scenarios. +Optional distributed backend initialization separate from ``deepspeed.initialize()``. Useful in scenarios where the user wants to use torch distributed calls before calling ``deepspeed.initialize()``, such as when using model parallelism, pipeline parallelism, or certain data loader scenarios. .. autofunction:: deepspeed.init_distributed diff --git a/op_builder/__init__.py b/op_builder/__init__.py index f19ed916c332..0ad816f74ad5 100755 --- a/op_builder/__init__.py +++ b/op_builder/__init__.py @@ -9,7 +9,7 @@ from .stochastic_transformer import StochasticTransformerBuilder from .utils import UtilsBuilder from .async_io import AsyncIOBuilder -from .builder import get_default_compute_capatabilities +from .builder import get_default_compute_capabilities from .transformer_inference import InferenceBuilder from .quantizer import QuantizerBuilder diff --git a/op_builder/builder.py b/op_builder/builder.py index 78b52640d7a2..de8c9407d1e0 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -48,7 +48,7 @@ def installed_cuda_version(): return int(cuda_major), int(cuda_minor) -def get_default_compute_capatabilities(): +def get_default_compute_capabilities(): compute_caps = DEFAULT_COMPUTE_CAPABILITIES import torch.utils.cpp_extension if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version( @@ -172,7 +172,7 @@ def has_function(self, funcname, libraries, verbose=False): ''' Test for existence of a function within a tuple of libraries. 
- This is used as a smoke test to check whether a certain library is avaiable. + This is used as a smoke test to check whether a certain library is available. As a test, this creates a simple C program that calls the specified function, and then distutils is used to compile that program and link it with the specified libraries. Returns True if both the compile and link are successful, False otherwise. @@ -440,7 +440,7 @@ def compute_capability_args(self, cross_compile_archs=None): cross_compile_archs = cross_compile_archs_env.replace(' ', ';') else: if cross_compile_archs is None: - cross_compile_archs = get_default_compute_capatabilities() + cross_compile_archs = get_default_compute_capabilities() ccs = cross_compile_archs.split(';') args = [] diff --git a/setup.py b/setup.py index c63f12d5b503..58d58511c8b1 100755 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ print('[WARNING] Unable to import torch, pre-compiling ops will be disabled. ' \ 'Please visit https://pytorch.org/ to see how to properly install torch on your system.') -from op_builder import ALL_OPS, get_default_compute_capatabilities +from op_builder import ALL_OPS, get_default_compute_capabilities RED_START = '\033[31m' RED_END = '\033[0m' @@ -88,7 +88,7 @@ def fetch_requirements(path): "you can ignore this message. Adding compute capability for Pascal, Volta, and Turing " "(compute capabilities 6.0, 6.1, 6.2)") if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None: - os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capatabilities() + os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capabilities() ext_modules = [] diff --git a/tests/benchmarks/flatten_bench.py b/tests/benchmarks/flatten_bench.py index b3ed3c601492..a337a1b525f3 100755 --- a/tests/benchmarks/flatten_bench.py +++ b/tests/benchmarks/flatten_bench.py @@ -105,7 +105,7 @@ def timeme(): def line_profileme(): - print("--------------- line_profier -----------------") + print("--------------- line_profiler -----------------") print("py") profile(py)() gc.collect() diff --git a/tests/model/Megatron_GPT2/ds_gpt2_test.sh b/tests/model/Megatron_GPT2/ds_gpt2_test.sh index ac5d7e379023..1f60036b83c5 100755 --- a/tests/model/Megatron_GPT2/ds_gpt2_test.sh +++ b/tests/model/Megatron_GPT2/ds_gpt2_test.sh @@ -3,7 +3,7 @@ helpFunction() { echo "" - echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s stpes -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers -p [-d]" + echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s steps -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers -p [-d]" echo -e "\t-m model parallelism" echo -e "\t-g gpus per node" echo -e "\t-n node count" diff --git a/tests/unit/modeling.py b/tests/unit/modeling.py index 0a2173f0f8f2..8bf2d6dba9da 100755 --- a/tests/unit/modeling.py +++ b/tests/unit/modeling.py @@ -2,7 +2,7 @@ # https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/modeling.py # coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -295,7 +295,7 @@ def __init__(self, layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 
- hidden_dropout_prob: The dropout probabilitiy for all fully connected + hidden_dropout_prob: The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. @@ -800,7 +800,7 @@ def forward(self, sequence_output, pooled_output): class BertPreTrainedModel(nn.Module): """ An abstract class to handle weights initialization and - a simple interface for dowloading and loading pretrained models. + a simple interface for downloading and loading pretrained models. """ def __init__(self, config, *inputs, **kwargs): super(BertPreTrainedModel, self).__init__() @@ -856,7 +856,7 @@ def from_pretrained(cls, . `model.chkpt` a TensorFlow checkpoint from_tf: should we load the weights from a locally saved TensorFlow checkpoint cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) """ @@ -977,7 +977,7 @@ class BertModel(BertPreTrainedModel): `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. Outputs: Tuple of (encoded_layers, pooled_output) - `encoded_layers`: controled by `output_all_encoded_layers` argument: + `encoded_layers`: controlled by `output_all_encoded_layers` argument: - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], diff --git a/tests/unit/modelingpreln.py b/tests/unit/modelingpreln.py index 1053fa685f06..43f210ec9944 100755 --- a/tests/unit/modelingpreln.py +++ b/tests/unit/modelingpreln.py @@ -2,7 +2,7 @@ # https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/modeling.py # coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -295,7 +295,7 @@ def __init__(self, layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected + hidden_dropout_prob: The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. @@ -895,7 +895,7 @@ def forward(self, sequence_output, pooled_output): class BertPreTrainedModel(nn.Module): """ An abstract class to handle weights initialization and - a simple interface for dowloading and loading pretrained models. + a simple interface for downloading and loading pretrained models. """ def __init__(self, config, *inputs, **kwargs): super(BertPreTrainedModel, self).__init__() @@ -951,7 +951,7 @@ def from_pretrained(cls, . 
`model.chkpt` a TensorFlow checkpoint from_tf: should we load the weights from a locally saved TensorFlow checkpoint cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) """ @@ -1072,7 +1072,7 @@ class BertModel(BertPreTrainedModel): `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. Outputs: Tuple of (encoded_layers, pooled_output) - `encoded_layers`: controled by `output_all_encoded_layers` argument: + `encoded_layers`: controlled by `output_all_encoded_layers` argument: - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], diff --git a/tests/unit/test_activation_checkpointing.py b/tests/unit/test_activation_checkpointing.py index ae371f9968f2..73ee6a25df78 100644 --- a/tests/unit/test_activation_checkpointing.py +++ b/tests/unit/test_activation_checkpointing.py @@ -165,7 +165,7 @@ def test_ckpt_inputs1_outputs1(): _test_activation_checkpoint(module, inputs) -# both bool and float are important, as bool is not diffentiable +# both bool and float are important, as bool is not differentiable @pytest.mark.parametrize('mask', [ _mixed_mask(), diff --git a/tests/unit/test_autocast.py b/tests/unit/test_autocast.py index 61ed831dead4..004cd8533869 100644 --- a/tests/unit/test_autocast.py +++ b/tests/unit/test_autocast.py @@ -30,7 +30,7 @@ def test_missing_amp_autocast(tmpdir, half_op): @pytest.mark.parametrize('half_op', [False, True]) def test_disable_autocast_linear(tmpdir, half_op): if _skip_autocast_test(): - pytest.skip("amp autocast is not availalbe") + pytest.skip("amp autocast is not available") hidden_dim = 4 if half_op: @@ -56,7 +56,7 @@ def test_disable_autocast_linear(tmpdir, half_op): True)]) def test_autocast_linear(tmpdir, half_input, half_weight): if _skip_autocast_test(): - pytest.skip("amp autocast is not availalbe") + pytest.skip("amp autocast is not available") hidden_dim = 4 input = torch.randn(hidden_dim).cuda() diff --git a/tests/unit/test_cpu_adam.py b/tests/unit/test_cpu_adam.py index 2ae1f8201328..dd5527b01371 100755 --- a/tests/unit/test_cpu_adam.py +++ b/tests/unit/test_cpu_adam.py @@ -20,7 +20,7 @@ def check_equal(first, second, atol=1e-2, verbose=False): print("x = {}".format(x.flatten())) print("y = {}".format(y.flatten())) print('-' * 80) - np.testing.assert_allclose(x, y, err_msg="param-update dismatch!", atol=atol) + np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol) @pytest.mark.parametrize('model_size', [ diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index 62ccdbdc68d9..6db42e4fa1e7 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -61,16 +61,16 @@ def check_equal(first, second, atol=1e-2, verbose=False): for i in range(len(x.shape) - 1): countx *= x.shape[i + 1] avgx = np.sum(avgx) - tollerance = 1 + tolerance = 1 if avgx != float('inf') and avgx != 
-float('inf'): avgx = avgx / countx - tollerance = avgx * atol + tolerance = avgx * atol if verbose: - print("tollerance is ", tollerance) + print("tolerance is ", tolerance) print("x = {}".format(x.flatten())) print("y = {}".format(y.flatten())) print('-' * 80) - np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=tollerance) + np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=tolerance) def zero_grad(variables): diff --git a/tests/unit/test_dist.py b/tests/unit/test_dist.py index 04b97031b3e5..25a5fd22770f 100644 --- a/tests/unit/test_dist.py +++ b/tests/unit/test_dist.py @@ -13,7 +13,7 @@ def test_init(): assert dist.get_rank() < 3 -# Demonstration of pytest's paramaterization +# Demonstration of pytest's parameterization @pytest.mark.parametrize('number,color', [(1138, 'purple')]) def test_dist_args(number, color): """Outer test function with inputs from pytest.mark.parametrize(). Uses a distributed diff --git a/tests/unit/test_flops_profiler.py b/tests/unit/test_flops_profiler.py index 133610d04ffd..f4654f93fb07 100644 --- a/tests/unit/test_flops_profiler.py +++ b/tests/unit/test_flops_profiler.py @@ -6,7 +6,7 @@ from common import distributed_test -def test_flops_profiler_in_ds_trainning(tmpdir): +def test_flops_profiler_in_ds_training(tmpdir): config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -34,7 +34,7 @@ def test_flops_profiler_in_ds_trainning(tmpdir): model = SimpleModel(hidden_dim, empty_grad=False) @distributed_test(world_size=[1]) - def _test_flops_profiler_in_ds_trainning(args, model, hidden_dim): + def _test_flops_profiler_in_ds_training(args, model, hidden_dim): model, _, _, _ = deepspeed.initialize(args=args, model=model, model_parameters=model.parameters()) @@ -52,7 +52,7 @@ def _test_flops_profiler_in_ds_trainning(args, model, hidden_dim): assert model.flops_profiler.flops == 100 assert model.flops_profiler.params == 110 - _test_flops_profiler_in_ds_trainning(args, model, hidden_dim) + _test_flops_profiler_in_ds_training(args, model, hidden_dim) class LeNet5(torch.nn.Module):