Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix many typos #1423

Merged
6 commits merged on Oct 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
{
const auto op_string = read_op ? "Read" : "Write";
if (num_bytes % get_thread_count()) {
std::cout << "deepseed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
<< " not divisible by thread count = " << get_thread_count() << std::endl;
return false;
}
Expand Down
4 changes: 2 additions & 2 deletions csrc/aio/py_test/ds_aio_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,15 +130,15 @@ def _aio_handle_tasklet(pool_params):
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops


def _init_takslet(b):
def _init_tasklet(b):
global aio_barrier
aio_barrier = b


def aio_basic_multiprocessing(args, read_op):
b = Barrier(args.threads)
pool_params = [(args, p, read_op) for p in range(args.threads)]
with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p:
with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)

report_results(args, read_op, pool_results)
4 changes: 2 additions & 2 deletions csrc/aio/py_test/ds_aio_handle.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,15 +162,15 @@ def _aio_handle_tasklet(pool_params):
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops


def _init_takslet(b):
def _init_tasklet(b):
global aio_barrier
aio_barrier = b


def aio_handle_multiprocessing(args, read_op):
b = Barrier(args.threads)
pool_params = [(args, p, read_op) for p in range(args.threads)]
with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p:
with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)

report_results(args, read_op, pool_results)
4 changes: 2 additions & 2 deletions csrc/aio/py_test/run_read_sweep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ if [[ $# -ne 2 ]]; then
fi


function validate_enviroment()
function validate_environment()
{
validate_cmd="python ./validate_async_io.py"
eval ${validate_cmd}
Expand All @@ -18,7 +18,7 @@ function validate_enviroment()
}


validate_enviroment
validate_environment

INPUT_FILE=$1
if [[ ! -f ${INPUT_FILE} ]]; then
Expand Down
4 changes: 2 additions & 2 deletions csrc/aio/py_test/run_write_sweep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ function prep_folder()
fi
}

function validate_enviroment()
function validate_environment()
{
validate_cmd="python ./validate_async_io.py"
eval ${validate_cmd}
Expand All @@ -23,7 +23,7 @@ function validate_enviroment()



validate_enviroment
validate_environment

if [[ $# -ne 3 ]]; then
echo "Usage: $0 <write size in MB> <write dir ><output log dir>"
Expand Down
8 changes: 4 additions & 4 deletions csrc/includes/custom_cuda_layers.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,25 +27,25 @@
#define MAX_REG 256

template <typename T>
void launch_qunatize_kernel(T* vals,
void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_sr_qunatize_kernel(T* vals,
void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_qunatize_kernel_asym(T* vals,
void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template <typename T>
void launch_sr_qunatize_kernel_asym(T* vals,
void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
Expand Down
2 changes: 1 addition & 1 deletion csrc/includes/ds_transformer_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,6 @@ class BertTransformerLayer {
bool _normalize_invertible;
bool _gelu_checkpoint;

// High Performace flags
// High Performance flags
bool _stochastic_mode;
};
4 changes: 2 additions & 2 deletions csrc/includes/softmax.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ class Softmax {
size_t heads;
size_t seq_length;
size_t prob_depth;
float temprature;
float temperature;
bool mem_alloc;
Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false)
: batchSize(batch),
heads(h),
seq_length(seq),
prob_depth(prob_size),
temprature(1.0),
temperature(1.0),
mem_alloc(mem_alloc)
{
}
Expand Down
2 changes: 1 addition & 1 deletion csrc/includes/type_shim.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */
#include <ATen/ATen.h>

// Forward/backward compatiblity hack around
// Forward/backward compatibility hack around
// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
// pending more future-proof guidance from upstream.
// struct TypeShim
Expand Down
8 changes: 4 additions & 4 deletions csrc/quantization/pt_binding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ at::Tensor ds_quantize(at::Tensor& vals, int groups, int bits)
for (auto dim : t_size) size *= dim;

if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) {
launch_qunatize_kernel(
launch_quantize_kernel(
(T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
}
return vals;
Expand All @@ -25,7 +25,7 @@ at::Tensor ds_sr_quantize(at::Tensor& vals, int groups, int bits)
for (auto dim : t_size) size *= dim;

if (((size / groups) / 4 / 1024) <= 256) {
launch_sr_qunatize_kernel(
launch_sr_quantize_kernel(
(T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
}
return vals;
Expand All @@ -39,7 +39,7 @@ at::Tensor ds_quantize_asym(at::Tensor& vals, int groups, int bits)
for (auto dim : t_size) size *= dim;

if ((((size / groups) - 1) / 4096 + 1) <= MAX_REG) {
launch_qunatize_kernel_asym(
launch_quantize_kernel_asym(
(T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
}
return vals;
Expand All @@ -53,7 +53,7 @@ at::Tensor ds_sr_quantize_asym(at::Tensor& vals, int groups, int bits)
for (auto dim : t_size) size *= dim;

if (((size / groups) / 4 / 1024) <= 256) {
launch_sr_qunatize_kernel_asym(
launch_sr_quantize_kernel_asym(
(T*)vals.data_ptr(), size, groups, bits, at::cuda::getCurrentCUDAStream());
}
return vals;
Expand Down
48 changes: 24 additions & 24 deletions csrc/quantization/quantizer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

namespace cg = cooperative_groups;

__global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
__global__ void quantize_kernel(__half* vals, int group_size, int num_bits)
{
#if __CUDA_ARCH__ >= 700

Expand Down Expand Up @@ -93,7 +93,7 @@ __global__ void qunatize_kernel(__half* vals, int group_size, int num_bits)
#endif
}

__global__ void qunatize_kernel(float* vals, int group_size, int num_bits)
__global__ void quantize_kernel(float* vals, int group_size, int num_bits)
{
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
Expand Down Expand Up @@ -176,7 +176,7 @@ __global__ void qunatize_kernel(float* vals, int group_size, int num_bits)
}

template <typename T>
void launch_qunatize_kernel(T* vals,
void launch_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
Expand All @@ -185,22 +185,22 @@ void launch_qunatize_kernel(T* vals,
dim3 grid_dim(group_num);
dim3 block_dim(1024);

qunatize_kernel<<<grid_dim, block_dim, 0, stream>>>(
quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
vals, (total_count / group_num) / 4, num_bits);
}

template void launch_qunatize_kernel(float* vals,
template void launch_quantize_kernel(float* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template void launch_qunatize_kernel(__half* vals,
template void launch_quantize_kernel(__half* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);

__global__ void sr_qunatize_kernel(__half* vals,
__global__ void sr_quantize_kernel(__half* vals,
int token_size,
int token_num,
int num_bits,
Expand Down Expand Up @@ -336,7 +336,7 @@ __global__ void sr_qunatize_kernel(__half* vals,
#endif
}

__global__ void sr_qunatize_kernel(float* vals,
__global__ void sr_quantize_kernel(float* vals,
int token_size,
int token_num,
int num_bits,
Expand Down Expand Up @@ -456,7 +456,7 @@ __global__ void sr_qunatize_kernel(float* vals,
}

template <typename T>
void launch_sr_qunatize_kernel(T* vals,
void launch_sr_quantize_kernel(T* vals,
int total_count,
int group_num,
int num_bits,
Expand All @@ -468,21 +468,21 @@ void launch_sr_qunatize_kernel(T* vals,
uint64_t inc = total_count / grid_dim.x / block_dim.x;
std::pair<uint64_t, uint64_t> seed = Context::Instance().IncrementOffset(inc);

sr_qunatize_kernel<<<grid_dim, block_dim, 0, stream>>>(
sr_quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
vals, (total_count / group_num) / 4, group_num, num_bits, seed);
}
template void launch_sr_qunatize_kernel(float* vals,
template void launch_sr_quantize_kernel(float* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template void launch_sr_qunatize_kernel(__half* vals,
template void launch_sr_quantize_kernel(__half* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);

__global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
__global__ void quantize_kernel_asym(__half* vals, int group_size, int num_bits)
{
#if __CUDA_ARCH__ >= 700

Expand Down Expand Up @@ -595,7 +595,7 @@ __global__ void qunatize_kernel_asym(__half* vals, int group_size, int num_bits)
#endif
}

__global__ void qunatize_kernel_asym(float* vals, int group_size, int num_bits)
__global__ void quantize_kernel_asym(float* vals, int group_size, int num_bits)
{
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
Expand Down Expand Up @@ -699,7 +699,7 @@ __global__ void qunatize_kernel_asym(float* vals, int group_size, int num_bits)
}

template <typename T>
void launch_qunatize_kernel_asym(T* vals,
void launch_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
Expand All @@ -708,22 +708,22 @@ void launch_qunatize_kernel_asym(T* vals,
dim3 grid_dim(group_num);
dim3 block_dim(1024);

qunatize_kernel_asym<<<grid_dim, block_dim, 0, stream>>>(
quantize_kernel_asym<<<grid_dim, block_dim, 0, stream>>>(
vals, (total_count / group_num) / 4, num_bits);
}

template void launch_qunatize_kernel_asym(float* vals,
template void launch_quantize_kernel_asym(float* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template void launch_qunatize_kernel_asym(__half* vals,
template void launch_quantize_kernel_asym(__half* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);

__global__ void sr_qunatize_kernel_asym(__half* vals,
__global__ void sr_quantize_kernel_asym(__half* vals,
int token_size,
int token_num,
int num_bits,
Expand Down Expand Up @@ -879,7 +879,7 @@ __global__ void sr_qunatize_kernel_asym(__half* vals,
#endif
}

__global__ void sr_qunatize_kernel_asym(float* vals,
__global__ void sr_quantize_kernel_asym(float* vals,
int token_size,
int token_num,
int num_bits,
Expand Down Expand Up @@ -1010,7 +1010,7 @@ __global__ void sr_qunatize_kernel_asym(float* vals,
}
}
template <typename T>
void launch_sr_qunatize_kernel_asym(T* vals,
void launch_sr_quantize_kernel_asym(T* vals,
int total_count,
int group_num,
int num_bits,
Expand All @@ -1022,15 +1022,15 @@ void launch_sr_qunatize_kernel_asym(T* vals,
uint64_t inc = total_count / grid_dim.x / block_dim.x;
std::pair<uint64_t, uint64_t> seed = Context::Instance().IncrementOffset(inc);

sr_qunatize_kernel<<<grid_dim, block_dim, 0, stream>>>(
sr_quantize_kernel<<<grid_dim, block_dim, 0, stream>>>(
vals, (total_count / group_num) / 4, group_num, num_bits, seed);
}
template void launch_sr_qunatize_kernel_asym(float* vals,
template void launch_sr_quantize_kernel_asym(float* vals,
int total_count,
int group_num,
int num_bits,
cudaStream_t stream);
template void launch_sr_qunatize_kernel_asym(__half* vals,
template void launch_sr_quantize_kernel_asym(__half* vals,
int total_count,
int group_num,
int num_bits,
Expand Down
4 changes: 2 additions & 2 deletions deepspeed/inference/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def _create_model_parallel_group(self):
self.mp_group = InferenceEngine.inference_mp_group

def _check_quantize_setting(self, quantization_setting):
self.quatize_bits = 8
self.quantize_bits = 8
self.mlp_extra_grouping = False
self.quantize_groups = 1
if quantization_setting is None:
Expand Down Expand Up @@ -177,7 +177,7 @@ def _convert_to_dtype(self):
quantizer = WeightQuantization(mlp_extra_grouping=self.mlp_extra_grouping)
model, self.quantization_scales = quantizer.model_quantize(self.module,
self.injection_dict,
self.quatize_bits,
self.quantize_bits,
self.quantize_groups)
elif self.dtype == torch.half:
self.module.half()
Expand Down
2 changes: 1 addition & 1 deletion deepspeed/launcher/launch.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright 2020 The Microsoft DeepSpeed Team
"""
DeepSpeed launcher, this is similar to torch.distributed.launch but supports
additional features such as abitrary gpu exclusion.
additional features such as arbitrary gpu exclusion.

deepspeed.launcher.launch is intended to be run on a single worker node and
will spawn several worker sub-processes depending on how many devices/ranks
Expand Down
2 changes: 1 addition & 1 deletion deepspeed/launcher/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
DeepSpeed runner is the main front-end to launching multi-worker
training jobs with DeepSpeed. By default this uses pdsh to parallel
ssh into multiple worker nodes and launch all the neccisary processes
ssh into multiple worker nodes and launch all the necessary processes
per rank for training.
"""

Expand Down
Loading