clean up logging (#1190)
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
stas00 and jeffra authored Jun 28, 2021
1 parent bc019a5 commit a029239
Showing 7 changed files with 73 additions and 39 deletions.
45 changes: 20 additions & 25 deletions csrc/adam/cpu_adam.cpp
@@ -322,42 +322,37 @@ int create_adam_optimizer(int optimizer_id,
                          float betta2 = 0.999,
                          float eps = 1e-8,
                          float weight_decay = 0,
-                         bool adamw_mode = true)
+                         bool adamw_mode = true,
+                         bool should_log = false)
{
    auto opt =
        std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);

    s_optimizers[optimizer_id] = opt;

+    if (should_log) {
+        std::string avx_type = "";
#if defined(__AVX512__)
-    std::cout << "Adam Optimizer #" << optimizer_id
-              << " is created with AVX512 arithmetic capability." << std::endl;
-    printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
-           alpha,
-           betta1,
-           betta2,
-           weight_decay,
-           (int)adamw_mode);
+        avx_type = "AVX512";
#else
#if defined(__AVX256__)
-    std::cout << "Adam Optimizer #" << optimizer_id
-              << " is created with AVX2 arithmetic capability." << std::endl;
-    printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
-           alpha,
-           betta1,
-           betta2,
-           weight_decay,
-           (int)adamw_mode);
+        avx_type = "AVX2";
#else
-    std::cout << "Adam Optimizer #" << optimizer_id
-              << " is created with scalar arithmetic capability." << std::endl;
-    printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
-           alpha,
-           betta1,
-           betta2,
-           weight_decay,
-           (int)adamw_mode);
+        avx_type = "scalar";
#endif
#endif

+        printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
+               optimizer_id,
+               avx_type.c_str());
+        printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
+               alpha,
+               betta1,
+               betta2,
+               weight_decay,
+               (int)adamw_mode);
+    }

    return 0;
}

4 changes: 3 additions & 1 deletion deepspeed/ops/adam/cpu_adam.py
@@ -7,6 +7,7 @@
import time
from pathlib import Path
from ..op_builder import CPUAdamBuilder
+from deepspeed.utils.logging import should_log_le


class DeepSpeedCPUAdam(torch.optim.Optimizer):
@@ -83,7 +84,8 @@ def __init__(self,
                                     betas[1],
                                     eps,
                                     weight_decay,
-                                    adamw_mode)
+                                    adamw_mode,
+                                    should_log_le("info"))

    def __del__(self):
        # need to destroy the C++ object explicitly to avoid a memory leak when deepspeed.initialize
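Note on the cpu_adam.py change above: the wrapper now forwards should_log_le("info") into the C++ extension, so the capability banner printed by create_adam_optimizer follows the DeepSpeed logger level instead of always appearing. A minimal sketch of the effect, assuming the logger's default INFO level and an environment where the CPUAdam extension builds:

import logging

import torch

from deepspeed.ops.adam import DeepSpeedCPUAdam
from deepspeed.utils.logging import logger

model = torch.nn.Linear(8, 8)

# The default DeepSpeed logger level is INFO, so should_log_le("info") is True
# and the C++ banner/config lines are printed when the optimizer is built.
verbose_opt = DeepSpeedCPUAdam(model.parameters(), lr=1e-3)

# Raising the level above INFO makes should_log_le("info") False, which
# silences the banner for optimizers created afterwards.
logger.setLevel(logging.WARNING)
quiet_opt = DeepSpeedCPUAdam(model.parameters(), lr=1e-3)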
2 changes: 1 addition & 1 deletion deepspeed/runtime/engine.py
@@ -932,7 +932,7 @@ def _configure_zero_optimizer(self, optimizer):
                ignore_unused_parameters=self.zero_ignore_unused_parameters(),
                partition_grads=zero_stage == ZERO_OPTIMIZATION_GRADIENTS)
        elif zero_stage == ZERO_OPTIMIZATION_WEIGHTS:
-            print("Initializing ZeRO Stage 3") if dist.get_rank() == 0 else None
+            logger.info("Initializing ZeRO Stage 3") if dist.get_rank() == 0 else None
            from deepspeed.runtime.zero.stage3 import FP16_DeepSpeedZeroOptimizer_Stage3
            optimizer = FP16_DeepSpeedZeroOptimizer_Stage3(
                self.module,
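Note on the engine.py change above: switching from print to logger.info makes this rank-0 message respect the configured log level and formatting. The existing log_dist helper in deepspeed/utils/logging.py (the last file in this diff) expresses the same rank-gated pattern; a small sketch:

from deepspeed.utils.logging import log_dist

# Logs "[Rank 0] Initializing ZeRO Stage 3" at INFO level on rank 0 only;
# if torch.distributed is not initialized, it logs on the single process
# with the rank shown as -1.
log_dist("Initializing ZeRO Stage 3", ranks=[0])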
2 changes: 1 addition & 1 deletion deepspeed/runtime/zero/partition_parameters.py
@@ -266,7 +266,7 @@ def _init_subclass(cls, **kwargs):
        if self.mem_efficient_linear:
            print_rank_0(
                "nn.functional.linear has been overridden with a more memory efficient version. This will persist unless manually reset.",
-                force=True)
+                force=False)
            self.linear_bk = torch.nn.functional.linear
            torch.nn.functional.linear = LinearFunctionForZeroStage3.apply

20 changes: 10 additions & 10 deletions deepspeed/runtime/zero/stage3.py
@@ -627,7 +627,7 @@ def __init__(self,
                 elastic_checkpoint=False,
                 aio_config=None):

-        see_memory_usage("Stage 3 initialize beginning", force=True)
+        see_memory_usage("Stage 3 initialize beginning", force=False)

        if dist.get_rank() == 0:
            logger.info(f"Reduce bucket size {reduce_bucket_size}")
@@ -702,7 +702,7 @@ def __init__(self,
            self.max_params_in_cpu = offload_param_config[OFFLOAD_PARAM_MAX_IN_CPU]
            print_rank_0(
                f"FP16 params swapping is {self.params_in_nvme_and_cpu}, Max params in CPU is {self.max_params_in_cpu}",
-                force=True)
+                force=False)

        self.deepspeed_adam_offload = (self.offload_optimizer
                                       and type(init_optimizer) == DeepSpeedCPUAdam)
@@ -798,7 +798,7 @@ def __init__(self,
        self.sub_group_size = sub_group_size

        self.sub_group_to_group_id = {}
-        see_memory_usage("Before creating fp16 partitions", force=True)
+        see_memory_usage("Before creating fp16 partitions", force=False)
        self._create_fp16_partitions_with_defragmentation()
        num_fp16_subgroups = len(self.fp16_partitioned_groups_flat)
        see_memory_usage(f"After creating fp16 partitions: {num_fp16_subgroups}",
@@ -867,7 +867,7 @@ def __init__(self,
        ])
        print_rank_0(
            f'Largest partitioned param numel = {largest_partitioned_param_numel}',
-            force=True)
+            force=False)

        see_memory_usage(f"Before Set Grad positions", force=False)

@@ -922,7 +922,7 @@ def __init__(self,
        self.debug_fp16_grads = [{} for _ in self.fp16_groups]

        if dist.get_rank(group=self.dp_process_group) == 0:
-            see_memory_usage(f"After initializing ZeRO optimizer", force=True)
+            see_memory_usage(f"After initializing ZeRO optimizer", force=False)

    def _configure_tensor_swapping(self, offload_optimizer_config, aio_config):
        nvme_swap_folder = os.path.join(
@@ -1096,7 +1096,7 @@ def _create_fp16_partitions_with_defragmentation(self):
        for j, param_group in enumerate(self.optimizer.param_groups):

            sub_groups = self._create_fp16_sub_groups(param_group['params'])
-            print_rank_0(f'fp16 group {j} has {len(sub_groups)} subgroups', force=True)
+            print_rank_0(f'fp16 group {j} has {len(sub_groups)} subgroups', force=False)

            flat_offset = 0
            for sub_group in sub_groups:
@@ -1331,19 +1331,19 @@ def _create_fp32_partitions(self):
                nvme_gigabytes = nvme_memory_usage / GIGA_BYTES
                print_rank_0(
                    f'Swappable FP32 Partitions: count={num_swappable_partitions} size={nvme_gigabytes:5.2f} GB',
-                    force=True)
+                    force=False)
                if self.params_in_nvme_and_cpu:
                    print_rank_0(
                        f'Swap from NVMe Partitions: count = {num_swap_from_nvme_partitions}, size = {swap_from_nvme_memory_usage/GIGA_BYTES:5.2f}GB',
-                        force=True)
+                        force=False)
                    print_rank_0(
                        f'Swap from CPU Partitions: count = {num_swap_from_cpu_partitions}, size = {swap_from_cpu_memory_usage/GIGA_BYTES:5.2f}GB',
-                        force=True)
+                        force=False)

        cpu_memory_gigabytes = cpu_memory_usage / GIGA_BYTES
        print_rank_0(
            f'In-Memory FP32 Partitions: count={cpu_memory_sub_groups} size={cpu_memory_gigabytes:5.2f} GB',
-            force=True)
+            force=False)

        # Clear for on-the-fly population before the optimizer step
        for param_group in self.optimizer.param_groups:
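Note on the stage3.py changes above: every hunk flips force=True to force=False on see_memory_usage and print_rank_0, demoting these messages from always-on to opt-in. The sketch below is illustrative only and not the actual DeepSpeed helpers (their internals differ); VERBOSE and print_rank_0_sketch are invented names used to show the force-flag pattern being tuned:

import torch.distributed as dist

VERBOSE = False  # stand-in for whatever debug switch the real helpers consult


def print_rank_0_sketch(message, force=False):
    """Print only on rank 0, and only when verbose output is enabled or forced."""
    if dist.is_initialized() and dist.get_rank() != 0:
        return
    if force or VERBOSE:
        print(message)


# force=True  -> always emitted on rank 0 (the old behavior in this diff)
# force=False -> emitted only when the debug switch is on (the new behavior)
print_rank_0_sketch("Swappable FP32 Partitions: ...", force=False)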
2 changes: 1 addition & 1 deletion deepspeed/runtime/zero/utils.py
@@ -40,7 +40,7 @@ def _initialize_parameter_parallel_groups(parameter_parallel_size=None):

def is_zero_supported_optimizer(optimizer):
    if dist.get_rank() == 0:
-        print(
+        logger.info(
            f'Checking ZeRO support for optimizer={optimizer.__class__.__name__} type={type(optimizer)}'
        )
    return type(optimizer) in ZERO_SUPPORTED_OPTIMIZERS
37 changes: 37 additions & 0 deletions deepspeed/utils/logging.py
@@ -3,6 +3,14 @@

import torch.distributed as dist

+log_levels = {
+    "debug": logging.DEBUG,
+    "info": logging.INFO,
+    "warning": logging.WARNING,
+    "error": logging.ERROR,
+    "critical": logging.CRITICAL,
+}


class LoggerFactory:
    @staticmethod
@@ -58,3 +66,32 @@ def log_dist(message, ranks=None, level=logging.INFO):
    if should_log:
        final_message = "[Rank {}] {}".format(my_rank, message)
        logger.log(level, final_message)
+
+
+def get_current_level():
+    """
+    Return the logger's current effective log level
+    """
+    return logger.getEffectiveLevel()
+
+
+def should_log_le(max_log_level_str):
+    """
+    Args:
+        max_log_level_str: maximum log level as a string
+    Returns ``True`` if the current log level is less than or equal to the specified log level. Otherwise ``False``.
+    Example:
+        ``should_log_le("info")`` will return ``True`` if the current log level is either ``logging.INFO`` or ``logging.DEBUG``
+    """
+
+    if not isinstance(max_log_level_str, str):
+        raise ValueError(f"{max_log_level_str} is not a string")
+
+    max_log_level_str = max_log_level_str.lower()
+    if max_log_level_str not in log_levels:
+        raise ValueError(f"{max_log_level_str} is not one of the `logging` levels")
+
+    return get_current_level() <= log_levels[max_log_level_str]
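A short usage sketch for the new should_log_le helper, assuming the DeepSpeed logger's default INFO level:

import logging

from deepspeed.utils.logging import logger, should_log_le

logger.setLevel(logging.INFO)
print(should_log_le("info"))   # True:  INFO <= INFO
print(should_log_le("debug"))  # False: INFO is above DEBUG

# Typical use: skip work whose only purpose is to build a log message.
if should_log_le("debug"):
    logger.debug("expensive diagnostic that is only assembled when DEBUG is on")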
