Commit
More logging for debugging
zyzzhaoyuzhe committed Jan 11, 2024
1 parent 95a460d commit 287d769
Showing 5 changed files with 1,948 additions and 1,468 deletions.
46 changes: 28 additions & 18 deletions deepspeed/runtime/fp16/loss_scaler.py
@@ -12,19 +12,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#Taken and modified for DeepSpeed from:
+# Taken and modified for DeepSpeed from:
 # https://github.com/NVIDIA/Megatron-LM/blob/master/fp16/loss_scaler.py
-#Commit: 93ab4bea59dc5cbf97c079d313741866af4deac9
+# Commit: 93ab4bea59dc5cbf97c079d313741866af4deac9
 
-INITIAL_LOSS_SCALE = 'init_scale'
-SCALE_WINDOW = 'scale_window'
-DELAYED_SHIFT = 'delayed_shift'
-MIN_LOSS_SCALE = 'min_scale'
+from deepspeed.utils import logger
+
+INITIAL_LOSS_SCALE = "init_scale"
+SCALE_WINDOW = "scale_window"
+DELAYED_SHIFT = "delayed_shift"
+MIN_LOSS_SCALE = "min_scale"
 
 
 # item() is a recent addition, so this helps with backward compatibility.
 def to_python_float(t):
-    if hasattr(t, 'item'):
+    if hasattr(t, "item"):
         return t.item()
     return t[0]
 
@@ -33,6 +35,7 @@ class LossScalerBase:
     """LossScalarBase
     Base class for a loss scaler
     """
+
     def __init__(self, cur_scale):
         self.cur_scale = cur_scale
 
@@ -62,6 +65,7 @@ class LossScaler(LossScalerBase):
     Args:
         scale (float, optional, default=1.0): The loss scale.
     """
+
     def __init__(self, scale=1):
         super(LossScaler, self).__init__(scale)
 
@@ -99,14 +103,17 @@ class DynamicLossScaler(LossScalerBase):
         scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
         scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale.
     """
-    def __init__(self,
-                 init_scale=2**32,
-                 scale_factor=2.,
-                 scale_window=1000,
-                 min_scale=1,
-                 delayed_shift=1,
-                 consecutive_hysteresis=False,
-                 raise_error_at_min_scale=True):
+
+    def __init__(
+        self,
+        init_scale=2**32,
+        scale_factor=2.0,
+        scale_window=1000,
+        min_scale=1,
+        delayed_shift=1,
+        consecutive_hysteresis=False,
+        raise_error_at_min_scale=True,
+    ):
         super(DynamicLossScaler, self).__init__(init_scale)
         self.cur_iter = 0
         self.last_overflow_iter = -1
@@ -122,6 +129,7 @@ def __init__(self,
     def has_overflow_serial(self, params):
         for p in params:
             if p.grad is not None and self._has_inf_or_nan(p.grad.data):
+                logger.warning("Gradient overflow: ", p)
                 return True
         return False
 
@@ -142,7 +150,7 @@ def _has_inf_or_nan(x):
                 raise
             return True
         else:
-            if cpu_sum in [float('inf'), -float('inf')] or cpu_sum != cpu_sum:
+            if cpu_sum in [float("inf"), -float("inf")] or cpu_sum != cpu_sum:
                 return True
             return False
 
@@ -152,8 +160,10 @@ def update_scale(self, overflow):
             # self.cur_scale /= self.scale_factor
             if self.delayed_shift == 1 or self.cur_hysteresis == 1:
                 if (self.cur_scale == self.min_scale) and self.raise_error_at_min_scale:
-                    raise Exception("Current loss scale already at minimum - cannot decrease scale anymore. Exiting "
-                                    "run.")
+                    raise Exception(
+                        "Current loss scale already at minimum - cannot decrease scale anymore. Exiting "
+                        "run."
+                    )
                 self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale)
             else:
                 self.cur_hysteresis -= 1
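For context on what the diff above touches: DynamicLossScaler shrinks the loss scale by scale_factor whenever an overflow is detected and grows it again after scale_window consecutive clean iterations. Below is a minimal, self-contained sketch of that grow/shrink policy, toy code for illustration only; ToyDynamicScaler is an invented name, not the DeepSpeed API.

# Toy sketch of dynamic loss scaling (invented class, not DeepSpeed's).
class ToyDynamicScaler:
    def __init__(self, init_scale=2**32, scale_factor=2.0, scale_window=1000, min_scale=1):
        self.cur_scale = init_scale
        self.scale_factor = scale_factor
        self.scale_window = scale_window
        self.min_scale = min_scale
        self.cur_iter = 0
        self.last_overflow_iter = -1

    def update_scale(self, overflow):
        if overflow:
            # Overflow: shrink the scale (never below min_scale) and remember
            # where the overflow happened so the clean-step counter restarts.
            self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale)
            self.last_overflow_iter = self.cur_iter
        elif (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
            # scale_window consecutive clean iterations: grow the scale.
            self.cur_scale *= self.scale_factor
        self.cur_iter += 1


scaler = ToyDynamicScaler(init_scale=2**16, scale_window=2)
for overflow in [False, False, True, False, False]:
    scaler.update_scale(overflow)
    print(scaler.cur_iter, scaler.cur_scale)

Run on this short overflow sequence, the scale halves at the overflow step and doubles again after two clean steps, matching the scale_factor/scale_window behavior described in the docstring above.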
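The _has_inf_or_nan check in the diff leans on two float facts: summing a tensor propagates any inf or NaN into a single scalar, and NaN is the only value that compares unequal to itself. A standalone version of the same test, assuming PyTorch is available; has_inf_or_nan here is a hypothetical free function, not the class method:

import torch

def has_inf_or_nan(x: torch.Tensor) -> bool:
    # Summing collapses the whole tensor into one scalar, so a single
    # comparison detects any inf/NaN anywhere in the tensor.
    cpu_sum = float(x.float().sum())
    # cpu_sum != cpu_sum is true only for NaN.
    return cpu_sum in (float("inf"), float("-inf")) or cpu_sum != cpu_sum

print(has_inf_or_nan(torch.tensor([1.0, 2.0])))           # False
print(has_inf_or_nan(torch.tensor([1.0, float("nan")])))  # True
print(has_inf_or_nan(torch.tensor([1.0, float("inf")])))  # True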