Commit
More logging for debugging
zyzzhaoyuzhe committed Jan 11, 2024
1 parent 95a460d commit 287d769
Showing 5 changed files with 1,948 additions and 1,468 deletions.
46 changes: 28 additions & 18 deletions deepspeed/runtime/fp16/loss_scaler.py
@@ -12,19 +12,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#Taken and modified for DeepSpeed from:
+# Taken and modified for DeepSpeed from:
 # https://github.com/NVIDIA/Megatron-LM/blob/master/fp16/loss_scaler.py
-#Commit: 93ab4bea59dc5cbf97c079d313741866af4deac9
+# Commit: 93ab4bea59dc5cbf97c079d313741866af4deac9
 
-INITIAL_LOSS_SCALE = 'init_scale'
-SCALE_WINDOW = 'scale_window'
-DELAYED_SHIFT = 'delayed_shift'
-MIN_LOSS_SCALE = 'min_scale'
+from deepspeed.utils import logger
+
+INITIAL_LOSS_SCALE = "init_scale"
+SCALE_WINDOW = "scale_window"
+DELAYED_SHIFT = "delayed_shift"
+MIN_LOSS_SCALE = "min_scale"
 
 
 # item() is a recent addition, so this helps with backward compatibility.
 def to_python_float(t):
-    if hasattr(t, 'item'):
+    if hasattr(t, "item"):
         return t.item()
     return t[0]
 
@@ -33,6 +35,7 @@ class LossScalerBase:
     """LossScalarBase
     Base class for a loss scaler
     """
+
     def __init__(self, cur_scale):
         self.cur_scale = cur_scale
 
@@ -62,6 +65,7 @@ class LossScaler(LossScalerBase):
     Args:
         scale (float, optional, default=1.0): The loss scale.
     """
+
     def __init__(self, scale=1):
         super(LossScaler, self).__init__(scale)
 
@@ -99,14 +103,17 @@ class DynamicLossScaler(LossScalerBase):
         scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
         scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale.
     """
-    def __init__(self,
-                 init_scale=2**32,
-                 scale_factor=2.,
-                 scale_window=1000,
-                 min_scale=1,
-                 delayed_shift=1,
-                 consecutive_hysteresis=False,
-                 raise_error_at_min_scale=True):
+
+    def __init__(
+        self,
+        init_scale=2**32,
+        scale_factor=2.0,
+        scale_window=1000,
+        min_scale=1,
+        delayed_shift=1,
+        consecutive_hysteresis=False,
+        raise_error_at_min_scale=True,
+    ):
         super(DynamicLossScaler, self).__init__(init_scale)
         self.cur_iter = 0
         self.last_overflow_iter = -1
@@ -122,6 +129,7 @@ def __init__(self,
     def has_overflow_serial(self, params):
         for p in params:
             if p.grad is not None and self._has_inf_or_nan(p.grad.data):
+                logger.warning("Gradient overflow: ", p)
                 return True
         return False
 
@@ -142,7 +150,7 @@ def _has_inf_or_nan(x):
                 raise
             return True
         else:
-            if cpu_sum in [float('inf'), -float('inf')] or cpu_sum != cpu_sum:
+            if cpu_sum in [float("inf"), -float("inf")] or cpu_sum != cpu_sum:
                 return True
             return False
 
@@ -152,8 +160,10 @@ def update_scale(self, overflow):
             # self.cur_scale /= self.scale_factor
             if self.delayed_shift == 1 or self.cur_hysteresis == 1:
                 if (self.cur_scale == self.min_scale) and self.raise_error_at_min_scale:
-                    raise Exception("Current loss scale already at minimum - cannot decrease scale anymore. Exiting "
-                                    "run.")
+                    raise Exception(
+                        "Current loss scale already at minimum - cannot decrease scale anymore. Exiting "
+                        "run."
+                    )
                 self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale)
             else:
                 self.cur_hysteresis -= 1
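For context on what the diff above touches: DynamicLossScaler shrinks the loss scale by scale_factor whenever an overflow is detected and grows it again after scale_window consecutive clean iterations. Below is a minimal, self-contained sketch of that grow/shrink policy, toy code for illustration only; ToyDynamicScaler is an invented name, not the DeepSpeed API.

# Toy sketch of dynamic loss scaling (invented class, not DeepSpeed's).
class ToyDynamicScaler:
    def __init__(self, init_scale=2**32, scale_factor=2.0, scale_window=1000, min_scale=1):
        self.cur_scale = init_scale
        self.scale_factor = scale_factor
        self.scale_window = scale_window
        self.min_scale = min_scale
        self.cur_iter = 0
        self.last_overflow_iter = -1

    def update_scale(self, overflow):
        if overflow:
            # Overflow: shrink the scale (never below min_scale) and remember
            # where the overflow happened so the clean-step counter restarts.
            self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale)
            self.last_overflow_iter = self.cur_iter
        elif (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
            # scale_window consecutive clean iterations: grow the scale.
            self.cur_scale *= self.scale_factor
        self.cur_iter += 1


scaler = ToyDynamicScaler(init_scale=2**16, scale_window=2)
for overflow in [False, False, True, False, False]:
    scaler.update_scale(overflow)
    print(scaler.cur_iter, scaler.cur_scale)

Run on this short overflow sequence, the scale halves at the overflow step and doubles again after two clean steps, matching the scale_factor/scale_window behavior described in the docstring above.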
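The _has_inf_or_nan check in the diff leans on two float facts: summing a tensor propagates any inf or NaN into a single scalar, and NaN is the only value that compares unequal to itself. A standalone version of the same test, assuming PyTorch is available; has_inf_or_nan here is a hypothetical free function, not the class method:

import torch

def has_inf_or_nan(x: torch.Tensor) -> bool:
    # Summing collapses the whole tensor into one scalar, so a single
    # comparison detects any inf/NaN anywhere in the tensor.
    cpu_sum = float(x.float().sum())
    # cpu_sum != cpu_sum is true only for NaN.
    return cpu_sum in (float("inf"), float("-inf")) or cpu_sum != cpu_sum

print(has_inf_or_nan(torch.tensor([1.0, 2.0])))           # False
print(has_inf_or_nan(torch.tensor([1.0, float("nan")])))  # True
print(has_inf_or_nan(torch.tensor([1.0, float("inf")])))  # True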