
Commit 46df341

Add training argument to Model.compute_loss(). (#19840)

This allows models to perform different computations during training and evaluation. For instance, some expensive-to-compute metrics can be skipped during training and only computed during evaluation. Note that backwards compatibility with overrides that do not have the `training` argument is maintained.
1 parent 9ad3ca0 commit 46df341
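
Illustratively, the new argument lets a custom `compute_loss` override branch on `training`. The sketch below is not part of the commit; the model, the metric, and the choice of mean absolute error are made up for the example (assumes Keras >= 3.3):

```python
import keras
from keras import ops


class MyModel(keras.Model):
    """Illustrative model that skips an expensive metric during training."""

    def __init__(self):
        super().__init__()
        self.dense = keras.layers.Dense(2)
        # Hypothetical expensive metric, only updated during evaluation.
        self.eval_metric = keras.metrics.Mean(name="expensive_eval_metric")

    def call(self, x):
        return self.dense(x)

    def compute_loss(
        self, x=None, y=None, y_pred=None, sample_weight=None, training=True
    ):
        loss = super().compute_loss(x, y, y_pred, sample_weight, training)
        if not training:
            # Only pay for this computation in `evaluate()`, not in `fit()`.
            self.eval_metric.update_state(ops.mean(ops.abs(y - y_pred)))
        return loss
```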

File tree

6 files changed: +144, -21 lines

keras/src/backend/jax/trainer.py (1 addition, 0 deletions)

```diff
@@ -63,6 +63,7 @@ def compute_loss_and_updates(
             y=y,
             y_pred=y_pred,
             sample_weight=sample_weight,
+            training=training,
         )
         if losses:
             self._losses_override.clear()
```

keras/src/backend/numpy/trainer.py (2 additions, 2 deletions)

```diff
@@ -28,8 +28,8 @@ def test_step(self, data):
             y_pred = self(x, training=False)
         else:
             y_pred = self(x)
-        loss = self.compute_loss(
-            x=x, y=y, y_pred=y_pred, sample_weight=sample_weight
+        loss = self._compute_loss(
+            x=x, y=y, y_pred=y_pred, sample_weight=sample_weight, training=False
         )
         self._loss_tracker.update_state(
             loss, sample_weight=tree.flatten(x)[0].shape[0]
```

keras/src/backend/tensorflow/trainer.py (12 additions, 8 deletions)

```diff
@@ -51,8 +51,12 @@ def train_step(self, data):
             y_pred = self(x, training=True)
         else:
             y_pred = self(x)
-        loss = self.compute_loss(
-            x=x, y=y, y_pred=y_pred, sample_weight=sample_weight
+        loss = self._compute_loss(
+            x=x,
+            y=y,
+            y_pred=y_pred,
+            sample_weight=sample_weight,
+            training=True,
         )
         self._loss_tracker.update_state(
             loss, sample_weight=tf.shape(tree.flatten(x)[0])[0]
@@ -78,8 +82,8 @@ def test_step(self, data):
             y_pred = self(x, training=False)
         else:
             y_pred = self(x)
-        loss = self.compute_loss(
-            x=x, y=y, y_pred=y_pred, sample_weight=sample_weight
+        loss = self._compute_loss(
+            x=x, y=y, y_pred=y_pred, sample_weight=sample_weight, training=False
         )
         self._loss_tracker.update_state(
             loss, sample_weight=tf.shape(tree.flatten(x)[0])[0]
@@ -601,17 +605,17 @@ def compiled_loss(
         self, y, y_pred, sample_weight=None, regularization_losses=None
     ):
         warnings.warn(
-            "`model.compiled_loss()` is deprecated. "
-            "Instead, use `model.compute_loss(x, y, y_pred, sample_weight)`.",
+            "`model.compiled_loss()` is deprecated. Instead, use "
+            "`model.compute_loss(x, y, y_pred, sample_weight, training)`.",
         )
         return self.compute_loss(
             x=None, y=y, y_pred=y_pred, sample_weight=sample_weight
         )

     def loss(self, y, y_pred, sample_weight=None):
         warnings.warn(
-            "`model.loss` is deprecated. "
-            "Instead, use `model.compute_loss(x, y, y_pred, sample_weight)`.",
+            "`model.loss()` is deprecated. Instead, use "
+            "`model.compute_loss(x, y, y_pred, sample_weight, training)`.",
         )
         return self.compute_loss(
             x=None, y=y, y_pred=y_pred, sample_weight=sample_weight
```
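
For code migrating off the deprecated `compiled_loss()` and `loss()` methods, the updated warning text points at `compute_loss()`. A minimal sketch of that migration, assuming Keras 3.3+; the model and data below are placeholders:

```python
import numpy as np
import keras

model = keras.Sequential([keras.layers.Dense(1)])
model.compile(optimizer="sgd", loss="mse")

x = np.ones((4, 3))
y = np.zeros((4, 1))
y_pred = model(x)

# Before (deprecated, emits the warning above):
#     loss = model.compiled_loss(y, y_pred)
# After, per the new warning text:
loss = model.compute_loss(x=None, y=y, y_pred=y_pred, training=False)
```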

keras/src/backend/torch/trainer.py (4 additions, 4 deletions)

```diff
@@ -49,8 +49,8 @@ def train_step(self, data):
         # for the weights from the previous train step.
         self.zero_grad()

-        loss = self.compute_loss(
-            x=x, y=y, y_pred=y_pred, sample_weight=sample_weight
+        loss = self._compute_loss(
+            x=x, y=y, y_pred=y_pred, sample_weight=sample_weight, training=True
         )
         self._loss_tracker.update_state(
             loss, sample_weight=tree.flatten(x)[0].shape[0]
@@ -85,8 +85,8 @@ def test_step(self, data):
             y_pred = self(x, training=False)
         else:
             y_pred = self(x)
-        loss = self.compute_loss(
-            x=x, y=y, y_pred=y_pred, sample_weight=sample_weight
+        loss = self._compute_loss(
+            x=x, y=y, y_pred=y_pred, sample_weight=sample_weight, training=False
         )
         self._loss_tracker.update_state(
             loss, sample_weight=tree.flatten(x)[0].shape[0]
```

keras/src/trainers/trainer.py (34 additions, 3 deletions)

```diff
@@ -1,3 +1,4 @@
+import inspect
 import platform
 import warnings

@@ -25,6 +26,9 @@ def __init__(self):
         self.steps_per_execution = 1
         # Can be set by callbacks in on_train_begin
         self._initial_epoch = None
+        self._compute_loss_has_training_arg = (
+            "training" in inspect.signature(self.compute_loss).parameters
+        )

     @traceback_utils.filter_traceback
     @tracking.no_automatic_dependency_tracking
@@ -262,6 +266,7 @@ def compute_loss(
         y=None,
         y_pred=None,
         sample_weight=None,
+        training=True,
     ):
         """Compute the total loss, validate it, and return it.

@@ -276,7 +281,7 @@ def __init__(self, *args, **kwargs):
                 super().__init__(*args, **kwargs)
                 self.loss_tracker = metrics.Mean(name='loss')

-            def compute_loss(self, x, y, y_pred, sample_weight):
+            def compute_loss(self, x, y, y_pred, sample_weight, training=True):
                 loss = ops.means((y_pred - y) ** 2)
                 loss += ops.sum(self.losses)
                 self.loss_tracker.update_state(loss)
@@ -306,12 +311,15 @@ def metrics(self):
             y: Target data.
             y_pred: Predictions returned by the model (output of `model(x)`)
             sample_weight: Sample weights for weighting the loss function.
+            training: Whether we are training or evaluating the model.

         Returns:
             The total loss as a scalar tensor, or `None` if no loss results
             (which is the case when called by `Model.test_step`).
         """
-        del x  # The default implementation does not use `x`.
+        # The default implementation does not use `x` or `training`.
+        del x
+        del training
         losses = []
         if self._compile_loss is not None:
             loss = self._compile_loss(y, y_pred, sample_weight)
@@ -331,6 +339,27 @@ def metrics(self):
         total_loss = ops.sum(losses)
         return total_loss

+    def _compute_loss(
+        self,
+        x=None,
+        y=None,
+        y_pred=None,
+        sample_weight=None,
+        training=True,
+    ):
+        """Backwards compatibility wrapper for `compute_loss`.
+
+        This should be used instead of `compute_loss` within `train_step`
+        and `test_step` to support overrides of `compute_loss` that may
+        not have the `training` argument, as this argument was added in
+        Keras 3.3.
+        """
+        if self._compute_loss_has_training_arg:
+            return self.compute_loss(
+                x, y, y_pred, sample_weight, training=training
+            )
+        else:
+            return self.compute_loss(x, y, y_pred, sample_weight)
+
     def stateless_compute_loss(
         self,
         trainable_variables,
@@ -340,6 +369,7 @@ def stateless_compute_loss(
         y=None,
         y_pred=None,
         sample_weight=None,
+        training=True,
     ):
         var_mapping = list(zip(self.trainable_variables, trainable_variables))
         var_mapping.extend(
@@ -349,11 +379,12 @@
         with backend.StatelessScope(state_mapping=var_mapping) as scope:
             # Note that this is needed for the regularization loss, which needs
             # the latest value of train/non-trainable variables.
-            loss = self.compute_loss(
+            loss = self._compute_loss(
                 x,
                 y,
                 y_pred,
                 sample_weight=sample_weight,
+                training=training,
             )

         # Update non trainable vars (may have been updated in compute_loss)
```
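
The backwards-compatibility mechanism is the `inspect.signature` check registered in `__init__` and consulted by `_compute_loss`. Below is a standalone sketch of that dispatch pattern outside of Keras; the function names are invented for illustration:

```python
import inspect


def call_maybe_with_training(fn, *args, training=True):
    """Pass `training` to `fn` only if its signature accepts it.

    Sketch of the check behind `_compute_loss` above; `fn` stands in
    for a possibly overridden `compute_loss`.
    """
    if "training" in inspect.signature(fn).parameters:
        return fn(*args, training=training)
    return fn(*args)


def old_style(y, y_pred):  # no `training` argument (pre-3.3 override)
    return (y - y_pred) ** 2


def new_style(y, y_pred, training=True):  # accepts `training`
    return (y - y_pred) ** 2 if training else abs(y - y_pred)


assert call_maybe_with_training(old_style, 3.0, 1.0) == 4.0
assert call_maybe_with_training(new_style, 3.0, 1.0, training=False) == 2.0
```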

keras/src/trainers/trainer_test.py (91 additions, 4 deletions)

```diff
@@ -1387,6 +1387,7 @@ def on_predict_batch_end(self, *args, **kwargs):

     @pytest.mark.requires_trainable_backend
     def test_metric_update_in_compute_loss(self):
+        test_self = self

         class MyModel(keras.Model):
             def __init__(self):
@@ -1398,9 +1399,17 @@ def call(self, x):
                 return self.dense(x)

             def compute_loss(
-                self, x=None, y=None, y_pred=None, sample_weight=None
+                self,
+                x=None,
+                y=None,
+                y_pred=None,
+                sample_weight=None,
+                training=True,
             ):
-                loss = super().compute_loss(x, y, y_pred, sample_weight)
+                test_self.assertTrue(training)
+                loss = super().compute_loss(
+                    x, y, y_pred, sample_weight, training
+                )
                 self.custom_metric.update_state(loss * 4)
                 return loss

@@ -1415,6 +1424,7 @@ def compute_loss(

     @pytest.mark.requires_trainable_backend
     def test_fwd_pass_loss_presence_in_compute_loss(self):
+        test_self = self

         class MyModel(keras.Model):
             def __init__(self):
@@ -1426,9 +1436,17 @@ def call(self, x):
                 return self.dense(x)

             def compute_loss(
-                self, x=None, y=None, y_pred=None, sample_weight=None
+                self,
+                x=None,
+                y=None,
+                y_pred=None,
+                sample_weight=None,
+                training=True,
             ):
-                loss = super().compute_loss(x, y, y_pred, sample_weight)
+                test_self.assertTrue(training)
+                loss = super().compute_loss(
+                    x, y, y_pred, sample_weight, training
+                )
                 self.custom_metric.update_state(sum(self.losses))
                 return loss

@@ -1439,6 +1457,75 @@ def compute_loss(
         history = model.fit(x, y)
         self.assertGreater(history.history["custom"][0], 0.0)

+    @pytest.mark.requires_trainable_backend
+    def test_evaluate_with_custom_compute_loss(self):
+        test_self = self
+
+        class MyModel(keras.Model):
+            def __init__(self):
+                super().__init__()
+                self.custom_metric = keras.metrics.Mean(name="custom")
+                self.dense = keras.layers.Dense(2, activity_regularizer="l2")
+
+            def call(self, x):
+                return self.dense(x)
+
+            def compute_loss(
+                self,
+                x=None,
+                y=None,
+                y_pred=None,
+                sample_weight=None,
+                training=True,
+            ):
+                test_self.assertFalse(training)
+                loss = super().compute_loss(
+                    x, y, y_pred, sample_weight, training
+                )
+                self.custom_metric.update_state(loss * 4)
+                return loss
+
+        model = MyModel()
+        model.compile(optimizer="sgd", loss="mse")
+        x = np.ones((32, 4))
+        y = np.ones((32, 2)) * 2
+        logs = model.evaluate(x, y, return_dict=True)
+        self.assertAlmostEqual(logs["custom"], logs["loss"] * 4)
+
+    @pytest.mark.requires_trainable_backend
+    def test_compute_loss_no_training_backwards_compatibility(self):
+
+        class MyModel(keras.Model):
+            def __init__(self):
+                super().__init__()
+                self.custom_metric = keras.metrics.Mean(name="custom")
+                self.dense = keras.layers.Dense(2, activity_regularizer="l2")
+
+            def call(self, x):
+                return self.dense(x)
+
+            def compute_loss(
+                self,
+                x=None,
+                y=None,
+                y_pred=None,
+                sample_weight=None,
+            ):
+                loss = super().compute_loss(x, y, y_pred, sample_weight)
+                self.custom_metric.update_state(loss * 4)
+                return loss
+
+        model = MyModel()
+        model.compile(optimizer="sgd", loss="mse")
+        x = np.ones((32, 4))
+        y = np.ones((32, 2)) * 2
+        logs = model.evaluate(x, y, return_dict=True)
+        self.assertAlmostEqual(logs["custom"], logs["loss"] * 4)
+        history = model.fit(x, y)
+        self.assertAlmostEqual(
+            history.history["custom"][0], history.history["loss"][0] * 4
+        )
+
     @pytest.mark.requires_trainable_backend
     def test_loss_weights(self):
         epochs = 3
```
