
Commit 4945b77

csferng authored and tensorflow-copybara committed

Support PGD adversarial regularization in Keras and Estimator APIs.

PiperOrigin-RevId: 314612195

1 parent 704d52b · commit 4945b77

9 files changed: +153 −57 lines

neural_structured_learning/configs/configs.py

+18 −7

@@ -58,17 +58,19 @@ class AdvNeighborConfig(object):
       corresponding feature.
     clip_value_max: maximum value to clip the feature after perturbation. (See
       `clip_value_min` for the structure and shape limitations.)
-    iterations: number of iterations to run the attack for. Defaults to a single
-      step, used for the Fast Gradient Sign Method (FGSM) attack.
-    epsilon: Defines radius of the epsilon ball to project back to.
+    pgd_iterations: number of attack iterations for Projected Gradient Descent
+      (PGD) attack. Defaults to 1, which resembles the Fast Gradient Sign Method
+      (FGSM) attack.
+    pgd_epsilon: radius of the epsilon ball to project back to. Only used in
+      Projected Gradient Descent (PGD) attack.
   """
   feature_mask = attr.ib(default=None)
   adv_step_size = attr.ib(default=0.001)
   adv_grad_norm = attr.ib(converter=NormType, default='l2')
   clip_value_min = attr.ib(default=None)
   clip_value_max = attr.ib(default=None)
-  iterations = attr.ib(default=1)  # 1 is the FGSM attack.
-  epsilon = attr.ib(default=None)
+  pgd_iterations = attr.ib(default=1)  # 1 is the FGSM attack.
+  pgd_epsilon = attr.ib(default=None)


 @attr.s
@@ -91,7 +93,9 @@ def make_adv_reg_config(
     adv_step_size=attr.fields(AdvNeighborConfig).adv_step_size.default,
     adv_grad_norm=attr.fields(AdvNeighborConfig).adv_grad_norm.default,
     clip_value_min=attr.fields(AdvNeighborConfig).clip_value_min.default,
-    clip_value_max=attr.fields(AdvNeighborConfig).clip_value_max.default):
+    clip_value_max=attr.fields(AdvNeighborConfig).clip_value_max.default,
+    pgd_iterations=attr.fields(AdvNeighborConfig).pgd_iterations.default,
+    pgd_epsilon=attr.fields(AdvNeighborConfig).pgd_epsilon.default):
   """Creates an `nsl.configs.AdvRegConfig` object.

   Args:
@@ -115,6 +119,11 @@ def make_adv_reg_config(
       corresponding feature.
     clip_value_max: maximum value to clip the feature after perturbation. (See
       `clip_value_min` for the structure and shape limitations.)
+    pgd_iterations: number of attack iterations for Projected Gradient Descent
+      (PGD) attack. Defaults to 1, which resembles the Fast Gradient Sign Method
+      (FGSM) attack.
+    pgd_epsilon: radius of the epsilon ball to project back to. Only used in
+      Projected Gradient Descent (PGD) attack.

   Returns:
     An `nsl.configs.AdvRegConfig` object.
@@ -126,7 +135,9 @@ def make_adv_reg_config(
           adv_step_size=adv_step_size,
           adv_grad_norm=adv_grad_norm,
           clip_value_min=clip_value_min,
-          clip_value_max=clip_value_max))
+          clip_value_max=clip_value_max,
+          pgd_iterations=pgd_iterations,
+          pgd_epsilon=pgd_epsilon))


 class AdvTargetType(enum.Enum):
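
Note: a minimal sketch of the renamed config surface, using the `nsl` import alias from the NSL documentation; the numeric values here are illustrative, not defaults:

import neural_structured_learning as nsl

# PGD: 5 attack steps of size 0.05, projecting the accumulated
# perturbation back onto an L2 ball of radius 0.1 after each step.
pgd_config = nsl.configs.make_adv_reg_config(
    multiplier=0.2,
    adv_step_size=0.05,
    adv_grad_norm='l2',
    pgd_iterations=5,
    pgd_epsilon=0.1)

# The default pgd_iterations=1 keeps the FGSM behavior.
fgsm_config = nsl.configs.make_adv_reg_config(adv_step_size=0.05)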

neural_structured_learning/estimator/BUILD

+1 −0

@@ -52,6 +52,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":estimator",
+        # package absl/testing:parameterized
         "//neural_structured_learning/configs",
         # package numpy
         # package tensorflow

neural_structured_learning/estimator/adversarial_regularization.py

+13 −10

@@ -91,24 +91,27 @@ def adv_model_fn(features, labels, mode, params=None, config=None):
     # If no 'params' is passed, then it is possible for base_model_fn not to
     # accept a 'params' argument. See documentation for tf.estimator.Estimator
     # for additional context.
-    # pylint: disable=g-long-lambda
-    spec_fn = ((lambda features: base_model_fn(
-        features, labels, mode, params, config)) if params else (
-            lambda features: base_model_fn(features, labels, mode, config)))
+    base_args = [mode, params, config] if params else [mode, config]
+    spec_fn = lambda feature, label: base_model_fn(feature, label, *base_args)

-    original_spec = spec_fn(features)
-
-    print("ORIGINAL_SPEC", original_spec)
+    original_spec = spec_fn(features, labels)

     # Adversarial regularization only happens in training.
     if mode != tf.estimator.ModeKeys.TRAIN:
       return original_spec

-    adv_neighbor, _ = nsl_lib.gen_adv_neighbor(features, original_spec.loss,
-                                               adv_config.adv_neighbor_config)
+    adv_neighbor, _ = nsl_lib.gen_adv_neighbor(
+        features,
+        original_spec.loss,
+        adv_config.adv_neighbor_config,
+        # The pgd_model_fn is a dummy identity function since loss is
+        # directly available from spec_fn.
+        pgd_model_fn=lambda features: features,
+        pgd_loss_fn=lambda labels, features: spec_fn(features, labels).loss,
+        pgd_labels=labels)

     # Runs the base model again to compute loss on adv_neighbor.
-    adv_spec = spec_fn(adv_neighbor)
+    adv_spec = spec_fn(adv_neighbor, labels)

     final_loss = original_spec.loss + adv_config.multiplier * adv_spec.loss
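
Note: based on the wrapper above and the test below, a hedged sketch of enabling PGD for an Estimator. The `LinearRegressor` setup and optimizer are illustrative, and the `adv_config` keyword is assumed from the NSL Estimator API rather than shown in this diff:

import neural_structured_learning as nsl
import tensorflow as tf

feature_columns = [tf.feature_column.numeric_column('x', shape=(2,))]
base_est = tf.estimator.LinearRegressor(feature_columns=feature_columns)

adv_config = nsl.configs.make_adv_reg_config(
    multiplier=1.0,    # equal weight on original and adversarial loss
    adv_step_size=0.1,
    pgd_iterations=3,  # 3 PGD steps instead of a single FGSM step
    pgd_epsilon=0.25)  # project back onto an L2 ball of radius 0.25

# The wrapper re-runs the base model_fn on the PGD neighbor and optimizes
# original_loss + multiplier * adversarial_loss.
adv_est = nsl.estimator.add_adversarial_regularization(
    base_est,
    optimizer_fn=lambda: tf.train.GradientDescentOptimizer(0.01),
    adv_config=adv_config)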

neural_structured_learning/estimator/adversarial_regularization_test.py

+21 −10

@@ -21,9 +21,9 @@
 import shutil
 import tempfile

+from absl.testing import parameterized
 import neural_structured_learning.configs as nsl_configs
 import neural_structured_learning.estimator as nsl_estimator
-
 import numpy as np
 import tensorflow as tf

@@ -43,7 +43,7 @@ def input_fn():
   return input_fn


-class AdversarialRegularizationTest(tf.test.TestCase):
+class AdversarialRegularizationTest(tf.test.TestCase, parameterized.TestCase):

   def setUp(self):
     super(AdversarialRegularizationTest, self).setUp()
@@ -79,19 +79,25 @@ def test_adversarial_wrapper_not_affecting_predictions(self):
     predicted_scores = [x['predictions'] for x in predictions]
     self.assertAllClose([[3.0], [4.0]], predicted_scores)

+  @parameterized.named_parameters([
+      ('fgsm', 0.1, 1, None),
+      ('pgd', 0.1, 3, 0.25),
+  ])
   @test_util.run_v1_only('Requires tf.GraphKeys')
-  def test_adversarial_wrapper_adds_regularization(self):
+  def test_adversarial_wrapper_adds_regularization(self, adv_step_size,
+                                                   pgd_iterations, pgd_epsilon):
     # base model: y = w*x+b = 4*x1 + 3*x2 + 2
     weight = np.array([[4.0], [3.0]], dtype=np.float32)
     bias = np.array([2.0], dtype=np.float32)
     x0, y0 = np.array([[1.0, 1.0]]), np.array([8.0])
-    adv_step_size = 0.1
     learning_rate = 0.01

     base_est = self.build_linear_regressor(weight=weight, bias=bias)
     adv_config = nsl_configs.make_adv_reg_config(
         multiplier=1.0,  # equal weight on original and adv examples
-        adv_step_size=adv_step_size)
+        adv_step_size=adv_step_size,
+        pgd_iterations=pgd_iterations,
+        pgd_epsilon=pgd_epsilon)
     adv_est = nsl_estimator.add_adversarial_regularization(
         base_est,
         optimizer_fn=lambda: tf.train.GradientDescentOptimizer(learning_rate),
@@ -104,11 +110,16 @@ def test_adversarial_wrapper_adds_regularization(self):
     orig_grad_w = 2 * (orig_pred - y0) * x0.T  # [[2.0], [2.0]]
     orig_grad_b = 2 * (orig_pred - y0).reshape((1,))  # [2.0]
     grad_x = 2 * (orig_pred - y0) * weight.T  # [[8.0, 6.0]]
-    perturbation = adv_step_size * grad_x / np.linalg.norm(grad_x)
-    x_adv = x0 + perturbation  # [[1.08, 1.06]]
-    adv_pred = np.dot(x_adv, weight) + bias  # [9.5]
-    adv_grad_w = 2 * (adv_pred - y0) * x_adv.T  # [[3.24], [3.18]]
-    adv_grad_b = 2 * (adv_pred - y0).reshape((1,))  # [3.0]
+    # Gradient direction is independent of x, so perturbing for multiple
+    # iterations is the same as scaling the perturbation.
+    perturbation_magnitude = pgd_iterations * adv_step_size
+    if pgd_epsilon is not None:
+      perturbation_magnitude = np.minimum(perturbation_magnitude, pgd_epsilon)
+    perturbation = perturbation_magnitude * grad_x / np.linalg.norm(grad_x)
+    x_adv = x0 + perturbation  # fgm: [[1.08, 1.06]]; pgd: [[1.20, 1.15]]
+    adv_pred = np.dot(x_adv, weight) + bias  # fgm: [9.5]; pgd: [10.25]
+    adv_grad_w = 2 * (adv_pred - y0) * x_adv.T  # fgm: [[3.24], [3.18]]
+    adv_grad_b = 2 * (adv_pred - y0).reshape((1,))  # fgm: [3.0]; pgd: [4.5]

     new_bias = bias - learning_rate * (orig_grad_b + adv_grad_b)
     new_weight = weight - learning_rate * (orig_grad_w + adv_grad_w)
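
Note: the fgm/pgd values in the comments above check out numerically; a standalone NumPy verification of the pgd branch (adv_step_size=0.1, pgd_iterations=3, pgd_epsilon=0.25) against the same linear model y = 4*x1 + 3*x2 + 2:

import numpy as np

weight = np.array([[4.0], [3.0]])
bias = np.array([2.0])
x0, y0 = np.array([[1.0, 1.0]]), np.array([8.0])
adv_step_size, pgd_iterations, pgd_epsilon = 0.1, 3, 0.25

orig_pred = np.dot(x0, weight) + bias     # [[9.0]]
grad_x = 2 * (orig_pred - y0) * weight.T  # [[8.0, 6.0]]
# The gradient direction is constant, so 3 L2-normalized steps of 0.1
# would travel 0.3 in total; the epsilon ball caps that at 0.25.
magnitude = min(pgd_iterations * adv_step_size, pgd_epsilon)
x_adv = x0 + magnitude * grad_x / np.linalg.norm(grad_x)
print(x_adv)                              # [[1.2  1.15]]
print(np.dot(x_adv, weight) + bias)       # [[10.25]]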

neural_structured_learning/keras/adversarial_regularization.py

+11 −2

@@ -138,7 +138,10 @@ def adversarial_loss(features,
       features,
       labeled_loss,
       config=adv_config.adv_neighbor_config,
-      gradient_tape=gradient_tape)
+      gradient_tape=gradient_tape,
+      pgd_model_fn=model,
+      pgd_loss_fn=functools.partial(loss_fn, sample_weights=sample_weights),
+      pgd_labels=labels)
   adv_output = model(adv_input)
   if sample_weights is not None:
     adv_sample_weights = tf.math.multiply(sample_weights, adv_sample_weights)
@@ -713,7 +716,13 @@ def perturb_on_batch(self, x, **config_kwargs):
     config_kwargs = {k: v for k, v in config_kwargs.items() if v is not None}
     config = attr.evolve(self.adv_config.adv_neighbor_config, **config_kwargs)
     adv_inputs, _ = nsl_lib.gen_adv_neighbor(
-        inputs, labeled_loss, config=config, gradient_tape=tape)
+        inputs,
+        labeled_loss,
+        config=config,
+        gradient_tape=tape,
+        pgd_model_fn=self._call_base_model,
+        pgd_loss_fn=self._compute_total_loss,
+        pgd_labels=labels)

     if tf.executing_eagerly():
       # Converts `Tensor` objects to NumPy arrays and keeps other objects (e.g.
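
Note: a sketch of the Keras-side flow these changes enable; the Sequential model and the numbers are illustrative, while `AdversarialRegularization`, `perturb_on_batch`, and the new config fields come from this commit:

import numpy as np
import neural_structured_learning as nsl
import tensorflow as tf

base_model = tf.keras.Sequential(
    [tf.keras.layers.Dense(1, use_bias=False, input_shape=(2,))])

adv_config = nsl.configs.make_adv_reg_config(
    multiplier=0.2,
    adv_step_size=0.01,
    adv_grad_norm='infinity',
    pgd_iterations=3,
    pgd_epsilon=0.025)

adv_model = nsl.keras.AdversarialRegularization(
    base_model, label_keys=['label'], adv_config=adv_config)
adv_model.compile(optimizer=tf.keras.optimizers.SGD(0.01), loss='MSE')

inputs = {'feature': np.array([[2.0, 3.0]]), 'label': np.array([[0.0]])}
adv_model.fit(x=inputs, batch_size=1, steps_per_epoch=1)

# perturb_on_batch now runs the same multi-step PGD attack.
adv_inputs = adv_model.perturb_on_batch(inputs)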

neural_structured_learning/keras/adversarial_regularization_test.py

+63 −0

@@ -542,6 +542,45 @@ def call(self, inputs):

     self.assertIn('label', model.seen_input_keys)

+  @parameterized.named_parameters([
+      ('sequential', build_linear_keras_sequential_model),
+      ('sequential_no_input_layer',
+       build_linear_keras_sequential_model_no_input_layer),
+      ('functional', build_linear_keras_functional_model),
+      ('subclassed', build_linear_keras_subclassed_model),
+  ])
+  def test_train_pgd(self, model_fn):
+    w = np.array([[4.0], [-3.0]])
+    x0 = np.array([[2.0, 3.0]])
+    y0 = np.array([[0.0]])
+    adv_multiplier = 0.2
+    adv_step_size = 0.01
+    learning_rate = 0.01
+    pgd_iterations = 3
+    pgd_epsilon = 2.5 * adv_step_size
+    adv_config = configs.make_adv_reg_config(
+        multiplier=adv_multiplier,
+        adv_step_size=adv_step_size,
+        adv_grad_norm='infinity',
+        pgd_iterations=pgd_iterations,
+        pgd_epsilon=pgd_epsilon)
+    y_hat = np.dot(x0, w)
+    # The adversarial perturbation is constant across PGD iterations.
+    x_adv = x0 + pgd_epsilon * np.sign((y_hat - y0) * w.T)
+    y_hat_adv = np.dot(x_adv, w)
+    grad_w_labeled_loss = 2. * (y_hat - y0) * x0.T
+    grad_w_adv_loss = adv_multiplier * 2. * (y_hat_adv - y0) * x_adv.T
+    w_new = w - learning_rate * (grad_w_labeled_loss + grad_w_adv_loss)
+
+    inputs = {'feature': tf.constant(x0), 'label': tf.constant(y0)}
+    model = model_fn(input_shape=(2,), weights=w)
+    adv_model = adversarial_regularization.AdversarialRegularization(
+        model, label_keys=['label'], adv_config=adv_config)
+    adv_model.compile(tf.keras.optimizers.SGD(learning_rate), loss='MSE')
+    adv_model.fit(x=inputs, batch_size=1, steps_per_epoch=1)
+
+    self.assertAllClose(w_new, tf.keras.backend.get_value(model.weights[0]))
+
   def test_evaluate_binary_classification_metrics(self):
     # multi-label binary classification model
     w = np.array([[4.0, 1.0, -5.0], [-3.0, 1.0, 2.0]])
@@ -633,6 +672,30 @@ def test_perturb_on_batch_custom_config(self):
     self.assertAllClose(x_adv, adv_inputs['feature'])
     self.assertAllClose(y0, adv_inputs['label'])

+  @parameterized.named_parameters([
+      ('sequential', build_linear_keras_sequential_model),
+      ('sequential_no_input_layer',
+       build_linear_keras_sequential_model_no_input_layer),
+      ('functional', build_linear_keras_functional_model),
+      ('subclassed', build_linear_keras_subclassed_model),
+  ])
+  def test_perturb_on_batch_pgd(self, model_fn):
+    w, x0, y0, lr, adv_config, _ = self._set_up_linear_regression()
+    pgd_epsilon = 4.5 * adv_config.adv_neighbor_config.adv_step_size
+    adv_config.adv_neighbor_config.pgd_iterations = 5
+    adv_config.adv_neighbor_config.pgd_epsilon = pgd_epsilon
+    inputs = {'feature': x0, 'label': y0}
+    model = model_fn(input_shape=(2,), weights=w)
+    adv_model = adversarial_regularization.AdversarialRegularization(
+        model, label_keys=['label'], adv_config=adv_config)
+    adv_model.compile(optimizer=tf.keras.optimizers.SGD(lr), loss=['MSE'])
+    adv_inputs = adv_model.perturb_on_batch(inputs)
+
+    y_hat = np.dot(x0, w)
+    x_adv = x0 + pgd_epsilon * np.sign((y_hat - y0) * w.T)
+    self.assertAllClose(x_adv, adv_inputs['feature'])
+    self.assertAllClose(y0, adv_inputs['label'])
+

 if __name__ == '__main__':
   tf.test.main()
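
Note: the closed-form x_adv in test_train_pgd and test_perturb_on_batch_pgd relies on the gradient sign being constant for a linear model: each infinity-norm step moves adv_step_size per coordinate, so 3 (or 5) steps overshoot and the projection caps the total perturbation at pgd_epsilon. A quick NumPy check with the test_train_pgd constants:

import numpy as np

w = np.array([[4.0], [-3.0]])
x0, y0 = np.array([[2.0, 3.0]]), np.array([[0.0]])
adv_step_size, pgd_iterations = 0.01, 3
pgd_epsilon = 2.5 * adv_step_size  # 0.025

# Sign of dMSE/dx for the linear model y_hat = x.w (no bias).
grad_sign = np.sign((np.dot(x0, w) - y0) * w.T)  # [[-1., 1.]]
# Three infinity-norm steps of 0.01 would travel 0.03 per coordinate;
# projection onto the L-infinity ball caps the total at pgd_epsilon.
magnitude = min(pgd_iterations * adv_step_size, pgd_epsilon)  # 0.025
print(x0 + magnitude * grad_sign)                # [[1.975 3.025]]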

neural_structured_learning/lib/adversarial_neighbor.py

+4 −4

@@ -202,7 +202,7 @@ def gen_neighbor(self, input_features, pgd_labels=None):
       logging.log_first_n(logging.WARNING,
                           'Cannot perturb non-Tensor input: %s', 1, sparse_keys)
     dense_features = dense_original_features
-    for t in range(self._adv_config.iterations):
+    for t in range(self._adv_config.pgd_iterations):
       keyed_grads = self._compute_gradient(loss, dense_features, gradient_tape)
       masked_grads = {
           key: utils.apply_feature_mask(grad, feature_masks.get(key, None))
@@ -221,8 +221,8 @@ def gen_neighbor(self, input_features, pgd_labels=None):
         # Only include features for which perturbation occurred. There is
         # nothing to project for features without perturbations.
         diff[key] = dense_features[key] + perturb - dense_original_features[key]
-      if self._adv_config.epsilon is not None:
-        bounded_diff = utils.project_to_ball(diff, self._adv_config.epsilon,
+      if self._adv_config.pgd_epsilon is not None:
+        bounded_diff = utils.project_to_ball(diff, self._adv_config.pgd_epsilon,
                                              self._adv_config.adv_grad_norm)
       else:
         bounded_diff = diff
@@ -239,7 +239,7 @@ def gen_neighbor(self, input_features, pgd_labels=None):
             feature_min.get(key, None), feature_max.get(key, None)))

       # Update for the next iteration.
-      if t < self._adv_config.iterations - 1:
+      if t < self._adv_config.pgd_iterations - 1:
         inputs_t = self._decompose_as(input_features, adv_neighbor)
         # Compute the new loss to calculate gradients with.
         features = self._compose_as_dict(inputs_t)
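
Note: a self-contained sketch of the projected-gradient loop above for a single dense feature with L2-normalized steps; `project_to_ball` here is a simplified stand-in for `utils.project_to_ball`, and the linear-model example is illustrative:

import numpy as np

def project_to_ball(diff, epsilon):
  """Simplified L2 stand-in for utils.project_to_ball."""
  norm = np.linalg.norm(diff)
  return diff if norm <= epsilon else diff * (epsilon / norm)

def pgd_neighbor(x, grad_fn, step_size, pgd_iterations, pgd_epsilon):
  """Multi-step FGM with projection, mirroring the gen_neighbor loop."""
  x_adv = x.copy()
  for _ in range(pgd_iterations):
    grad = grad_fn(x_adv)
    perturb = step_size * grad / (np.linalg.norm(grad) + 1e-12)
    diff = x_adv + perturb - x  # total displacement from the original input
    if pgd_epsilon is not None:
      diff = project_to_ball(diff, pgd_epsilon)
    x_adv = x + diff            # re-anchor at the original input
  return x_adv

# Illustrative linear model: loss = (w.x + b - y)^2.
w, b, y = np.array([4.0, 3.0]), 2.0, 8.0
grad_fn = lambda x: 2 * (np.dot(w, x) + b - y) * w
print(pgd_neighbor(np.array([1.0, 1.0]), grad_fn, 0.1, 3, 0.25))
# -> [1.2  1.15], matching the estimator test in this commit.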
