add synthetic expt, confidence estimators, training code

PiperOrigin-RevId: 596072058
Change-Id: If8e1f3743507a9c473609124418d2f1175df0951
1 parent 2d68af9 · commit 9969a86
Showing 22 changed files with 2,271 additions and 6 deletions.
@@ -0,0 +1,133 @@
# Copyright 2023, Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation utilities required for auditing."""

import numpy as np
import tensorflow as tf

from lidp_auditing import constants
from lidp_auditing import utils

def get_evaluate_fn():
  """Return a `tf.function` to evaluate a keras model."""

  @tf.function
  def evaluate_model(dataset, model, metric):
    # Note: pass in a batched dataset of (x, y, id) triples.
    metric.reset_state()
    for x, y, _ in dataset:
      predictions = model(x, training=False)
      metric.update_state(y, predictions)
    return metric.result()

  return evaluate_model

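A minimal usage sketch of the evaluator. The toy model, metric, and dataset below are illustrative, not part of this commit; the only contract the code above imposes is that the dataset yields batched (x, y, id) triples.

def _example_evaluate_usage():
  """Illustrative only: how the returned evaluator might be called."""
  model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
  metric = tf.keras.metrics.SparseCategoricalAccuracy()
  x = tf.random.normal((32, 4))
  y = tf.random.uniform((32,), maxval=10, dtype=tf.int64)
  ids = tf.range(32)  # the evaluator ignores this third component
  dataset = tf.data.Dataset.from_tensor_slices((x, y, ids)).batch(8)
  evaluate_model = get_evaluate_fn()
  return evaluate_model(dataset, model, metric)
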
def evaluate_canary_dataset(
    canary_type: str,
    canary_dataset: tf.data.Dataset,
    model: tf.keras.Model,
    vector_loss_fn: tf.keras.losses.Loss,
    batch_size: int,
) -> np.ndarray:
  """Run the test to see if the canary can be found."""
  if canary_dataset is None:
    return np.array([])
  if canary_type == constants.RANDOM_GRADIENT_CANARY:
    # The unbatched `evaluate_random_gradient_canary` computes the same
    # statistic; the batched variant is preferred because it is much faster.
    return evaluate_random_gradient_canary_batched(canary_dataset, model)

  # Static or adaptive data canary.
  return evaluate_data_canary(canary_dataset, model, vector_loss_fn, batch_size)

def evaluate_data_canary(canary_dataset, model, vector_loss_fn, batch_size):
  """Compute the per-example loss on the canaries."""
  if canary_dataset is None:
    return np.array([])
  all_losses = []
  for x, y, _ in canary_dataset.batch(batch_size, drop_remainder=False):
    predictions = model(x, training=False)
    loss_vector = vector_loss_fn(y, predictions)  # one loss per example
    all_losses.append(loss_vector.numpy())
  return np.concatenate(all_losses)

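Note that `vector_loss_fn` must return one loss per example for the `np.concatenate` above to make sense. A sketch of a compatible loss, assuming sparse integer labels and logit outputs (the specific loss is illustrative, not mandated by this commit):

# reduction=NONE keeps the per-example loss vector; the default
# SUM_OVER_BATCH_SIZE reduction would collapse it to a scalar.
vector_loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE
)
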
def evaluate_random_gradient_canary(canary_dataset, model):
  """Compute the cosines of the model parameters with the canaries."""
  if canary_dataset is None:
    return np.array([])
  all_cosines = []
  weights = tf.nest.flatten(model.trainable_variables)  # already a flat list
  weight_norm = tf.sqrt(
      tf.add_n(tf.nest.map_structure(lambda x: tf.linalg.norm(x) ** 2, weights))
  )
  for _, _, z in canary_dataset:  # all examples are canaries
    # Note: we use canaries of norm 1 because the final statistic is
    # normalized by the canary norm anyway, so the clip norm does not matter.
    noise = utils.get_random_normal_like(weights, z, flat_l2_norm=1)
    dot_product = tf.add_n(
        tf.nest.map_structure(lambda a, b: tf.reduce_sum(a * b), noise, weights)
    )
    cosine = dot_product / weight_norm
    all_cosines.append(cosine.numpy())
  return np.array(all_cosines)

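Since the noise has unit flat L2 norm, `dot_product / weight_norm` is exactly the cosine of the angle between the noise and the flattened weights. The loop relies on `utils.get_random_normal_like` to regenerate, from the per-canary seed `z`, Gaussian noise shaped like the weights and rescaled to the given flat norm. The helper below is only an illustrative stand-in for that contract; the real implementation lives in `lidp_auditing.utils`:

def _reference_random_normal_like(weights, seed, flat_l2_norm=1.0):
  """Illustrative stand-in: seeded Gaussian noise with a fixed flat L2 norm."""
  gen = tf.random.Generator.from_seed(seed)
  pieces = [gen.normal(tf.shape(w)) for w in weights]
  norm = tf.sqrt(tf.add_n([tf.reduce_sum(p**2) for p in pieces]))
  return [p * (flat_l2_norm / norm) for p in pieces]
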
def evaluate_random_gradient_canary_batched(
    canary_dataset, model, max_batch_size=1024
):
  """Batched computation of the cosines of the parameters with the canaries."""
  # Batching gives a 20x speedup on the evaluation.
  if canary_dataset is None:
    return np.array([])
  all_cosines = []
  weights = tf.nest.flatten(model.trainable_variables)  # already a flat list
  weight_norm = tf.sqrt(
      tf.add_n(tf.nest.map_structure(lambda x: tf.linalg.norm(x) ** 2, weights))
  )
  # All examples are canaries, so no special filtering is necessary.
  for _, _, z in canary_dataset.batch(max_batch_size):
    # Note: we use canaries of norm 1 because the final statistic is
    # normalized by the canary norm anyway, so the clip norm does not matter.
    noise = utils.get_batched_random_normal_like(
        weights, z, flat_l2_norm=tf.constant(1.0)
    )  # list of tensors shaped (batch_size,) + weights[i].shape
    dot_product = tf.add_n(
        tf.nest.map_structure(batched_dot, noise, weights)
    )  # (batch_size,)
    cosine = dot_product / weight_norm
    all_cosines.append(cosine.numpy())
  return np.concatenate(all_cosines)

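The batched path is meant to be a drop-in replacement for the per-example loop, so a consistency check along these lines can be useful. This sketch is illustrative and assumes the batched and unbatched noise helpers derive identical noise from the same seeds:

def _check_batched_matches_loop(canary_dataset, model):
  """Illustrative: the two implementations should agree up to float error."""
  loop_cosines = evaluate_random_gradient_canary(canary_dataset, model)
  batched_cosines = evaluate_random_gradient_canary_batched(canary_dataset, model)
  np.testing.assert_allclose(loop_cosines, batched_cosines, rtol=1e-5)
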
def batched_dot(a, b):
  """Return [dot(c, b) for c in a], vectorized in TF."""
  # a: (bsz, s1, s2, ...), b: (s1, s2, ...)
  return tf.tensordot(
      tf.reshape(a, (tf.shape(a)[0], -1)),  # (bsz, s1, s2, ...) -> (bsz, s)
      tf.reshape(b, [-1]),  # (s1, s2, ...) -> (s,)
      axes=1,
  )  # -> (bsz,)
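
A quick illustrative check (not part of this commit) that `batched_dot` matches an explicit Python loop over the batch:

def _check_batched_dot():
  """Illustrative: batched_dot equals a per-row flat dot product."""
  a = tf.random.normal((5, 3, 2))  # batch of 5 tensors shaped like b
  b = tf.random.normal((3, 2))
  expected = tf.stack([tf.reduce_sum(a[i] * b) for i in range(5)])
  tf.debugging.assert_near(batched_dot(a, b), expected)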