Skip to content

Commit

Permalink
add synthetic expt, confidence estimators, training code
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 596072058
Change-Id: If8e1f3743507a9c473609124418d2f1175df0951
  • Loading branch information
Krishna Pillutla authored and copybara-github committed Jan 5, 2024
1 parent 2d68af9 commit 9969a86
Show file tree
Hide file tree
Showing 22 changed files with 2,271 additions and 6 deletions.
44 changes: 43 additions & 1 deletion lidp_auditing/BUILD
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Build all for the code.

load("@rules_python//python:defs.bzl", "py_library", "py_test")
load("@rules_python//python:defs.bzl", "py_binary", "py_library", "py_test")

licenses(["notice"])

Expand Down Expand Up @@ -34,3 +34,45 @@ py_test(
":data_lib",
],
)

# Library target exposing models.py; depends on :constants_lib.
py_library(
name = "models_lib",
srcs = ["models.py"],
deps = [":constants_lib"],
)

# Library target exposing utils.py; declares no intra-package deps.
py_library(
name = "utils_lib",
srcs = ["utils.py"],
)

# Library target exposing auditing_eval.py (evaluation utilities for auditing);
# depends on :constants_lib and :utils_lib.
py_library(
name = "auditing_eval_lib",
srcs = ["auditing_eval.py"],
deps = [
":constants_lib",
":utils_lib",
],
)

# Library target exposing auditing_trainer.py; depends on the evaluation
# library plus :constants_lib and :utils_lib.
py_library(
name = "auditing_trainer_lib",
srcs = ["auditing_trainer.py"],
deps = [
":auditing_eval_lib",
":constants_lib",
":utils_lib",
],
)

# Binary target for main_central.py (run as `bazel run :main_central`, per the
# README); links the trainer, constants, data, models and utils libraries.
py_binary(
name = "main_central",
srcs = ["main_central.py"],
deps = [
":auditing_trainer_lib",
":constants_lib",
":data_lib",
":models_lib",
":utils_lib",
],
)
65 changes: 60 additions & 5 deletions lidp_auditing/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ times and giving high-confidence estimates on the success of the attack (i.e.,
we try to detect the presence of a crafted datapoint, called a "canary" in the
training data).

[This paper](\(https://arxiv.org/abs/2305.18447\)) introduces a variant of DP
called "Lifted DP" (or "LiDP" in short) that is equivalent to the usual notions
of DP. It also gives a recipe to audit LiDP with multiple randomized hypothesis
tests and adaptive confidence intervals to improve the sample complexity of
auditing DP by 4 to 16 times.
[This paper](https://arxiv.org/abs/2305.18447) introduces a variant of DP called
"Lifted DP" (or "LiDP" in short) that is equivalent to the usual notions of DP.
It also gives a recipe to audit LiDP with multiple randomized hypothesis tests
and adaptive confidence intervals to improve the sample complexity of auditing
DP by 4 to 16 times.

## Cite

Expand All @@ -28,3 +28,58 @@ booktitle = {NeurIPS},
year = {2023},
}
```

## Generating the experimental results

For the synthetic experiments with the Gaussian mechanism, see the
`synthetic/README.md`.

For the experiments with real data, follow the steps below:

1. Train 2000 models (1000 models for parameter tuning and the other 1000 for
reporting the epsilon lower bounds). See `main_central.py` for details on
the command line arguments.

```bash
### General arguments
output_dir="./outputs" # NOTE: set the output directory
dataset="fashion_mnist" # Name of the dataset
model="mlp" # Model type: can be "linear" or "mlp"
seed=0 # Random seed, vary from 0 to 1999 (total 2000 seeds)
### Canary arguments
canary_type="random_gradient" # Can be "random_gradient" or "static_data"
num_canaries=64 # Vary from 1 to 512
min_dimension=0 # Minimum random seed for the random canary gradient
max_dimension=1000000 # Maximum random seed for the random canary gradient
### For canary_type="static_data", uncomment the following two lines:
# min_dimension=300 # Minimum PCA direction for the canary
# max_dimension=784 # Maximum PCA direction for the canary (= data dimension)
### Learning arguments
learning_rate=0.01 # Use 0.02 for the linear model
dp_epsilon=2 # Vary from 1 to 32 in powers of 2
dp_delta="1e-5"

arguments="--experiment_name="run_${dataset}_${model}" --output_dir=${output_dir} \
--dataset_name=${dataset} --model_type=${model} \
--canary_type=${canary_type} \
--min_dimension=${min_dimension} --max_dimension=${max_dimension} \
--batch_size=100 --num_epochs=30 --learning_rate=${learning_rate} \
--dp_epsilon=${dp_epsilon} --dp_delta=${dp_delta} \
--seed=${seed}"

# Alternate hypothesis: run with k canaries
bazel run :main_central -- ${arguments} \
--num_canaries=${num_canaries}

# Null hypothesis: run with k-1 canaries (for training but test on k canaries)
bazel run :main_central -- ${arguments} \
--num_canaries=$((num_canaries-1)) \
--test_canary_add_one=True
```

This code also saves various files in ${output_dir}/run_${dataset}_${model}
tracking the test statistic of the training and test canaries.

1. Obtain the confidence intervals from the saved logs using the confidence
estimators from `lidp_auditing/confidence_estimators`. These instructions
will be completed later.
133 changes: 133 additions & 0 deletions lidp_auditing/auditing_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Copyright 2023, Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation utilities required for auditing."""

# from absl import logging
import numpy as np
import tensorflow as tf

from lidp_auditing import constants
from lidp_auditing import utils


def get_evaluate_fn():
  """Build a `tf.function` that scores a keras model with a metric.

  Returns:
    A `tf.function` with signature `evaluate_model(dataset, model, metric)`.
    It resets `metric`, updates it with the model's `training=False`
    predictions for every `(x, y, _)` element of `dataset`, and returns
    `metric.result()`.
  """

  @tf.function
  def evaluate_model(dataset, model, metric):
    # The caller is expected to pass in an already-batched dataset.
    metric.reset_state()
    for features, labels, _ in dataset:
      metric.update_state(labels, model(features, training=False))
    return metric.result()

  return evaluate_model


def evaluate_canary_dataset(
    canary_type: str,
    canary_dataset: tf.data.Dataset,
    model: tf.keras.Model,
    vector_loss_fn: tf.keras.losses.Loss,
    batch_size: int,
) -> np.ndarray:
  """Compute the canary test statistics with the evaluator for `canary_type`.

  Args:
    canary_type: Kind of canary. `constants.RANDOM_GRADIENT_CANARY` selects the
      batched random-gradient evaluator; every other value is treated as a
      (static or adaptive) data canary.
    canary_dataset: Dataset of canaries, or `None` when there are none.
    model: Model under audit.
    vector_loss_fn: Loss forwarded to the data-canary evaluator; unused for
      random-gradient canaries.
    batch_size: Batch size forwarded to the data-canary evaluator; unused for
      random-gradient canaries.

  Returns:
    A numpy array of test statistics; empty when `canary_dataset` is `None`.
  """
  if canary_dataset is None:
    return np.array([])

  is_random_gradient = canary_type == constants.RANDOM_GRADIENT_CANARY
  if not is_random_gradient:
    # Static or adaptive data canary.
    return evaluate_data_canary(
        canary_dataset, model, vector_loss_fn, batch_size
    )
  return evaluate_random_gradient_canary_batched(canary_dataset, model)


def evaluate_data_canary(canary_dataset, model, vector_loss_fn, batch_size):
  """Compute the loss of `model` on the canaries.

  Args:
    canary_dataset: Unbatched dataset yielding `(x, y, _)` triples, or `None`.
      It is batched here with `batch_size`, keeping the last partial batch.
    model: Callable invoked as `model(x, training=False)`.
    vector_loss_fn: Callable invoked as `vector_loss_fn(y, predictions)` whose
      result exposes `.numpy()`. Presumably it returns one unreduced loss per
      example -- confirm against the caller.
    batch_size: Number of canaries per evaluation batch.

  Returns:
    A numpy array with the losses of all batches concatenated in order, or an
    empty array when `canary_dataset` is `None` or yields no batches.
  """
  if canary_dataset is None:
    return np.array([])
  all_losses = [
      vector_loss_fn(y, model(x, training=False)).numpy()
      for x, y, _ in canary_dataset.batch(batch_size, drop_remainder=False)
  ]
  if not all_losses:
    # `np.concatenate` raises ValueError on an empty list; mirror the `None`
    # case so an empty canary set is not an error.
    return np.array([])
  return np.concatenate(all_losses)


def evaluate_random_gradient_canary(canary_dataset, model):
  """Compute the cosine of the model parameters with each canary, one by one.

  Args:
    canary_dataset: Unbatched dataset yielding `(_, _, z)` triples, or `None`.
      Every element is treated as a canary; `z` is forwarded to
      `utils.get_random_normal_like` (presumably the seed of the canary
      direction -- confirm against `utils`).
    model: Model whose `trainable_variables` are compared with the canaries.

  Returns:
    A numpy array with one value per canary: the dot product of the canary
    direction with the parameters, divided by the parameters' global L2 norm.
    Empty when `canary_dataset` is `None`.
  """
  if canary_dataset is None:
    return np.array([])

  params = tf.nest.flatten(model.trainable_variables)
  squared_norms = [tf.linalg.norm(p) ** 2 for p in params]
  param_norm = tf.sqrt(tf.add_n(squared_norms))

  def _cosine(z):
    # The canary direction is requested with norm 1: the final statistic is
    # normalized by the canary norm anyway, so the clip norm does not matter.
    direction = utils.get_random_normal_like(params, z, flat_l2_norm=1)
    inner = tf.add_n(
        tf.nest.map_structure(
            lambda u, v: tf.reduce_sum(u * v), direction, params
        )
    )
    return (inner / param_norm).numpy()

  # All examples in the dataset are canaries.
  return np.array([_cosine(z) for _, _, z in canary_dataset])


def evaluate_random_gradient_canary_batched(
    canary_dataset, model, max_batch_size=1024
):
  """Batched computation of the cosines of the parameters with the canaries.

  Batching gives a 20x speedup on the evaluation compared with processing the
  canaries one at a time.

  Args:
    canary_dataset: Unbatched dataset yielding `(_, _, z)` triples, or `None`.
      Every element is treated as a canary; the batched `z` is forwarded to
      `utils.get_batched_random_normal_like` (presumably the seeds of the
      canary directions -- confirm against `utils`).
    model: Model whose `trainable_variables` are compared with the canaries.
    max_batch_size: Maximum number of canaries evaluated per batch.

  Returns:
    A numpy array with one value per canary: the dot product of the canary
    direction with the parameters, divided by the parameters' global L2 norm.
    Empty when `canary_dataset` is `None` or yields no batches.
  """
  if canary_dataset is None:
    return np.array([])

  weights = tf.nest.flatten(model.trainable_variables)
  weight_norm = tf.sqrt(tf.add_n([tf.linalg.norm(w) ** 2 for w in weights]))
  # Canaries of norm 1 are used because the final statistic is normalized by
  # the canary norm anyway, so the clip norm does not matter.
  unit_norm = tf.constant(1.0)

  all_cosines = []
  # All examples are canaries, so no special filtering is necessary.
  for _, _, z in canary_dataset.batch(max_batch_size):
    noise = utils.get_batched_random_normal_like(
        weights, z, flat_l2_norm=unit_norm
    )  # list of (batch_size, *weights[i])
    dot_product = tf.add_n(
        tf.nest.map_structure(batched_dot, noise, weights)
    )  # (batch_size,)
    all_cosines.append((dot_product / weight_norm).numpy())

  if not all_cosines:
    # `np.concatenate` raises ValueError on an empty list; mirror the `None`
    # case so an empty canary set is not an error.
    return np.array([])
  return np.concatenate(all_cosines)


def batched_dot(a, b):
  """Return `[dot(c, b) for c in a]`, computed in TF.

  Args:
    a: Tensor of shape `(bsz, s1, s2, ...)`.
    b: Tensor of shape `(s1, s2, ...)`.

  Returns:
    Tensor of shape `(bsz,)` holding the dot product of each flattened slice
    `a[i]` with the flattened `b`.
  """
  leading_dim = tf.shape(a)[0]
  a_rows = tf.reshape(a, (leading_dim, -1))  # (bsz, s1, s2, ...) -> (bsz, s)
  b_flat = tf.reshape(b, -1)  # (s1, s2, ...) -> (s,)
  return tf.tensordot(a_rows, b_flat, axes=1)  # (bsz,)
Loading

0 comments on commit 9969a86

Please sign in to comment.