Skip to content

Commit

Permalink
Add code sample for string replacement based deidentification. (#3956)
Browse files Browse the repository at this point in the history
Adds a code sample corresponding to replacement-based deidentification in the Cloud DLP API. The detected sensitive value is replaced with a specified surrogate.
  • Loading branch information
ackul committed Jun 9, 2020
1 parent 66825c2 commit 2b508e6
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 9 deletions.
112 changes: 103 additions & 9 deletions dlp/deid.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@

# [START dlp_deidentify_masking]
def deidentify_with_mask(
project, string, info_types, masking_character=None, number_to_mask=0
project, input_str, info_types, masking_character=None, number_to_mask=0
):
"""Uses the Data Loss Prevention API to deidentify sensitive data in a
string by masking it with a character.
Args:
project: The Google Cloud project id to use as a parent resource.
item: The string to deidentify (will be treated as text).
input_str: The string to deidentify (will be treated as text).
masking_character: The character to mask matching sensitive data with.
number_to_mask: The maximum number of sensitive characters to mask in
a match. If omitted or set to zero, the API will default to no
Expand Down Expand Up @@ -67,7 +67,7 @@ def deidentify_with_mask(
}

# Construct item
item = {"value": string}
item = {"value": input_str}

# Call the API
response = dlp.deidentify_content(
Expand All @@ -83,11 +83,76 @@ def deidentify_with_mask(

# [END dlp_deidentify_masking]

# [START dlp_deidentify_replace]
def deidentify_with_replace(
    project,
    input_str,
    info_types,
    replacement_str="REPLACEMENT_STR",
):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by replacing matched input values with a value you specify.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_str: The string to deidentify (will be treated as text).
        info_types: A list of strings representing info types to look for.
        replacement_str: The string to replace all values that match given
            info types.

    Returns:
        None; the response from the API is printed to the terminal.
    """
    # Import the versioned module that is actually referenced below, instead
    # of relying on `import google.cloud.dlp` loading it as a side effect.
    import google.cloud.dlp_v2

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Construct inspect configuration dictionary
    inspect_config = {
        "info_types": [{"name": info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary. The single transformation
    # swaps each finding for the caller-supplied replacement string.
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {
                    "primitive_transformation": {
                        "replace_config": {
                            "new_value": {
                                "string_value": replacement_str,
                            }
                        }
                    }
                }
            ]
        }
    }

    # Construct item
    item = {"value": input_str}

    # Call the API
    response = dlp.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item=item,
    )

    # Print out the results.
    print(response.item.value)

# [END dlp_deidentify_replace]

# [START dlp_deidentify_fpe]


def deidentify_with_fpe(
project,
string,
input_str,
info_types,
alphabet=None,
surrogate_type=None,
Expand All @@ -98,7 +163,7 @@ def deidentify_with_fpe(
string using Format Preserving Encryption (FPE).
Args:
project: The Google Cloud project id to use as a parent resource.
item: The string to deidentify (will be treated as text).
input_str: The string to deidentify (will be treated as text).
alphabet: The set of characters to replace sensitive ones with. For
more information, see https://cloud.google.com/dlp/docs/reference/
rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
Expand Down Expand Up @@ -166,7 +231,7 @@ def deidentify_with_fpe(
}

# Convert string to item
item = {"value": string}
item = {"value": input_str}

# Call the API
response = dlp.deidentify_content(
Expand All @@ -186,7 +251,7 @@ def deidentify_with_fpe(
# [START dlp_reidentify_fpe]
def reidentify_with_fpe(
project,
string,
input_str,
alphabet=None,
surrogate_type=None,
key_name=None,
Expand All @@ -196,7 +261,7 @@ def reidentify_with_fpe(
string that was encrypted by Format Preserving Encryption (FPE).
Args:
project: The Google Cloud project id to use as a parent resource.
item: The string to deidentify (will be treated as text).
input_str: The string to deidentify (will be treated as text).
alphabet: The set of characters to replace sensitive ones with. For
more information, see https://cloud.google.com/dlp/docs/reference/
rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
Expand Down Expand Up @@ -255,7 +320,7 @@ def reidentify_with_fpe(
}

# Convert string to item
item = {"value": string}
item = {"value": input_str}

# Call the API
response = dlp.reidentify_content(
Expand Down Expand Up @@ -531,6 +596,28 @@ def redact_sensitive_data(project, item, info_types):
help="The character to mask matching sensitive data with.",
)

# Register the "deid_replace" subcommand, which deidentifies a string by
# replacing matched sensitive values with another string.
replace_parser = subparsers.add_parser(
"deid_replace",
help="Deidentify sensitive data in a string by replacing it with "
"another string.",
)
# Optional flag: one or more info type names. When the flag is omitted the
# three defaults listed below are used.
replace_parser.add_argument(
"--info_types",
nargs="+",
help="Strings representing info types to look for. A full list of "
"info categories and types is available from the API. Examples "
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
"If unspecified, the three above examples will be used.",
default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
)
# Required positionals, in command-line order: project, item, replacement_str.
replace_parser.add_argument(
"project",
help="The Google Cloud project id to use as a parent resource.",
)
replace_parser.add_argument("item", help="The string to deidentify.")
replace_parser.add_argument("replacement_str", help="The string to "
"replace all matched values with.")

fpe_parser = subparsers.add_parser(
"deid_fpe",
help="Deidentify sensitive data in a string using Format Preserving "
Expand Down Expand Up @@ -715,6 +802,13 @@ def redact_sensitive_data(project, item, info_types):
masking_character=args.masking_character,
number_to_mask=args.number_to_mask,
)
elif args.content == "deid_replace":
deidentify_with_replace(
args.project,
args.item,
args.info_types,
replacement_str=args.replacement_str,
)
elif args.content == "deid_fpe":
deidentify_with_fpe(
args.project,
Expand Down
10 changes: 10 additions & 0 deletions dlp/deid_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@ def test_deidentify_with_mask_masking_number_specified(capsys):
assert "My SSN is *******27" in out


def test_deidentify_with_replace(capsys):
    """Replacement-based deidentification prints the surrogate string.

    Runs the sample against HARMFUL_STRING looking only for US social
    security numbers and checks the captured stdout.
    """
    deid.deidentify_with_replace(
        GCLOUD_PROJECT,
        HARMFUL_STRING,
        ["US_SOCIAL_SECURITY_NUMBER"],
        replacement_str="REPLACEMENT_STR",
    )

    captured = capsys.readouterr()
    assert "My SSN is REPLACEMENT_STR" in captured.out


def test_deidentify_with_fpe(capsys):
deid.deidentify_with_fpe(
GCLOUD_PROJECT,
Expand Down

0 comments on commit 2b508e6

Please sign in to comment.