From 2b508e61ff8fb045d48f3c4a320e688bb0b523ad Mon Sep 17 00:00:00 2001 From: Ace Date: Mon, 8 Jun 2020 20:40:09 -0700 Subject: [PATCH] Add code sample for string replacement based deidentification. (#3956) Adds a code sample corresponding to the replacement based deidentification in the Cloud DLP API. The detected sensitive value is replaced with a specified surrogate. --- dlp/deid.py | 112 +++++++++++++++++++++++++++++++++++++++++++---- dlp/deid_test.py | 10 +++++ 2 files changed, 113 insertions(+), 9 deletions(-) diff --git a/dlp/deid.py b/dlp/deid.py index 81847690866c..24ec6913eb36 100644 --- a/dlp/deid.py +++ b/dlp/deid.py @@ -21,13 +21,13 @@ # [START dlp_deidentify_masking] def deidentify_with_mask( - project, string, info_types, masking_character=None, number_to_mask=0 + project, input_str, info_types, masking_character=None, number_to_mask=0 ): """Uses the Data Loss Prevention API to deidentify sensitive data in a string by masking it with a character. Args: project: The Google Cloud project id to use as a parent resource. - item: The string to deidentify (will be treated as text). + input_str: The string to deidentify (will be treated as text). masking_character: The character to mask matching sensitive data with. number_to_mask: The maximum number of sensitive characters to mask in a match. If omitted or set to zero, the API will default to no @@ -67,7 +67,7 @@ def deidentify_with_mask( } # Construct item - item = {"value": string} + item = {"value": input_str} # Call the API response = dlp.deidentify_content( @@ -83,11 +83,76 @@ def deidentify_with_mask( # [END dlp_deidentify_masking] +# [START dlp_deidentify_replace] +def deidentify_with_replace( + project, + input_str, + info_types, + replacement_str="REPLACEMENT_STR", +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by replacing matched input values with a value you specify. + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + info_types: A list of strings representing info types to look for. + replacement_str: The string to replace all values that match given + info types. + Returns: + None; the response from the API is printed to the terminal. + """ + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Construct inspect configuration dictionary + inspect_config = { + "info_types": [{"name": info_type} for info_type in info_types] + } + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "replace_config": { + "new_value": { + "string_value": replacement_str, + } + } + } + } + ] + } + } + + # Construct item + item = {"value": input_str} + + # Call the API + response = dlp.deidentify_content( + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item=item, + ) + + # Print out the results. + print(response.item.value) + +# [END dlp_deidentify_replace] # [START dlp_deidentify_fpe] + + def deidentify_with_fpe( project, - string, + input_str, info_types, alphabet=None, surrogate_type=None, @@ -98,7 +163,7 @@ def deidentify_with_fpe( string using Format Preserving Encryption (FPE). Args: project: The Google Cloud project id to use as a parent resource. - item: The string to deidentify (will be treated as text). + input_str: The string to deidentify (will be treated as text). alphabet: The set of characters to replace sensitive ones with. For more information, see https://cloud.google.com/dlp/docs/reference/ rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet @@ -166,7 +231,7 @@ def deidentify_with_fpe( } # Convert string to item - item = {"value": string} + item = {"value": input_str} # Call the API response = dlp.deidentify_content( @@ -186,7 +251,7 @@ def deidentify_with_fpe( # [START dlp_reidentify_fpe] def reidentify_with_fpe( project, - string, + input_str, alphabet=None, surrogate_type=None, key_name=None, @@ -196,7 +261,7 @@ def reidentify_with_fpe( string that was encrypted by Format Preserving Encryption (FPE). Args: project: The Google Cloud project id to use as a parent resource. - item: The string to deidentify (will be treated as text). + input_str: The string to deidentify (will be treated as text). alphabet: The set of characters to replace sensitive ones with. For more information, see https://cloud.google.com/dlp/docs/reference/ rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet @@ -255,7 +320,7 @@ def reidentify_with_fpe( } # Convert string to item - item = {"value": string} + item = {"value": input_str} # Call the API response = dlp.reidentify_content( @@ -531,6 +596,28 @@ def redact_sensitive_data(project, item, info_types): help="The character to mask matching sensitive data with.", ) + replace_parser = subparsers.add_parser( + "deid_replace", + help="Deidentify sensitive data in a string by replacing it with " + "another string.", + ) + replace_parser.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + replace_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + replace_parser.add_argument("item", help="The string to deidentify.") + replace_parser.add_argument("replacement_str", help="The string to " + "replace all matched values with.") + fpe_parser = subparsers.add_parser( "deid_fpe", help="Deidentify sensitive data in a string using Format Preserving " @@ -715,6 +802,13 @@ def redact_sensitive_data(project, item, info_types): masking_character=args.masking_character, number_to_mask=args.number_to_mask, ) + elif args.content == "deid_replace": + deidentify_with_replace( + args.project, + args.item, + args.info_types, + replacement_str=args.replacement_str, + ) elif args.content == "deid_fpe": deidentify_with_fpe( args.project, diff --git a/dlp/deid_test.py b/dlp/deid_test.py index db0c94e35dd6..8aa130be0eae 100644 --- a/dlp/deid_test.py +++ b/dlp/deid_test.py @@ -88,6 +88,16 @@ def test_deidentify_with_mask_masking_number_specified(capsys): assert "My SSN is *******27" in out +def test_deidentify_with_replace(capsys): + deid.deidentify_with_replace( + GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"], + replacement_str="REPLACEMENT_STR" + ) + + out, _ = capsys.readouterr() + assert "My SSN is REPLACEMENT_STR" in out + + def test_deidentify_with_fpe(capsys): deid.deidentify_with_fpe( GCLOUD_PROJECT,