@@ -914,6 +914,137 @@ def write_data(data: types.storage.Value) -> str:
914914# [END dlp_deidentify_date_shift]
915915
916916
917+ # [START dlp_deidentify_time_extract]
918+ import csv # noqa: F811, E402, I100
919+ from datetime import datetime # noqa: F811, E402, I100
920+ from typing import List # noqa: F811, E402
921+
922+ import google .cloud .dlp # noqa: F811, E402
923+
924+
def deidentify_with_time_extract(
    project: str,
    date_fields: List[str],
    input_csv_file: str,
    output_csv_file: str,
) -> None:
    """Uses the Data Loss Prevention API to deidentify dates in a CSV file through
    time part extraction.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        date_fields: A list of (date) fields in CSV file to de-identify
            through time extraction. Example: ['birth_date', 'register_date'].
            Date values in format: mm/DD/YYYY are considered as part of this
            sample.
        input_csv_file: The path to the CSV file to deidentify. The first row
            of the file must specify column names, and all other rows must
            contain valid values.
        output_csv_file: The output file path to save the time extracted data.
    """

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the date field names to Protobuf-compatible FieldId dicts.
    # A list (not a lazy map object) so the payload can be safely re-read.
    date_fields = [{"name": field} for field in date_fields] if date_fields else []

    # Read the whole input file up front. newline="" is required by the csv
    # module so that embedded newlines inside quoted fields survive.
    with open(input_csv_file, newline="") as csvfile:
        csv_lines = list(csv.reader(csvfile))

    # Helper functions for converting CSV cells to Protobuf types.
    def map_headers(header):
        return {"name": header}

    def map_data(value):
        # Cells matching mm/dd/YYYY become DateValue dicts; anything else is
        # passed through unchanged as a plain string value.
        try:
            date = datetime.strptime(value, "%m/%d/%Y")
            return {
                "date_value": {
                    "year": date.year, "month": date.month, "day": date.day
                }
            }
        except ValueError:
            return {"string_value": value}

    def map_rows(row):
        return {"values": [map_data(value) for value in row]}

    # Using the helper functions, convert CSV rows to protobuf-compatible
    # dictionaries. First row carries the column names.
    csv_headers = [map_headers(header) for header in csv_lines[0]]
    csv_rows = [map_rows(row) for row in csv_lines[1:]]

    # Construct the table dictionary.
    table = {"headers": csv_headers, "rows": csv_rows}

    # Construct the `item` for table to de-identify.
    item = {"table": table}

    # Construct deidentify configuration dictionary: keep only the YEAR part
    # of each configured date field.
    deidentify_config = {
        "record_transformations": {
            "field_transformations": [
                {
                    "primitive_transformation": {
                        "time_part_config": {
                            "part_to_extract": "YEAR"
                        }
                    },
                    "fields": date_fields,
                }
            ]
        }
    }

    # Write to CSV helper methods.
    def write_header(header):
        return header.name

    def write_data(data):
        # Non-date cells round-trip as strings; date cells are re-rendered
        # in the same mm/dd/YYYY layout the input used.
        return data.string_value or "{}/{}/{}".format(
            data.date_value.month,
            data.date_value.day,
            data.date_value.year,
        )

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # Call the API.
    response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "item": item,
        }
    )

    # Print the result.
    print("Table after de-identification: {}".format(response.item.table))

    # Write results to CSV file. newline="" stops csv.writer from emitting
    # blank rows on platforms with \r\n line endings.
    with open(output_csv_file, "w", newline="") as csvfile:
        write_file = csv.writer(csvfile, delimiter=",")
        write_file.writerow(map(write_header, response.item.table.headers))
        for row in response.item.table.rows:
            write_file.writerow(map(write_data, row.values))

    # Print status.
    print(f"Successfully saved date-extracted output to {output_csv_file}")
1043+
1044+
1045+ # [END dlp_deidentify_time_extract]
1046+
1047+
9171048# [START dlp_deidentify_replace_infotype]
9181049from typing import List # noqa: F811, E402, I100
9191050
@@ -2124,6 +2255,30 @@ def deidentify_table_with_multiple_crypto_hash(
21242255 "key_name." ,
21252256 )
21262257
# Sub-command: de-identify dates in a CSV file via time-part extraction.
time_extract_parser = subparsers.add_parser(
    "deid_time_extract",
    help="Deidentify dates in a CSV file by extracting a date part.",
)
# Positional arguments, declared data-first so the order is explicit.
_time_extract_args = (
    (
        "project",
        {"help": "The Google Cloud project id to use as a parent resource."},
    ),
    (
        "input_csv_file",
        {
            "help": "The path to the CSV file to deidentify. The first row of the "
            "file must specify column names, and all other rows must contain "
            "valid values."
        },
    ),
    (
        "date_fields",
        {
            "nargs": "+",
            "help": "The list of date fields in the CSV file to de-identify. Example: "
            "['birth_date', 'register_date']",
        },
    ),
    (
        "output_csv_file",
        {"help": "The path to save the time-extracted data."},
    ),
)
for _arg_name, _arg_kwargs in _time_extract_args:
    time_extract_parser.add_argument(_arg_name, **_arg_kwargs)
21272282 replace_with_infotype_parser = subparsers .add_parser (
21282283 "replace_with_infotype" ,
21292284 help = "Deidentify sensitive data in a string by replacing it with the "
@@ -2485,6 +2640,13 @@ def deidentify_table_with_multiple_crypto_hash(
24852640 wrapped_key = args .wrapped_key ,
24862641 key_name = args .key_name ,
24872642 )
2643+ elif args .content == "deid_time_extract" :
2644+ deidentify_with_time_extract (
2645+ args .project ,
2646+ date_fields = args .date_fields ,
2647+ input_csv_file = args .input_csv_file ,
2648+ output_csv_file = args .output_csv_file ,
2649+ )
24882650 elif args .content == "replace_with_infotype" :
24892651 deidentify_with_replace_infotype (
24902652 args .project ,
0 commit comments