From 4b9bd20fc131f4fccd837da687c988b247e5790d Mon Sep 17 00:00:00 2001
From: dinesh-crest
Date: Thu, 27 Jul 2023 15:44:57 +0530
Subject: [PATCH] DLP: Added sample for k anonymity with entity ID and deidentify cloud storage

Added unit test cases for same
---
 dlp/deIdentifyCloudStorage.js  | 175 +++++++++++++++++++++++++++++++++
 dlp/kAnonymityWithEntityIds.js | 153 ++++++++++++++++++++++++++++
 dlp/system-test/deid.test.js   | 106 ++++++++++++++++++++
 dlp/system-test/mockdata.js    | 157 +++++++++++++++++++++++++++++
 dlp/system-test/risk.test.js   |  89 +++++++++++++++++
 5 files changed, 680 insertions(+)
 create mode 100644 dlp/deIdentifyCloudStorage.js
 create mode 100644 dlp/kAnonymityWithEntityIds.js

diff --git a/dlp/deIdentifyCloudStorage.js b/dlp/deIdentifyCloudStorage.js
new file mode 100644
index 00000000000..cb600da2a35
--- /dev/null
+++ b/dlp/deIdentifyCloudStorage.js
@@ -0,0 +1,175 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+'use strict';
+
+// sample-metadata:
+// title: De-identify sensitive data in a Cloud Storage directory.
+// description: Uses the Data Loss Prevention API to de-identify sensitive data in a Cloud Storage directory.
+// usage: node deIdentifyCloudStorage.js projectId, inputDirectory, tableId, datasetId, outputDirectory, deidentifyTemplateId, structuredDeidentifyTemplateId, imageRedactTemplateId
+async function main(
+  projectId,
+  inputDirectory,
+  tableId,
+  datasetId,
+  outputDirectory,
+  deidentifyTemplateId,
+  structuredDeidentifyTemplateId,
+  imageRedactTemplateId
+) {
+  // [START dlp_deidentify_cloud_storage]
+  // Imports the Google Cloud client library
+  const DLP = require('@google-cloud/dlp');
+  // Instantiates a client
+  const dlp = new DLP.DlpServiceClient();
+
+  // The project ID to run the API call under
+  // const projectId = 'my-project';
+
+  // The Cloud Storage directory that needs to be inspected
+  // const inputDirectory = 'your-google-cloud-storage-path';
+
+  // The ID of the BigQuery dataset that stores the transformation details
+  // const datasetId = 'my_dataset';
+
+  // The ID of the BigQuery table that stores the transformation details
+  // const tableId = 'my_table';
+
+  // The Cloud Storage directory that will be used to store the de-identified files
+  // const outputDirectory = 'your-output-directory';
+
+  // The full resource name of the default de-identify template
+  // const deidentifyTemplateId = 'your-deidentify-template-id';
+
+  // The full resource name of the de-identify template for structured files
+  // const structuredDeidentifyTemplateId = 'your-structured-deidentify-template-id';
+
+  // The full resource name of the image redaction template for images
+  // const imageRedactTemplateId = 'your-image-redact-template-id';
+
+  async function deidentifyCloudStorage() {
+    // Specify storage configuration that uses file set.
+    const storageConfig = {
+      cloudStorageOptions: {
+        fileSet: {
+          url: inputDirectory,
+        },
+      },
+    };
+
+    // Specify the type of info the inspection will look for.
+    const infoTypes = [{name: 'PERSON_NAME'}, {name: 'EMAIL_ADDRESS'}];
+
+    // Construct inspect configuration
+    const inspectConfig = {
+      infoTypes: infoTypes,
+      includeQuote: true,
+    };
+
+    // Types of files to include for de-identification.
+    const fileTypesToTransform = [
+      {fileType: 'IMAGE'},
+      {fileType: 'CSV'},
+      {fileType: 'TEXT_FILE'},
+    ];
+
+    // Specify the BigQuery table to store the transformation details.
+    const transformationDetailsStorageConfig = {
+      table: {
+        projectId: projectId,
+        tableId: tableId,
+        datasetId: datasetId,
+      },
+    };
+
+    // Specify the de-identify template used for the transformation.
+    const transformationConfig = {
+      deidentifyTemplate: deidentifyTemplateId,
+      structuredDeidentifyTemplate: structuredDeidentifyTemplateId,
+      imageRedactTemplate: imageRedactTemplateId,
+    };
+
+    // Construct action to de-identify sensitive data.
+    const action = {
+      deidentify: {
+        cloudStorageOutput: outputDirectory,
+        transformationConfig: transformationConfig,
+        transformationDetailsStorageConfig: transformationDetailsStorageConfig,
+        fileTypes: fileTypesToTransform,
+      },
+    };
+
+    // Construct the inspect job configuration.
+    const inspectJobConfig = {
+      inspectConfig: inspectConfig,
+      storageConfig: storageConfig,
+      actions: [action],
+    };
+
+    // Construct the job creation request to be sent by the client.
+    const request = {
+      parent: `projects/${projectId}/locations/global`,
+      inspectJob: inspectJobConfig,
+    };
+    // Send the job creation request and process the response.
+    const [response] = await dlp.createDlpJob(request);
+    const jobName = response.name;
+
+    // Poll every 30 seconds, for at most 30 attempts (about 15 minutes).
+    let job;
+    for (let attempt = 0; attempt < 30; attempt++) {
+      // Fetch DLP Job status
+      [job] = await dlp.getDlpJob({name: jobName});
+      // Stop polling once the job reaches a terminal state.
+      if (job.state === 'DONE' || job.state === 'FAILED') {
+        break;
+      }
+      // Sleep before checking the job status again.
+      await new Promise(resolve => setTimeout(resolve, 30000));
+    }
+    if (job.state === 'FAILED') {
+      console.log('Job Failed, Please check the configuration.');
+      return;
+    }
+    // A job that is not DONE has no final results to print yet.
+    if (job.state !== 'DONE') {
+      console.log(`Job did not complete. Current state: ${job.state}`);
+      return;
+    }
+
+    // Print out the results.
+    const infoTypeStats = job.inspectDetails.result.infoTypeStats;
+    if (infoTypeStats.length > 0) {
+      infoTypeStats.forEach(infoTypeStat => {
+        console.log(
+          ` Found ${infoTypeStat.count} instance(s) of infoType ${infoTypeStat.infoType.name}.`
+        );
+      });
+    } else {
+      console.log('No findings.');
+    }
+  }
+  await deidentifyCloudStorage();
+  // [END dlp_deidentify_cloud_storage]
+}
+
+process.on('unhandledRejection', err => {
+  console.error(err.message);
+  process.exitCode = 1;
+});
+
+// TODO(developer): Please uncomment below line before running sample
+// main(...process.argv.slice(2));
+
+module.exports = main;
diff --git a/dlp/kAnonymityWithEntityIds.js b/dlp/kAnonymityWithEntityIds.js
new file mode 100644
index 00000000000..e51584c3def
--- /dev/null
+++ b/dlp/kAnonymityWithEntityIds.js
@@ -0,0 +1,153 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+'use strict';
+
+// sample-metadata:
+// title: Create a DLP job to visualize the k-anonymity re-identification risk analysis metric
+// description: Uses the Data Loss Prevention API to visualize the k-anonymity re-identification risk analysis metric.
+// usage: node kAnonymityWithEntityIds.js projectId, datasetId, sourceTableId, outputTableId
+async function main(projectId, datasetId, sourceTableId, outputTableId) {
+  // [START dlp_k_anonymity_with_entity_id]
+  // Imports the Google Cloud Data Loss Prevention library
+  const DLP = require('@google-cloud/dlp');
+
+  // Instantiates a client
+  const dlp = new DLP.DlpServiceClient();
+
+  // The project ID to run the API call under.
+  // const projectId = "your-project-id";
+
+  // The ID of the BigQuery dataset holding the source and output tables
+  // const datasetId = 'my_dataset';
+
+  // The ID of the table to inspect, e.g. 'my_table'
+  // const sourceTableId = 'my_source_table';
+
+  // The ID of the table where outputs are stored
+  // const outputTableId = 'my_output_table';
+
+  async function kAnonymityWithEntityIds() {
+    // Specify the BigQuery table to analyze.
+    const sourceTable = {
+      projectId: projectId,
+      datasetId: datasetId,
+      tableId: sourceTableId,
+    };
+
+    // Specify the entity ID field and the quasi-identifiers to analyze
+    const privacyMetric = {
+      kAnonymityConfig: {
+        entityId: {
+          field: {
+            name: 'Name',
+          },
+        },
+        quasiIds: [
+          {
+            name: 'Age',
+          },
+          {
+            name: 'Mystery',
+          },
+        ],
+      },
+    };
+    // Create action to save the analysis findings to a BigQuery table.
+    const action = [
+      {
+        saveFindings: {
+          outputConfig: {
+            table: {
+              projectId: projectId,
+              datasetId: datasetId,
+              tableId: outputTableId,
+            },
+          },
+        },
+      },
+    ];
+
+    // Configure the risk analysis job to perform.
+    const riskJob = {
+      sourceTable: sourceTable,
+      privacyMetric: privacyMetric,
+      actions: action,
+    };
+    // Combine configurations into a request for the service.
+    const request = {
+      parent: `projects/${projectId}/locations/global`,
+      riskJob: riskJob,
+    };
+
+    // Send the request and receive response from the service
+    const [createdDlpJob] = await dlp.createDlpJob(request);
+    const jobName = createdDlpJob.name;
+
+    // Poll every 30 seconds, for at most 30 attempts (about 15 minutes).
+    let job;
+    for (let attempt = 0; attempt < 30; attempt++) {
+      // Fetch DLP Job status
+      [job] = await dlp.getDlpJob({name: jobName});
+      // Stop polling once the job reaches a terminal state.
+      if (job.state === 'DONE' || job.state === 'FAILED') {
+        break;
+      }
+      // Sleep before checking the job status again.
+      await new Promise(resolve => setTimeout(resolve, 30000));
+    }
+    if (job.state === 'FAILED') {
+      console.log('Job Failed, Please check the configuration.');
+      return;
+    }
+    // A job that is not DONE has no final results to print yet.
+    if (job.state !== 'DONE') {
+      console.log(`Job did not complete. Current state: ${job.state}`);
+      return;
+    }
+
+    // Create helper function for unpacking values
+    const getValue = obj => obj[Object.keys(obj)[0]];
+
+    // Print out the results.
+    const histogramBuckets =
+      job.riskDetails.kAnonymityResult.equivalenceClassHistogramBuckets;
+
+    histogramBuckets.forEach((histogramBucket, histogramBucketIdx) => {
+      console.log(`Bucket ${histogramBucketIdx}:`);
+      console.log(
+        ` Bucket size range: [${histogramBucket.equivalenceClassSizeLowerBound}, ${histogramBucket.equivalenceClassSizeUpperBound}]`
+      );
+
+      histogramBucket.bucketValues.forEach(valueBucket => {
+        const quasiIdValues = valueBucket.quasiIdsValues
+          .map(getValue)
+          .join(', ');
+        console.log(` Quasi-ID values: {${quasiIdValues}}`);
+        console.log(` Class size: ${valueBucket.equivalenceClassSize}`);
+      });
+    });
+  }
+  await kAnonymityWithEntityIds();
+  // [END dlp_k_anonymity_with_entity_id]
+}
+process.on('unhandledRejection', err => {
+  console.error(err.message);
+  process.exitCode = 1;
+});
+
+// TODO(developer): Please uncomment below line before running sample
+// main(...process.argv.slice(2));
+
+module.exports = main;
diff --git a/dlp/system-test/deid.test.js b/dlp/system-test/deid.test.js
index 98f1eb45d68..33c9ec36ddf 100644
--- a/dlp/system-test/deid.test.js
+++ b/dlp/system-test/deid.test.js
@@ -37,6 +37,15 @@ const keyName = 'KEY_NAME';
 const wrappedKey = 'WRAPPED_KEY';
 const unwrappedKey = 'YWJjZGVmZ2hpamtsbW5vcA==';
 
+// Dummy resource names used in test cases that mock API calls.
+const inputDirectory = 'MOCK_INPUT_DIRECTORY';
+const datasetId = 'MOCK_DATASET_ID';
+const tableId = 'MOCK_TABLE_ID';
+const outputDirectory = 'MOCK_OUTPUT_DIRECTORY';
+const deidentifyTemplateId = 'MOCK_DEIDENTIFY_TEMPLATE';
+const structuredDeidentifyTemplateId = 'MOCK_STRUCTURED_DEIDENTIFY_TEMPLATE';
+const imageRedactTemplateId = 'MOCK_IMAGE_REDACT_TEMPLATE';
+
 const client = new DLP.DlpServiceClient();
 describe('deid', () => {
   let projectId;
@@ -682,4 +691,101 @@ describe('deid', () => {
     }
     assert.include(output, 'INVALID_ARGUMENT');
   });
+
+  // dlp_deidentify_cloud_storage
+  it('should de-identify a cloud storage directory', async () => {
+    const jobName = 'test-job-name';
+    const DATA_CONSTANTS = MOCK_DATA.DEIDENTIFY_CLOUD_STORAGE(
+      projectId,
+      inputDirectory,
+      tableId,
+      datasetId,
+      outputDirectory,
+      deidentifyTemplateId,
+      structuredDeidentifyTemplateId,
+      imageRedactTemplateId,
+      jobName
+    );
+    const mockCreateDlpJob = sinon.stub().resolves([{name: jobName}]);
+    sinon.replace(
+      DLP.DlpServiceClient.prototype,
+      'createDlpJob',
+      mockCreateDlpJob
+    );
+
+    const mockGetDlpJob = sinon.fake.resolves(
+      DATA_CONSTANTS.RESPONSE_GET_DLP_JOB_SUCCESS
+    );
+    sinon.replace(DLP.DlpServiceClient.prototype, 'getDlpJob', mockGetDlpJob);
+    const mockConsoleLog = sinon.stub();
+    sinon.replace(console, 'log', mockConsoleLog);
+
+    const deIdentifyCloudStorage = proxyquire('../deIdentifyCloudStorage', {
+      '@google-cloud/dlp': {DLP: DLP},
+    });
+
+    await deIdentifyCloudStorage(
+      projectId,
+      inputDirectory,
+      tableId,
+      datasetId,
+      outputDirectory,
+      deidentifyTemplateId,
+      structuredDeidentifyTemplateId,
+      imageRedactTemplateId
+    );
+    sinon.assert.calledOnceWithExactly(
+      mockCreateDlpJob,
+      DATA_CONSTANTS.REQUEST_CREATE_DLP_JOB
+    );
+    sinon.assert.calledOnce(mockGetDlpJob);
+  });
+
+  it('should handle error if inspect cloud storage job fails', async () => {
+    const jobName = 'test-job-name';
+    const DATA_CONSTANTS = MOCK_DATA.DEIDENTIFY_CLOUD_STORAGE(
+      projectId,
+      inputDirectory,
+      tableId,
+      datasetId,
+      outputDirectory,
+      deidentifyTemplateId,
+      structuredDeidentifyTemplateId,
+      imageRedactTemplateId,
+      jobName
+    );
+    const mockCreateDlpJob = sinon.stub().resolves([{name: jobName}]);
+    sinon.replace(
+      DLP.DlpServiceClient.prototype,
+      'createDlpJob',
+      mockCreateDlpJob
+    );
+
+    const mockGetDlpJob = sinon.fake.resolves(
+      DATA_CONSTANTS.RESPONSE_GET_DLP_JOB_FAILED
+    );
+    sinon.replace(DLP.DlpServiceClient.prototype, 'getDlpJob', mockGetDlpJob);
+    const mockConsoleLog = sinon.stub();
+    sinon.replace(console, 'log', mockConsoleLog);
+
+    const deIdentifyCloudStorage = proxyquire('../deIdentifyCloudStorage', {
+      '@google-cloud/dlp': {DLP: DLP},
+    });
+
+    await deIdentifyCloudStorage(
+      projectId,
+      inputDirectory,
+      tableId,
+      datasetId,
+      outputDirectory,
+      deidentifyTemplateId,
+      structuredDeidentifyTemplateId,
+      imageRedactTemplateId
+    );
+    sinon.assert.calledOnce(mockGetDlpJob);
+    sinon.assert.calledWithMatch(
+      mockConsoleLog,
+      'Job Failed, Please check the configuration.'
+    );
+  });
 });
diff --git a/dlp/system-test/mockdata.js b/dlp/system-test/mockdata.js
index 2996cde0bd8..2cadd59ca40 100644
--- a/dlp/system-test/mockdata.js
+++ b/dlp/system-test/mockdata.js
@@ -286,6 +286,163 @@ const MOCK_DATA = {
     },
     RESPONSE_REIDENTIFY_CONTENT: [{item: {value: ''}}],
   }),
+  DEIDENTIFY_CLOUD_STORAGE: (
+    projectId,
+    inputDirectory,
+    tableId,
+    datasetId,
+    outputDirectory,
+    deidentifyTemplateId,
+    structuredDeidentifyTemplateId,
+    imageRedactTemplateId,
+    jobName
+  ) => ({
+    REQUEST_CREATE_DLP_JOB: {
+      parent: `projects/${projectId}/locations/global`,
+      inspectJob: {
+        inspectConfig: {
+          infoTypes: [{name: 'PERSON_NAME'}, {name: 'EMAIL_ADDRESS'}],
+          includeQuote: true,
+        },
+        storageConfig: {
+          cloudStorageOptions: {
+            fileSet: {url: inputDirectory},
+          },
+        },
+        actions: [
+          {
+            deidentify: {
+              cloudStorageOutput: outputDirectory,
+              transformationConfig: {
+                deidentifyTemplate: deidentifyTemplateId,
+                structuredDeidentifyTemplate: structuredDeidentifyTemplateId,
+                imageRedactTemplate: imageRedactTemplateId,
+              },
+              transformationDetailsStorageConfig: {
+                table: {
+                  projectId: projectId,
+                  tableId: tableId,
+                  datasetId: datasetId,
+                },
+              },
+              fileTypes: [
+                {fileType: 'IMAGE'},
+                {fileType: 'CSV'},
+                {fileType: 'TEXT_FILE'},
+              ],
+            },
+          },
+        ],
+      },
+    },
+    RESPONSE_GET_DLP_JOB_SUCCESS: [
+      {
+        name: jobName,
+        state: 'DONE',
+        inspectDetails: {
+          result: {
+            infoTypeStats: [
+              {
+                count: 1,
+                infoType: {
+                  name: 'PERSON_NAME',
+                },
+              },
+            ],
+          },
+        },
+      },
+    ],
+    RESPONSE_GET_DLP_JOB_FAILED: [
+      {
+        name: jobName,
+        state: 'FAILED',
+        inspectDetails: {},
+      },
+    ],
+  }),
+  K_ANONYMITY_WITH_ENTITY_ID: (
+    projectId,
+    datasetId,
+    sourceTableId,
+    outputTableId,
+    jobName
+  ) => ({
+    REQUEST_CREATE_DLP_JOB: {
+      parent: `projects/${projectId}/locations/global`,
+      riskJob: {
+        sourceTable: {
+          projectId: projectId,
+          datasetId: datasetId,
+          tableId: sourceTableId,
+        },
+        privacyMetric: {
+          kAnonymityConfig: {
+            entityId: {field: {name: 'Name'}},
+            quasiIds: [{name: 'Age'}, {name: 'Mystery'}],
+          },
+        },
+        actions: [
+          {
+            saveFindings: {
+              outputConfig: {
+                table: {
+                  projectId: projectId,
+                  datasetId: datasetId,
+                  tableId: outputTableId,
+                },
+              },
+            },
+          },
+        ],
+      },
+    },
+    RESPONSE_GET_DLP_JOB_SUCCESS: [
+      {
+        name: jobName,
+        state: 'DONE',
+        riskDetails: {
+          kAnonymityResult: {
+            equivalenceClassHistogramBuckets: [
+              {
+                bucketValues: [
+                  {
+                    quasiIdsValues: [
+                      {
+                        stringValue: '["19","8291 3627 8250 1234"]',
+                        type: 'stringValue',
+                      },
+                    ],
+                    equivalenceClassSize: '1',
+                  },
+                  {
+                    quasiIdsValues: [
+                      {
+                        stringValue: '["27","4231 5555 6781 9876"]',
+                        type: 'stringValue',
+                      },
+                    ],
+                    equivalenceClassSize: '1',
+                  },
+                ],
+                equivalenceClassSizeLowerBound: '1',
+                equivalenceClassSizeUpperBound: '1',
+                bucketSize: '2',
+                bucketValueCount: '2',
+              },
+            ],
+          },
+        },
+      },
+    ],
+    RESPONSE_GET_DLP_JOB_FAILED: [
+      {
+        name: jobName,
+        state: 'FAILED',
+        riskDetails: {},
+      },
+    ],
+  }),
 };
 
 module.exports = {MOCK_DATA};
diff --git a/dlp/system-test/risk.test.js b/dlp/system-test/risk.test.js
index 8811926545c..6ecaeb45b28 100644
--- a/dlp/system-test/risk.test.js
+++ b/dlp/system-test/risk.test.js
@@ -20,6 +20,10 @@ const uuid = require('uuid');
 const {PubSub} = require('@google-cloud/pubsub');
 const cp = require('child_process');
 const DLP = require('@google-cloud/dlp');
+const proxyquire = require('proxyquire');
+const sinon = require('sinon');
+
+const {MOCK_DATA} = require('./mockdata');
 
 const execSync = cmd => {
   return cp.execSync(cmd, {
@@ -34,6 +38,11 @@ const numericField = 'Age';
 const pubsub = new PubSub();
 const client = new DLP.DlpServiceClient();
 
+// Dummy resource names used in test cases that mock API calls.
+const datasetId = 'MOCK_DATASET_ID';
+const sourceTableId = 'MOCK_SOURCE_TABLE';
+const outputTableId = 'MOCK_OUTPUT_TABLE';
+
 /*
  * The tests in this file rely on a table in BigQuery entitled
  * "integration_tests_dlp.harmful" with the following fields:
@@ -86,6 +95,7 @@ describe('risk', () => {
 
   // Delete risk analysis job created in the snippets.
   afterEach(async () => {
+    sinon.restore();
     const request = {
       name: jobName,
     };
@@ -233,4 +243,83 @@ describe('risk', () => {
     }
     assert.include(output, 'fail');
   });
+
+  // dlp_k_anonymity_with_entity_id
+  it('should perform k-anonymity analysis using entity ID', async () => {
+    const jobName = 'test-job-name';
+    const DATA_CONSTANTS = MOCK_DATA.K_ANONYMITY_WITH_ENTITY_ID(
+      projectId,
+      datasetId,
+      sourceTableId,
+      outputTableId,
+      jobName
+    );
+    const mockCreateDlpJob = sinon.stub().resolves([{name: jobName}]);
+    sinon.replace(
+      DLP.DlpServiceClient.prototype,
+      'createDlpJob',
+      mockCreateDlpJob
+    );
+
+    const mockGetDlpJob = sinon.fake.resolves(
+      DATA_CONSTANTS.RESPONSE_GET_DLP_JOB_SUCCESS
+    );
+    sinon.replace(DLP.DlpServiceClient.prototype, 'getDlpJob', mockGetDlpJob);
+    const mockConsoleLog = sinon.stub();
+    sinon.replace(console, 'log', mockConsoleLog);
+
+    const kAnonymityWithEntityIds = proxyquire('../kAnonymityWithEntityIds', {
+      '@google-cloud/dlp': {DLP: DLP},
+    });
+    await kAnonymityWithEntityIds(
+      projectId,
+      datasetId,
+      sourceTableId,
+      outputTableId
+    );
+    sinon.assert.calledOnceWithExactly(
+      mockCreateDlpJob,
+      DATA_CONSTANTS.REQUEST_CREATE_DLP_JOB
+    );
+    sinon.assert.calledOnce(mockGetDlpJob);
+  });
+
+  it('should handle error if risk job fails', async () => {
+    const jobName = 'test-job-name';
+    const DATA_CONSTANTS = MOCK_DATA.K_ANONYMITY_WITH_ENTITY_ID(
+      projectId,
+      datasetId,
+      sourceTableId,
+      outputTableId,
+      jobName
+    );
+    const mockCreateDlpJob = sinon.stub().resolves([{name: jobName}]);
+    sinon.replace(
+      DLP.DlpServiceClient.prototype,
+      'createDlpJob',
+      mockCreateDlpJob
+    );
+
+    const mockGetDlpJob = sinon.fake.resolves(
+      DATA_CONSTANTS.RESPONSE_GET_DLP_JOB_FAILED
+    );
+    sinon.replace(DLP.DlpServiceClient.prototype, 'getDlpJob', mockGetDlpJob);
+    const mockConsoleLog = sinon.stub();
+    sinon.replace(console, 'log', mockConsoleLog);
+
+    const kAnonymityWithEntityIds = proxyquire('../kAnonymityWithEntityIds', {
+      '@google-cloud/dlp': {DLP: DLP},
+    });
+    await kAnonymityWithEntityIds(
+      projectId,
+      datasetId,
+      sourceTableId,
+      outputTableId
+    );
+    sinon.assert.calledOnce(mockGetDlpJob);
+    sinon.assert.calledWithMatch(
+      mockConsoleLog,
+      'Job Failed, Please check the configuration.'
+    );
+  });
 });