Skip to content

Commit

Permalink
Add DLP sample for redacting all image text
Browse files Browse the repository at this point in the history
The sample shows how to remove all text found in an image with DLP.
The sample is integrated into the existing redact.py CLI application.
  • Loading branch information
sethmoo committed Jun 9, 2020
1 parent ee5be6d commit f17c23a
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 42 deletions.
30 changes: 7 additions & 23 deletions dlp/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -136,37 +136,21 @@ To run this sample:
$ python redact.py
usage: redact.py [-h] [--project PROJECT]
[--info_types INFO_TYPES [INFO_TYPES ...]]
[--min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}]
[--mime_type MIME_TYPE]
filename output_filename
usage: redact.py [-h] {info_types,all_text} ...
Sample app that uses the Data Loss Prevention API to redact the contents of an
image file.
positional arguments:
filename The path to the file to inspect.
output_filename The path to which the redacted image will be written.
{info_types,all_text}
Select which content should be redacted.
info_types Redact specific infoTypes from an image.
all_text Redact all text from an image. The MIME type of the
file is inferred via the Python standard library's
mimetypes module.
optional arguments:
-h, --help show this help message and exit
--project PROJECT The Google Cloud project id to use as a parent
resource.
--info_types INFO_TYPES [INFO_TYPES ...]
Strings representing info types to look for. A full
list of info categories and types is available from
the API. Examples include "FIRST_NAME", "LAST_NAME",
"EMAIL_ADDRESS". If unspecified, the three above
examples will be used.
--min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}
A string representing the minimum likelihood threshold
that constitutes a match.
--mime_type MIME_TYPE
The MIME type of the file. If not specified, the type
is inferred via the Python standard library's
mimetypes module.
Metadata
Expand Down
116 changes: 97 additions & 19 deletions dlp/redact.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,23 +121,87 @@ def redact_image(

# [END dlp_redact_image]

# [START dlp_redact_image_all_text]

if __name__ == "__main__":
default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")

parser = argparse.ArgumentParser(description=__doc__)
def redact_image_all_text(
    project,
    filename,
    output_filename,
):
    """Uses the Data Loss Prevention API to redact all text in an image.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        output_filename: The path to which the redacted image will be written.

    Returns:
        None; the redacted image is written to output_filename and a summary
        line with the number of bytes written is printed to the terminal.
    """
    # Import the client library. The versioned package is imported explicitly
    # because it is the module that is actually referenced below.
    import google.cloud.dlp_v2

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Construct the image_redaction_configs, indicating to DLP that all text in
    # the input image should be redacted.
    image_redaction_configs = [{
        "redact_all_text": True,
    }]

    # Construct the byte_item, containing the file's byte data.
    with open(filename, mode="rb") as f:
        byte_item = {"type": "IMAGE", "data": f.read()}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.redact_image(
        parent,
        image_redaction_configs=image_redaction_configs,
        byte_item=byte_item,
    )

    # Write out the results.
    with open(output_filename, mode="wb") as f:
        f.write(response.redacted_image)

    print("Wrote {byte_count} to {filename}".format(
        byte_count=len(response.redacted_image), filename=output_filename))


# [END dlp_redact_image_all_text]

if __name__ == "__main__":
default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")

common_args_parser = argparse.ArgumentParser(add_help=False)
common_args_parser.add_argument(
"--project",
help="The Google Cloud project id to use as a parent resource.",
default=default_project,
)
parser.add_argument(
common_args_parser.add_argument(
"filename", help="The path to the file to inspect.")
common_args_parser.add_argument(
"output_filename",
help="The path to which the redacted image will be written.",
)

parser = argparse.ArgumentParser(description=__doc__)
subparsers = parser.add_subparsers(
dest="content", help="Select which content should be redacted.")
subparsers.required = True

info_types_parser = subparsers.add_parser(
"info_types",
help="Redact specific infoTypes from an image.",
parents=[common_args_parser],
)
info_types_parser.add_argument(
"--info_types",
nargs="+",
help="Strings representing info types to look for. A full list of "
Expand All @@ -146,7 +210,7 @@ def redact_image(
"If unspecified, the three above examples will be used.",
default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
)
parser.add_argument(
info_types_parser.add_argument(
"--min_likelihood",
choices=[
"LIKELIHOOD_UNSPECIFIED",
Expand All @@ -159,19 +223,33 @@ def redact_image(
help="A string representing the minimum likelihood threshold that "
"constitutes a match.",
)
parser.add_argument(
info_types_parser.add_argument(
"--mime_type",
help="The MIME type of the file. If not specified, the type is "
"inferred via the Python standard library's mimetypes module.",
)

all_text_parser = subparsers.add_parser(
"all_text",
help="Redact all text from an image. The MIME type of the file is "
"inferred via the Python standard library's mimetypes module.",
parents=[common_args_parser],
)

args = parser.parse_args()

redact_image(
args.project,
args.filename,
args.output_filename,
args.info_types,
min_likelihood=args.min_likelihood,
mime_type=args.mime_type,
)
if args.content == "info_types":
redact_image(
args.project,
args.filename,
args.output_filename,
args.info_types,
min_likelihood=args.min_likelihood,
mime_type=args.mime_type,
)
elif args.content == "all_text":
redact_image_all_text(
args.project,
args.filename,
args.output_filename,
)
14 changes: 14 additions & 0 deletions dlp/redact_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,17 @@ def test_redact_image_file(tempdir, capsys):

out, _ = capsys.readouterr()
assert output_filepath in out


def test_redact_image_all_text(tempdir, capsys):
    """Redact all text in test.png and expect the output path on stdout."""
    source_image = os.path.join(RESOURCE_DIRECTORY, "test.png")
    redacted_image = os.path.join(tempdir, "redacted.png")

    redact.redact_image_all_text(GCLOUD_PROJECT, source_image, redacted_image)

    # The sample prints a summary line that includes the destination path.
    captured = capsys.readouterr()
    assert redacted_image in captured.out

0 comments on commit f17c23a

Please sign in to comment.