Closed
Description
Description of the bug
Check the attached PDFs (Original -> Annotated -> Redacted).
You can see that the word "insult" is partially redacted (its leading "ins" is removed):
Original -
"The woman whom it was intended to insult or whose privacy was intruded upon."
Redacted -
"The woman whom it was intended to ult or whose privacy was intruded upon."
How to reproduce the bug
Code
import fitz
import json
# NOTE(review): per the PyMuPDF docs this switches text extraction to reduced
# glyph bbox heights — confirm it is wanted for the redaction rectangles.
fitz.TOOLS.set_small_glyph_heights(True)
filepath = "original.pdf"  # plain literal: no placeholders, so no f-prefix needed
doc = fitz.open(filepath)
def get_redact_box(bbox) -> list[float]:
    """Return the coordinates of *bbox* rounded to two decimal places.

    Args:
        bbox: Iterable of numeric coordinates; the caller in this script
            passes a line's ``(x0, y0, x1, y1)`` bounding box.

    Returns:
        A new list holding each coordinate rounded to 2 decimals, in the
        same order as given.
    """
    return [round(coord, 2) for coord in bbox]
# Horizontal bounds used by redact_text_outside_bounds: a text line is redacted
# when its bbox ends at or left of LEFT_START, or starts at or right of RIGHT_END.
# NOTE(review): presumably in the same units as the bboxes from get_text — confirm.
LEFT_START = 115
RIGHT_END = 482
def redact_text_outside_bounds(page_number):
    """Redact every text line that lies entirely outside the horizontal bounds.

    A line is marked for redaction when its bounding box ends at or before
    ``LEFT_START`` or starts at or after ``RIGHT_END``. All redaction
    annotations added to the page are then applied in a single call.

    Args:
        page_number: 1-based number of the page in the module-level ``doc``.

    Returns:
        None. If ``page_number`` is out of range, prints
        "Invalid page number." and returns without touching the document.
    """
    if page_number < 1 or page_number > len(doc):
        print("Invalid page number.")
        return

    page = doc[page_number - 1]  # document pages are indexed from zero
    blocks = page.get_text("dict", sort=True)["blocks"]
    for block in blocks:
        # Image blocks in the "dict" output have no "lines" entry; skip them
        # instead of failing with KeyError.
        for line in block.get("lines", ()):
            bbox = line["bbox"]
            x0, _, x1, _ = bbox
            # x0 <= x1 ignores degenerate/inverted boxes before the bounds test.
            if x0 <= x1 and (x1 <= LEFT_START or x0 >= RIGHT_END):
                page.add_redact_annot(
                    get_redact_box(bbox),
                    fontname="helv",
                    fontsize=8,
                    align=fitz.TEXT_ALIGN_CENTER,
                )
    # Apply once, after every out-of-bounds line on the page has been annotated.
    page.apply_redactions()
if __name__ == "__main__":
    # Redact every page (page numbers are 1-based), then write the result
    # to a new file so the original stays untouched.
    for page_no in range(1, doc.page_count + 1):
        redact_text_outside_bounds(page_no)
    doc.save("test.pdf")
original.pdf
annotated.pdf
redacted.pdf
PyMuPDF version
1.23.26
Operating system
MacOS
Python version
3.12