-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Labels
documentation: Improvements or additions to documentation
Description
#!/usr/bin/env python3
"""
merge_pdf_pages.py
------------------
Converts a multi-page PDF into a single seamless long-page PDF by:
1. Rendering every page at high DPI
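2. Detecting and trimming the blank top/bottom margins at every page seam (the top margin of page 1 is kept as-is)
3. Stitching all pages together (no gap between them)
4. Cropping trailing whitespace below the last content line, leaving a bottom margin that mirrors the top margin of page 1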
5. Saving the result as a single-page PDF
Usage:
python merge_pdf_pages.py input.pdf [output.pdf] [--dpi 300] [--threshold 245] [--padding 6]
"""
import argparse
import sys
from pathlib import Path
from pdf2image import convert_from_path
from PIL import Image, ImageChops
import numpy as np
# ── helpers ────────────────────────────────────────────────────────────────────
def to_gray(img: Image.Image) -> np.ndarray:
    """Return the luminance channel of *img* as a 2-D numpy array.

    The image is first converted to Pillow's grayscale mode ("L") and the
    result is then copied into a numpy array.
    """
    luminance = img.convert("L")
    return np.array(luminance)
def first_content_row(gray: np.ndarray, threshold: int) -> int:
    """Find the topmost row that contains a pixel darker than *threshold*.

    Rows are scanned from the top and the scan stops at the first hit.
    Falls back to 0 when no row qualifies (for example a fully blank page).
    """
    # Lazy generator: rows after the first match are never inspected.
    hits = (idx for idx, line in enumerate(gray) if np.any(line < threshold))
    return next(hits, 0)
def last_content_row(gray: np.ndarray, threshold: int) -> int:
    """Find the bottommost row that contains a pixel darker than *threshold*.

    Rows are scanned from the bottom and the scan stops at the first hit.
    Falls back to the index of the final row (``len(gray) - 1``) when no
    row qualifies.
    """
    n_rows = len(gray)
    # Lazy generator walking upwards: rows above the first match are skipped.
    hits = (
        idx for idx in reversed(range(n_rows)) if np.any(gray[idx] < threshold)
    )
    return next(hits, n_rows - 1)
def trim_page(img: Image.Image, threshold: int, padding: int = 4) -> Image.Image:
    """Return *img* with its blank top and bottom bands cropped away.

    The crop keeps the full width. Vertically it runs from the first to the
    last row holding a pixel darker than *threshold*, widened by *padding*
    pixels on each side and clamped to the image bounds.
    """
    luminance = to_gray(img)
    first = first_content_row(luminance, threshold)
    last = last_content_row(luminance, threshold)

    upper = max(0, first - padding)
    lower = min(img.height, last + 1 + padding)  # +1: crop's lower bound is exclusive
    box = (0, upper, img.width, lower)
    return img.crop(box)
# ── main ───────────────────────────────────────────────────────────────────────
def merge_pdf(
    input_path: str,
    output_path: str,
    dpi: int = 300,
    threshold: int = 245,
    padding: int = 6,
) -> None:
    """Render *input_path* and save all of its pages as one tall PDF page.

    Every page is rasterised at *dpi*, cropped vertically to its content and
    pasted below the previous page on a white canvas, which is then written
    to *output_path* as a single-page PDF.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path of the single-page PDF to write.
        dpi: Rendering resolution, also stored as the resolution of the output.
        threshold: Luminance below which a pixel counts as content.
        padding: Accepted for interface compatibility but not applied here;
            every seam is cropped flush to the content.

    Exits the process with status 1 (via ``sys.exit``) when the input yields
    no pages.
    """
    print(f"Reading: {input_path}")
    pages = convert_from_path(input_path, dpi=dpi)
    print(f" {len(pages)} page(s) found")
    if not pages:
        print("No pages found – aborting.")
        sys.exit(1)

    # Measure the top margin from page 1 so we can mirror it at the bottom of the last page
    first_gray = to_gray(pages[0])
    top_margin_px = first_content_row(first_gray, threshold)
    print(f" Detected top margin: {top_margin_px}px (will be mirrored at bottom of last page)")

    last_index = len(pages) - 1
    trimmed: list[Image.Image] = []
    for i, page in enumerate(pages):
        # Page 1 was already converted above; reuse it instead of converting twice.
        gray = first_gray if i == 0 else to_gray(page)

        # Page 1 keeps its original top margin; later pages are trimmed right
        # to the content edge so the seam is flush (no padding).
        top = 0 if i == 0 else first_content_row(gray, threshold)

        content_end = last_content_row(gray, threshold) + 1
        if i == last_index:
            # Trim trailing whitespace, then keep the same margin as the top of
            # page 1. The value is clamped to the page height, so the margin
            # is shorter when content runs close to the bottom of the page.
            bottom = min(page.height, content_end + top_margin_px)
        else:
            # Trim right to content edge so the seam is flush (no padding)
            bottom = content_end

        cropped = page.crop((0, top, page.width, bottom))
        trimmed.append(cropped)
        print(f" Page {i + 1}: height {page.height}px → {cropped.height}px (top={top}, bottom={bottom})")

    # Stitch vertically. The canvas is as wide as the widest page so that
    # pages of differing sizes are not clipped; narrower pages are
    # left-aligned on the white background.
    total_height = sum(img.height for img in trimmed)
    width = max(img.width for img in trimmed)
    canvas = Image.new("RGB", (width, total_height), color=(255, 255, 255))
    y_offset = 0
    for img in trimmed:
        canvas.paste(img, (0, y_offset))
        y_offset += img.height

    # Save as PDF (Pillow can save a PIL image directly as PDF)
    canvas.save(output_path, "PDF", resolution=dpi)
    print(f"\nSaved → {output_path} ({width}×{total_height}px at {dpi} dpi)")
# ── CLI ────────────────────────────────────────────────────────────────────────
def main(argv=None):
    """Parse command-line arguments and run ``merge_pdf``.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default), argparse reads the process command line.

    Invalid ``--dpi`` or ``--threshold`` values are rejected through
    ``parser.error``, which prints a usage message and exits with status 2.
    """
    parser = argparse.ArgumentParser(
        description="Merge a multi-page PDF into a single seamless long page."
    )
    parser.add_argument("input", help="Input PDF path")
    parser.add_argument(
        "output",
        nargs="?",
        help="Output PDF path (default: <input>_merged.pdf)",
    )
    parser.add_argument(
        "--dpi",
        type=int,
        default=300,
        help="Rendering DPI – higher = sharper but larger file (default: 300)",
    )
    parser.add_argument(
        "--threshold",
        type=int,
        default=245,
        help="Pixel luminance threshold for 'blank' detection (0-255, default: 245)",
    )
    parser.add_argument(
        "--padding",
        type=int,
        default=6,
        help="Pixels of padding to keep around content edges (default: 6)",
    )
    args = parser.parse_args(argv)

    # Fail early with a clear message instead of an obscure error during
    # rendering or a silently meaningless blank-detection threshold.
    if args.dpi <= 0:
        parser.error("--dpi must be a positive integer")
    if not 0 <= args.threshold <= 255:
        parser.error("--threshold must be between 0 and 255")

    input_path = args.input
    # Default output: the input path with its extension replaced by "_merged.pdf".
    output_path = args.output or str(Path(input_path).with_suffix("")) + "_merged.pdf"
    merge_pdf(
        input_path=input_path,
        output_path=output_path,
        dpi=args.dpi,
        threshold=args.threshold,
        padding=args.padding,
    )


if __name__ == "__main__":
    main()
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
documentation: Improvements or additions to documentation