Skip to content

Store Python Scripts #12

@AbdulDevHub

Description

@AbdulDevHub
#!/usr/bin/env python3
"""
merge_pdf_pages.py
------------------
Converts a multi-page PDF into a single seamless long-page PDF by:
  1. Rendering every page at high DPI
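  2. Detecting the blank top/bottom margins of each page and trimming them
     at every seam (the top margin of page 1 is kept as-is)
  3. Stitching all pages together (no gap between them)
  4. Cropping trailing whitespace below the last content line, leaving a
     bottom margin that mirrors the top margin of page 1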
  5. Saving the result as a single-page PDF

Usage:
    python merge_pdf_pages.py input.pdf [output.pdf] [--dpi 300] [--threshold 245] [--padding 6]
"""

import argparse
import sys
from pathlib import Path

from pdf2image import convert_from_path
from PIL import Image, ImageChops
import numpy as np


# ── helpers ────────────────────────────────────────────────────────────────────

def to_gray(img: Image.Image) -> np.ndarray:
    """Return the luminance values of *img* as a 2-D numpy array.

    The image is first converted to Pillow's single-channel "L" mode, so each
    array element holds one brightness value for one pixel.
    """
    luminance = img.convert("L")
    return np.array(luminance)


def first_content_row(gray: np.ndarray, threshold: int) -> int:
    """Find the topmost row that contains content.

    A row counts as content when at least one of its values is strictly
    below ``threshold``.

    Args:
        gray: Luminance array, scanned row by row starting at index 0.
        threshold: Values below this are treated as dark (non-blank).

    Returns:
        Index of the first content row, or 0 when no row has content.
    """
    content_rows = (
        idx for idx, line in enumerate(gray) if np.any(line < threshold)
    )
    # The generator stops at the first hit; 0 is the "nothing found" fallback.
    return next(content_rows, 0)


def last_content_row(gray: np.ndarray, threshold: int) -> int:
    """Find the bottommost row that contains content.

    A row counts as content when at least one of its values is strictly
    below ``threshold``.

    Args:
        gray: Luminance array, scanned row by row from the last index upwards.
        threshold: Values below this are treated as dark (non-blank).

    Returns:
        Index of the last content row, or ``len(gray) - 1`` when no row has
        content.
    """
    row_count = len(gray)
    content_rows = (
        idx
        for idx in reversed(range(row_count))
        if np.any(gray[idx] < threshold)
    )
    # The generator stops at the first hit; the final row is the fallback.
    return next(content_rows, row_count - 1)


def trim_page(img: Image.Image, threshold: int, padding: int = 4) -> Image.Image:
    """Return *img* with its blank top and bottom rows cropped away.

    Args:
        img: Page image to trim; its full width is always kept.
        threshold: Luminance below which a pixel counts as content.
        padding: Number of rows to keep above the first and below the last
            content row, clamped to the image bounds.

    Returns:
        The cropped image.
    """
    luminance = to_gray(img)
    first = first_content_row(luminance, threshold)
    last = last_content_row(luminance, threshold)

    # Crop boxes are exclusive at the bottom edge, hence the extra +1.
    upper = max(first - padding, 0)
    lower = min(last + padding + 1, img.height)
    box = (0, upper, img.width, lower)
    return img.crop(box)


# ── main ───────────────────────────────────────────────────────────────────────

def merge_pdf(
    input_path: str,
    output_path: str,
    dpi: int = 300,
    threshold: int = 245,
    padding: int = 6,
) -> None:
    """Render a PDF and save all of its pages as one long single-page PDF.

    Every page is rendered at ``dpi`` and cropped vertically so that
    consecutive pages meet flush at the seams:

    * page 1 keeps its original top margin;
    * every other page is cropped at its first content row;
    * every page except the last is cropped right after its last content row;
    * the last page keeps a bottom margin as tall as the top margin of
      page 1, limited to the rows that page actually has.

    A page on which no pixel is darker than ``threshold`` is not trimmed,
    because the row detectors fall back to the first and last row.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path of the PDF to write.
        dpi: Rendering resolution, also stored as the output resolution.
        threshold: Luminance below which a pixel counts as content.
        padding: Accepted for interface compatibility but not used; the
            seams are always flush.

    Raises:
        SystemExit: With status 1 when the input yields no pages.
    """
    print(f"Reading: {input_path}")
    pages = convert_from_path(input_path, dpi=dpi)
    print(f"  {len(pages)} page(s) found")

    if not pages:
        print("No pages found – aborting.")
        sys.exit(1)

    last_index = len(pages) - 1
    top_margin_px = 0
    trimmed: list[Image.Image] = []
    for i, page in enumerate(pages):
        gray = to_gray(page)

        if i == 0:
            # Measure the top margin of page 1 here (reusing its grayscale
            # array) so it can be mirrored at the bottom of the last page.
            top_margin_px = first_content_row(gray, threshold)
            print(f"  Detected top margin: {top_margin_px}px (will be mirrored at bottom of last page)")
            # Preserve the original top margin exactly as-is
            top = 0
        else:
            # Trim right to content edge so the seam is flush (no padding)
            top = first_content_row(gray, threshold)

        if i == last_index:
            # Trim trailing whitespace then add the same margin as the top of page 1
            last_row = last_content_row(gray, threshold)
            bottom = min(page.height, last_row + 1 + top_margin_px)
        else:
            # Trim right to content edge so the seam is flush (no padding)
            bottom = last_content_row(gray, threshold) + 1

        cropped = page.crop((0, top, page.width, bottom))
        trimmed.append(cropped)
        print(f"  Page {i + 1}: height {page.height}px → {cropped.height}px  (top={top}, bottom={bottom})")

    # Stitch vertically. The canvas is as wide as the widest page so that no
    # page is clipped on the right; narrower pages stay left-aligned on the
    # white background.
    total_height = sum(img.height for img in trimmed)
    width = max(img.width for img in trimmed)
    canvas = Image.new("RGB", (width, total_height), color=(255, 255, 255))

    y_offset = 0
    for img in trimmed:
        canvas.paste(img, (0, y_offset))
        y_offset += img.height

    # Save as PDF (Pillow can save a PIL image directly as PDF)
    canvas.save(output_path, "PDF", resolution=dpi)
    print(f"\nSaved → {output_path}  ({width}×{total_height}px at {dpi} dpi)")


# ── CLI ────────────────────────────────────────────────────────────────────────

def main():
    """Parse the command line, validate it and run ``merge_pdf``.

    Invalid values are reported through ``parser.error``, which prints a
    usage message and exits with status 2, instead of surfacing later as a
    traceback from the rendering step.

    When no output path is given, the result is written next to the input
    as ``<input stem>_merged.pdf``.
    """
    parser = argparse.ArgumentParser(
        description="Merge a multi-page PDF into a single seamless long page."
    )
    parser.add_argument("input", help="Input PDF path")
    parser.add_argument(
        "output",
        nargs="?",
        help="Output PDF path (default: <input>_merged.pdf)",
    )
    parser.add_argument(
        "--dpi",
        type=int,
        default=300,
        help="Rendering DPI – higher = sharper but larger file (default: 300)",
    )
    parser.add_argument(
        "--threshold",
        type=int,
        default=245,
        help="Pixel luminance threshold for 'blank' detection (0-255, default: 245)",
    )
    parser.add_argument(
        "--padding",
        type=int,
        default=6,
        help=(
            "Accepted for compatibility but currently ignored: "
            "page seams are always flush (default: 6)"
        ),
    )
    args = parser.parse_args()

    # Reject values the renderer or the blank detection cannot work with.
    if args.dpi <= 0:
        parser.error("--dpi must be a positive integer")
    if not 0 <= args.threshold <= 255:
        parser.error("--threshold must be between 0 and 255")

    source = Path(args.input)
    if not source.is_file():
        parser.error(f"input file not found: {args.input}")

    input_path = args.input
    # Only the final suffix is dropped before "_merged.pdf" is appended.
    output_path = args.output or str(source.with_name(f"{source.stem}_merged.pdf"))

    merge_pdf(
        input_path=input_path,
        output_path=output_path,
        dpi=args.dpi,
        threshold=args.threshold,
        padding=args.padding,
    )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Metadata

Metadata

Assignees

Labels

documentation – Improvements or additions to documentation

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions