Skip to content

Commit

Permalink
Add our own method for determining the bounding box
Browse files Browse the repository at this point in the history
I found some pdfs where pdfcrop/ghostscript determined
the bounding box incorrectly. With this commit we introduce
a simple way to determine the bounding box by finding the
smallest rectangle that is completely surrounded by white
pixels. This will work well for most PDFs. The centering
functionality is also improved, and now correctly centers
the PDF on the reMarkable both vertically and horizontally.
  • Loading branch information
GjjvdBurg committed Jul 5, 2019
1 parent 2dc8700 commit 4cb3af8
Show file tree
Hide file tree
Showing 3 changed files with 256 additions and 36 deletions.
204 changes: 169 additions & 35 deletions arxiv2remarkable.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import bs4
import datetime
import os
import pdfplumber
import re
import requests
import shutil
Expand Down Expand Up @@ -118,37 +119,6 @@ def create_filename(self, info, filename=None):
self.log("Created filename: %s" % name)
return name

def center_pdf(self, filepath):
    """Center the PDF by cropping it with an enlarged left margin.

    Returns ``filepath`` unchanged when centering is disabled
    (``self.center`` is falsy), when the pdfcrop call exits with a non-zero
    status, or when the expected ``<name>-crop.pdf`` file does not exist.
    Otherwise returns the path of that ``-crop.pdf`` file.
    """
    if not self.center:
        return filepath

    # The media box of the first page gives the page dimensions.
    first_page_box = PyPDF2.PdfFileReader(filepath).getPage(0).mediaBox
    page_width = first_page_box[2] - first_page_box[0]
    page_height = first_page_box[3] - first_page_box[1]

    # Extra horizontal space derived from the page size and the
    # RM_WIDTH / RM_HEIGHT constants; half of it (plus the base margin of
    # 15) goes on the left.
    extra_space = (page_height * RM_WIDTH - page_width * RM_HEIGHT) / RM_HEIGHT
    left = extra_space / 2 + 15

    self.log("Centering PDF file")
    command = [
        self.pdfcrop_path,
        "--margins",
        "%i 40 15 15" % left,
        filepath,
    ]
    exit_code = subprocess.call(command, stdout=subprocess.DEVNULL)
    if exit_code != 0:
        self.warn("Failed to crop the pdf file at: %s" % filepath)
        return filepath

    # The cropped output is expected next to the input file.
    expected = os.path.splitext(filepath)[0] + "-crop.pdf"
    if os.path.exists(expected):
        return expected
    self.warn("Can't find centered file '%s' where expected." % expected)
    return filepath

def blank_pdf(self, filepath):
if not self.blank:
return filepath
Expand All @@ -167,21 +137,42 @@ def blank_pdf(self, filepath):

def crop_pdf(self, filepath):
# Crop the PDF at `filepath`; every early exit below returns `filepath`
# itself, and the final statement returns the "-crop.pdf" sibling path.
#
# NOTE(review): this span was extracted from a rendered diff that lost its
# +/- markers and indentation, so it appears to interleave the removed and
# the added side of the change. The unclosed `subprocess.call(` with the
# "--margins" argument and the second `cropped_file` assignment look like
# the removed lines; the `Cropper(...)` call looks like their replacement.
# TODO confirm against the actual commit before relying on this text.
self.log("Cropping pdf file")
status = subprocess.call(
[self.pdfcrop_path, "--margins", "15 40 15 15", filepath],
stdout=subprocess.DEVNULL,
# Output path: the input path with its extension replaced by "-crop.pdf".
cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
cropper = Cropper(
filepath, cropped_file, pdfcrop_path=self.pdfcrop_path
)
status = cropper.crop(margins=15)

# A non-zero status is treated as failure: warn and fall back to the input.
if not status == 0:
self.warn("Failed to crop the pdf file at: %s" % filepath)
return filepath
cropped_file = os.path.splitext(filepath)[0] + "-crop.pdf"
# Guard against a zero status that nevertheless produced no output file.
if not os.path.exists(cropped_file):
self.warn(
"Can't find cropped file '%s' where expected." % cropped_file
)
return filepath
return cropped_file

def center_pdf(self, filepath):
    """Center the PDF with a ``Cropper`` and return the resulting path.

    Returns ``filepath`` unchanged when centering is disabled
    (``self.center`` is falsy), when ``Cropper.center()`` reports a non-zero
    status, or when the ``<name>-center.pdf`` output file does not exist.
    Otherwise returns the path of the ``-center.pdf`` file.
    """
    if not self.center:
        return filepath

    self.log("Centering PDF file")
    # Output goes next to the input, with "-center.pdf" replacing the
    # original extension.
    target = os.path.splitext(filepath)[0] + "-center.pdf"
    exit_code = Cropper(
        filepath, target, pdfcrop_path=self.pdfcrop_path
    ).center()
    if exit_code != 0:
        self.warn("Failed to center the pdf file at: %s" % filepath)
        return filepath

    if os.path.exists(target):
        return target
    self.warn("Can't find centered file '%s' where expected." % target)
    return filepath

def shrink_pdf(self, filepath):
self.log("Shrinking pdf file")
output_file = os.path.splitext(filepath)[0] + "-shrink.pdf"
Expand Down Expand Up @@ -602,6 +593,149 @@ def create_filename(self, info, filename=None):
return filename


class Cropper(object):
    """Crop or center every page of a PDF using a pixel-based bounding box.

    Each page is exported to its own single-page PDF and rendered to an
    image. The smallest rectangle surrounded by white pixels is located in
    that image, and the page is cropped to it with the external ``pdfcrop``
    tool. Processed pages are collected in a PyPDF2 writer and written to
    ``output_file`` once all pages succeeded.
    """

    def __init__(
        self, input_file=None, output_file=None, pdfcrop_path="pdfcrop"
    ):
        """Set up the reader for ``input_file`` and an empty output writer.

        input_file: path of the PDF to process (stored as absolute path).
        output_file: path the processed PDF is written to (absolute path).
        pdfcrop_path: executable used to apply the bounding box.
        """
        # Always define the attributes so an omitted file shows up as None
        # instead of as a missing attribute later on.
        self.input_file = None
        self.reader = None
        self.output_file = None
        if input_file is not None:
            self.input_file = os.path.abspath(input_file)
            self.reader = PyPDF2.PdfFileReader(self.input_file)
        if output_file is not None:
            self.output_file = os.path.abspath(output_file)
        self.pdfcrop_path = pdfcrop_path

        self.writer = PyPDF2.PdfFileWriter()

    def crop(self, margins=1):
        """Crop all pages, keeping ``margins`` around the content.

        Returns 0 on success, otherwise the first non-zero page status.
        """
        return self.process_file(self.crop_page, margins=margins)

    def center(self, padding=15):
        """Center all pages with at least ``padding`` around the content.

        Returns 0 on success, otherwise the first non-zero page status.
        """
        return self.process_file(self.center_page, padding=padding)

    def process_file(self, page_func, *args, **kwargs):
        """Apply ``page_func`` to every page index and write the output.

        Processing stops at the first page whose status is non-zero; that
        status is returned and no output file is written. Returns 0 after
        the output file has been written.
        """
        for page_idx in range(self.reader.getNumPages()):
            status = page_func(page_idx, *args, **kwargs)
            if status != 0:
                return status
        with open(self.output_file, "wb") as fp:
            self.writer.write(fp)
        return 0

    def center_page(self, page_idx, padding):
        """Center a single page; returns the status of ``process_page``."""
        return self.process_page(
            page_idx, self.get_center_bbox, padding=padding
        )

    def crop_page(self, page_idx, margins):
        """Crop a single page; returns the status of ``process_page``."""
        return self.process_page(page_idx, self.get_bbox, margins=margins)

    def export_page(self, page_idx, filename="./page.pdf"):
        """Helper function that exports a single page given by index.

        The page is written as a one-page PDF to ``filename`` (the default
        keeps the historical ``./page.pdf`` location) and that path is
        returned. The caller is responsible for removing the file.
        """
        page = self.reader.getPage(page_idx)
        writer = PyPDF2.PdfFileWriter()
        writer.addPage(page)
        with open(filename, "wb") as fp:
            writer.write(fp)
        return filename

    def process_page(self, page_idx, bbox_func, *args, **kwargs):
        """Process a single page and add it to the writer.

        ``bbox_func(filename, *args, **kwargs)`` must return the bounding
        box ``[left, bottom, right, top]`` that is handed to ``pdfcrop
        --bbox``. Returns 0 on success, or the non-zero pdfcrop exit status.
        """
        # Imported locally so this method does not depend on the module's
        # import block.
        import tempfile

        # Use a private scratch directory instead of fixed names in the
        # working directory: nothing of the user's gets overwritten, and the
        # intermediate files are removed on every exit path, including a
        # failing pdfcrop call or an exception.
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpfname = self.export_page(
                page_idx, os.path.join(tmpdir, "page.pdf")
            )
            tmpfout = os.path.join(tmpdir, "output.pdf")
            bbox = bbox_func(tmpfname, *args, **kwargs)
            status = subprocess.call(
                [
                    self.pdfcrop_path,
                    "--bbox",
                    " ".join(map(str, bbox)),
                    tmpfname,
                    tmpfout,
                ],
                stdout=subprocess.DEVNULL,
            )
            if status != 0:
                return status
            # As before, the cropped page is added to the writer before its
            # temporary file is removed.
            reader = PyPDF2.PdfFileReader(tmpfout)
            self.writer.addPage(reader.getPage(0))
        return 0

    @staticmethod
    def _content_bbox(pixels, width, height):
        """Return the margin-free content box of a rendered page.

        ``pixels`` is the row-major sequence of per-pixel channel tuples of
        a ``width`` x ``height`` image. A row or column counts as blank when
        its channel sum equals that of pure white 3-channel pixels. The
        result is ``[left, bottom, right, top]`` with the origin at the
        bottom left. If no content is found at all (every row or every
        column is blank), the full image box is returned instead of an
        inverted rectangle.
        """

        def leading_blank(sums, blank):
            # Number of consecutive blank entries at the start of ``sums``.
            count = 0
            for value in sums:
                if value != blank:
                    break
                count += 1
            return count

        rows = [
            [sum(px) for px in pixels[i * width : (i + 1) * width]]
            for i in range(height)
        ]
        row_sums = [sum(row) for row in rows]
        col_sums = [sum(col) for col in zip(*rows)]
        blank_row = width * 255 * 3
        blank_col = height * 255 * 3

        top = leading_blank(row_sums, blank_row)
        bottom = leading_blank(reversed(row_sums), blank_row)
        left = leading_blank(col_sums, blank_col)
        right = leading_blank(reversed(col_sums), blank_col)

        if top == height or left == width:
            # Entirely blank page: treat the whole page as content.
            left = top = right = bottom = 0

        # Image coordinates have (0, 0) at the top left; the returned box
        # has (0, 0) at the bottom left, hence the vertical flip.
        return [left, bottom, width - right, height - top]

    @staticmethod
    def _expand_bbox(bbox, margins):
        """Grow ``bbox`` (``[left, bottom, right, top]``) by ``margins``.

        ``margins`` is ``[left, top, right, bottom]``.
        """
        m_left, m_top, m_right, m_bottom = margins
        return [
            bbox[0] - m_left,
            bbox[1] - m_bottom,
            bbox[2] + m_right,
            bbox[3] + m_top,
        ]

    def get_bbox(self, filename, margins=1, resolution=72):
        """Get the bounding box of the first page, with optional margins.

        If ``margins`` is a single number it is used for all four sides,
        otherwise ``margins = [left, top, right, bottom]``.

        We get the bounding box by finding the smallest rectangle that is
        completely surrounded by white pixels. The result is
        ``[left, bottom, right, top]`` with the origin at the bottom left,
        expressed in pixels of the image rendered at ``resolution``.

        NOTE(review): the box is passed to ``pdfcrop --bbox`` unscaled, so
        presumably only the default resolution of 72 gives units that match
        what pdfcrop expects -- confirm before using another resolution.
        """
        if isinstance(margins, (int, float)):
            margins = [margins for _ in range(4)]
        pdf = pdfplumber.open(filename)
        try:
            im = pdf.pages[0].to_image(resolution=resolution)
        finally:
            # Close the document even when rendering fails.
            pdf.close()

        pixels = list(im.original.getdata())
        W, H = im.original.size
        return self._expand_bbox(self._content_bbox(pixels, W, H), margins)

    def get_center_bbox(self, filename, padding=15):
        """Compute a bounding box that will center the page file on the
        reMarkable.

        The content box is padded by ``padding`` on every side. Extra space
        is then added on the top or on the left so that the result has the
        ``RM_HEIGHT`` / ``RM_WIDTH`` aspect ratio.
        """
        # Render the page once and derive the final box arithmetically
        # rather than rendering a second time.
        bbox = self.get_bbox(filename, margins=0)

        h = bbox[3] - bbox[1]
        w = bbox[2] - bbox[0]

        # we want some minimal padding all around, because it is visually
        # more pleasing.
        h_prime = h + 2 * padding
        w_prime = w + 2 * padding

        # if the document is wider than the remarkable, we add top-padding
        # to center it, otherwise we add left-padding
        x, y = 0, 0
        if h_prime / w_prime < RM_HEIGHT / RM_WIDTH:
            y = ((RM_HEIGHT / RM_WIDTH) * w_prime - h_prime) / 2
        else:
            x = ((RM_WIDTH / RM_HEIGHT) * h_prime - w_prime) / 2

        margins = [padding + x, padding + y, padding, padding]
        return self._expand_bbox(bbox, margins)


def exception(msg):
print("ERROR: " + msg, file=sys.stderr)
print("Error occurred. Exiting.", file=sys.stderr)
Expand Down
87 changes: 86 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ license = "MIT"
python = "^3.5"
bs4 = "^0.0.1"
requests = "^2.21"
pdfplumber = "^0.5.12"

[tool.poetry.dev-dependencies]

Expand Down

0 comments on commit 4cb3af8

Please sign in to comment.