Skip to content

Commit

Permalink
Import ocrmypdf locally so that the webserver does not load it
Browse files: browse the repository at this point in the history
  • Loading branch information
jonaswinkler committed Feb 15, 2021
1 parent 416101d commit 56bd966
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 12 deletions.
10 changes: 6 additions & 4 deletions src/paperless_tesseract/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,8 @@
import os
import re

import ocrmypdf
import pdftotext
import pikepdf
from PIL import Image
from django.conf import settings
from ocrmypdf import InputFileError, EncryptedPdfError

from documents.parsers import DocumentParser, ParseError, \
make_thumbnail_from_pdf
Expand All @@ -22,6 +18,8 @@ class RasterisedDocumentParser(DocumentParser):
logging_name = "paperless.parsing.tesseract"

def extract_metadata(self, document_path, mime_type):
import pikepdf

namespace_pattern = re.compile(r"\{(.*)\}(.*)")

result = []
Expand Down Expand Up @@ -91,6 +89,9 @@ def calculate_a4_dpi(self, image):
return None

def parse(self, document_path, mime_type, file_name=None):
import ocrmypdf
from ocrmypdf import InputFileError, EncryptedPdfError

mode = settings.OCR_MODE

text_original = get_text_from_pdf(document_path)
Expand Down Expand Up @@ -223,6 +224,7 @@ def strip_excess_whitespace(text):


def get_text_from_pdf(pdf_file):
import pdftotext

if not os.path.isfile(pdf_file):
return None
Expand Down
11 changes: 3 additions & 8 deletions src/paperless_tesseract/tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,17 +164,12 @@ def f():

self.assertRaises(ParseError, f)

@mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr")
def test_image_calc_a4_dpi(self, m):
def test_image_calc_a4_dpi(self):
parser = RasterisedDocumentParser(None)

parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")

m.assert_called_once()

args, kwargs = m.call_args
dpi = parser.calculate_a4_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))

self.assertEqual(kwargs['image_dpi'], 62)
self.assertEqual(dpi, 62)

@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi")
def test_image_dpi_fail(self, m):
Expand Down

0 comments on commit 56bd966

Please sign in to comment.