From 56bd966c02a4347bf764215d33897583fd5d5111 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Mon, 15 Feb 2021 12:18:10 +0100 Subject: [PATCH] local import of ocrmypdf so that the webserver does not load that --- src/paperless_tesseract/parsers.py | 10 ++++++---- src/paperless_tesseract/tests/test_parser.py | 11 +++-------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 271a840dfd5..78c335ac394 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -2,12 +2,8 @@ import os import re -import ocrmypdf -import pdftotext -import pikepdf from PIL import Image from django.conf import settings -from ocrmypdf import InputFileError, EncryptedPdfError from documents.parsers import DocumentParser, ParseError, \ make_thumbnail_from_pdf @@ -22,6 +18,8 @@ class RasterisedDocumentParser(DocumentParser): logging_name = "paperless.parsing.tesseract" def extract_metadata(self, document_path, mime_type): + import pikepdf + namespace_pattern = re.compile(r"\{(.*)\}(.*)") result = [] @@ -91,6 +89,9 @@ def calculate_a4_dpi(self, image): return None def parse(self, document_path, mime_type, file_name=None): + import ocrmypdf + from ocrmypdf import InputFileError, EncryptedPdfError + mode = settings.OCR_MODE text_original = get_text_from_pdf(document_path) @@ -223,6 +224,7 @@ def strip_excess_whitespace(text): def get_text_from_pdf(pdf_file): + import pdftotext if not os.path.isfile(pdf_file): return None diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 644587de092..4fd31466774 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -164,17 +164,12 @@ def f(): self.assertRaises(ParseError, f) - @mock.patch("paperless_tesseract.parsers.ocrmypdf.ocr") - def test_image_calc_a4_dpi(self, m): + def test_image_calc_a4_dpi(self): parser = RasterisedDocumentParser(None) - parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") - - m.assert_called_once() - - args, kwargs = m.call_args + dpi = parser.calculate_a4_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png")) - self.assertEqual(kwargs['image_dpi'], 62) + self.assertEqual(dpi, 62) @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi") def test_image_dpi_fail(self, m):