From 53fcf4e91284ca3b6e9510439e0acf8f530bc6a6 Mon Sep 17 00:00:00 2001 From: Mallori Harrell <6825104+mallorih@users.noreply.github.com> Date: Mon, 21 Nov 2022 11:47:29 -0600 Subject: [PATCH] chore: Remove PDF parsing code and dependencies (#75) Remove PDF parsing code and dependencies. --- .github/workflows/ci.yml | 15 +- CHANGELOG.md | 6 +- Makefile | 19 +-- README.md | 9 +- docs/requirements.txt | 8 +- docs/source/installing.rst | 20 +-- requirements/build.txt | 8 +- requirements/dev.txt | 24 ++- requirements/pdf.txt | 145 ------------------ requirements/test.txt | 23 +-- setup.py | 1 - test_unstructured/documents/test_pdf.py | 115 -------------- .../models/layout/test_detectron2.py | 24 --- .../models/ocr/test_tesseract.py | 24 --- unstructured/__version__.py | 2 +- unstructured/documents/pdf.py | 104 ------------- unstructured/models/layout/__init__.py | 0 unstructured/models/layout/detectron2.py | 39 ----- unstructured/models/ocr/__init__.py | 0 unstructured/models/ocr/tesseract.py | 22 --- 20 files changed, 69 insertions(+), 539 deletions(-) delete mode 100644 requirements/pdf.txt delete mode 100644 test_unstructured/documents/test_pdf.py delete mode 100644 test_unstructured/models/layout/test_detectron2.py delete mode 100644 test_unstructured/models/ocr/test_tesseract.py delete mode 100644 unstructured/documents/pdf.py delete mode 100644 unstructured/models/layout/__init__.py delete mode 100644 unstructured/models/layout/detectron2.py delete mode 100644 unstructured/models/ocr/__init__.py delete mode 100644 unstructured/models/ocr/tesseract.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a479416f8c..4e76dad02c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,8 +25,11 @@ jobs: path: | .venv nltk_data - key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }} - + key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }} + - name: Set up Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} - name: Setup virtual environment (no cache hit) if: steps.virtualenv-cache.outputs.cache-hit != 'true' run: | @@ -43,9 +46,13 @@ jobs: id: virtualenv-cache with: path: .venv - key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }} + key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }} # NOTE(robinson) - This is a fallback in case the lint job does not find the cache. # We can take this out when we implement the fix in CORE-99 + - name: Set up Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} - name: Setup virtual environment (no cache hit) if: steps.virtualenv-cache.outputs.cache-hit != 'true' run: | @@ -77,7 +84,7 @@ jobs: path: | .venv nltk_data - key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }} + key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }} # NOTE(robinson) - This is a fallback in case the lint job does not find the cache. # We can take this out when we implement the fix in CORE-99 - name: Setup virtual environment (no cache hit) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bab662d61..daebd51d8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,8 @@ -## 0.2.6-dev1 +## 0.3.0-dev1 + +* Removing the local PDF parsing code and any dependencies and tests. + +## 0.2.6 * Small change to how _read is placed within the inheritance structure since it doesn't really apply to pdf * Add partitioning brick for calling the document image analysis API diff --git a/Makefile b/Makefile index c88c927fc5..05bbea7ec4 100644 --- a/Makefile +++ b/Makefile @@ -17,10 +17,10 @@ install-base: install-base-pip-packages install-nltk-models ## install: installs all test, dev, and experimental requirements .PHONY: install -install: install-base-pip-packages install-dev install-detectron2 install-nltk-models install-test +install: install-base-pip-packages install-dev install-nltk-models install-test .PHONY: install-ci -install-ci: install-base-pip-packages install-pdf install-test install-nltk-models install-huggingface +install-ci: install-base-pip-packages install-test install-nltk-models install-huggingface .PHONY: install-base-pip-packages install-base-pip-packages: @@ -32,18 +32,6 @@ install-huggingface: python3 -m pip install pip==${PIP_VERSION} pip install -r requirements/huggingface.txt -.PHONY: install-pdf -install-pdf: - python3 -m pip install pip==${PIP_VERSION} - pip install -r requirements/pdf.txt - @echo "\n\n========================================================================" - @echo " WARNING: PDF parsing capabilities in unstructured is still experimental" - @echo "========================================================================\n\n" - -.PHONY: install-detectron2 -install-detectron2: install-pdf - pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2" - .PHONE: install-nltk-models install-nltk-models: python -c "import nltk; nltk.download('punkt')" @@ -67,12 +55,9 @@ pip-compile: pip-compile -o requirements/base.txt # Extra requirements for huggingface staging functions pip-compile --extra huggingface -o requirements/huggingface.txt - # Extra requirements for parsing PDF files - pip-compile --extra pdf -o requirements/pdf.txt # NOTE(robinson) - We want the dependencies for detectron2 in the requirements.txt, but not # the detectron2 repo itself. If detectron2 is in the requirements.txt file, an order of # operations issue related to the torch library causes the install to fail - sed 's/^detectron2 @/# detectron2 @/g' requirements/pdf.txt pip-compile requirements/dev.in pip-compile requirements/test.in pip-compile requirements/build.in diff --git a/README.md b/README.md index d4a24dd420..5585cf3101 100644 --- a/README.md +++ b/README.md @@ -88,17 +88,16 @@ titles and narrative text. ### PDF Parsing -You can use the following workflow to parse PDF documents. Note, PDF parsing is currently -expiremental and will be refined in the coming months. +You can use the following workflow to parse PDF documents. ```python -from unstructured.documents.pdf import PDFDocument +from unstructured.nlp.partition import partition_pdf -doc = PDFDocument.from_file("example-docs/layout-parser-paper.pdf") +elements = partition_pdf("example-docs/layout-parser-paper.pdf") print(doc) ``` -At this point, `print(doc)` will print out a string representation of the PDF file. The +At this point, `print(elements)` will print out a string representation of the PDF file. The first page of output looks like the following: ``` diff --git a/docs/requirements.txt b/docs/requirements.txt index 6d1d7604c0..fd4a7b2d69 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -8,7 +8,7 @@ alabaster==0.7.12 # via sphinx babel==2.10.3 # via sphinx -certifi==2022.9.14 +certifi==2022.9.24 # via requests charset-normalizer==2.1.1 # via requests @@ -38,11 +38,11 @@ requests==2.28.1 # via sphinx snowballstemmer==2.2.0 # via sphinx -sphinx==5.2.3 +sphinx==5.3.0 # via # -r requirements/build.in # sphinx-rtd-theme -sphinx-rtd-theme==1.0.0 +sphinx-rtd-theme==1.1.1 # via -r requirements/build.in sphinxcontrib-applehelp==1.0.2 # via sphinx @@ -58,5 +58,5 @@ sphinxcontrib-serializinghtml==1.1.5 # via sphinx urllib3==1.26.12 # via requests -zipp==3.9.0 +zipp==3.10.0 # via importlib-metadata diff --git a/docs/source/installing.rst b/docs/source/installing.rst index 20ba803aad..f808074af9 100644 --- a/docs/source/installing.rst +++ b/docs/source/installing.rst @@ -3,8 +3,7 @@ Installation You can install the library by cloning the repo and running ``make install`` from the root directory. Developers can run ``make install-local`` to install the dev and test -requirements alongside the base requirements. Specific parsing capabilities may require -extra dependencies, as documented below. If you want a minimal installation without any +requirements alongside the base requirements. If you want a minimal installation without any parser specific dependencies, run ``make install-base``. Logging @@ -37,23 +36,6 @@ that with: $ brew install libxml2 $ brew install libxslt -================ -PDF Dependencies -================ - -Currently, PDF parsing capabilities rely on the -`Detectron2 `_ object detection model. The -``make install-local`` command installs all of the dependencies for Detectron2. If you -need to parse PDFs and Detectron2 is not already installed, you can install it with -``make install-detectron2``. - -Also ensure that you have ``poppler`` installed on your system. On a Mac, you can run: - -.. code:: console - - $ brew install poppler - - ======================== Huggingface Dependencies ======================== diff --git a/requirements/build.txt b/requirements/build.txt index 5f295a8ca8..fd4a7b2d69 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -20,6 +20,8 @@ idna==3.4 # via requests imagesize==1.4.1 # via sphinx +importlib-metadata==5.0.0 + # via sphinx jinja2==3.1.2 # via sphinx markupsafe==2.1.1 @@ -38,10 +40,10 @@ snowballstemmer==2.2.0 # via sphinx sphinx==5.3.0 # via - # -r build.in + # -r requirements/build.in # sphinx-rtd-theme sphinx-rtd-theme==1.1.1 - # via -r build.in + # via -r requirements/build.in sphinxcontrib-applehelp==1.0.2 # via sphinx sphinxcontrib-devhelp==1.0.2 @@ -56,3 +58,5 @@ sphinxcontrib-serializinghtml==1.1.5 # via sphinx urllib3==1.26.12 # via requests +zipp==3.10.0 + # via importlib-metadata diff --git a/requirements/dev.txt b/requirements/dev.txt index 21ad663b3a..51e6e516eb 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -4,6 +4,10 @@ # # pip-compile requirements/dev.in # +appnope==0.1.3 + # via + # ipykernel + # ipython argon2-cffi==21.3.0 # via notebook argon2-cffi-bindings==21.2.0 @@ -36,6 +40,10 @@ executing==1.0.0 # via stack-data fastjsonschema==2.16.2 # via nbformat +importlib-metadata==5.0.0 + # via nbconvert +importlib-resources==5.10.0 + # via jsonschema ipykernel==6.15.3 # via # ipywidgets @@ -45,7 +53,7 @@ ipykernel==6.15.3 # qtconsole ipython==8.6.0 # via - # -r dev.in + # -r requirements/dev.in # ipykernel # ipywidgets # jupyter-console @@ -64,7 +72,7 @@ jinja2==3.1.2 jsonschema==4.16.0 # via nbformat jupyter==1.0.0 - # via -r dev.in + # via -r requirements/dev.in jupyter-client==7.3.5 # via # ipykernel @@ -133,7 +141,9 @@ pexpect==4.8.0 pickleshare==0.7.5 # via ipython pip-tools==6.10.0 - # via -r dev.in + # via -r requirements/dev.in +pkgutil-resolve-name==1.3.10 + # via jsonschema prometheus-client==0.14.1 # via notebook prompt-toolkit==3.0.31 @@ -187,6 +197,10 @@ terminado==0.15.0 # via notebook tinycss2==1.1.1 # via nbconvert +tomli==2.0.1 + # via + # build + # pep517 tornado==6.2 # via # ipykernel @@ -216,6 +230,10 @@ wheel==0.37.1 # via pip-tools widgetsnbextension==4.0.3 # via ipywidgets +zipp==3.10.0 + # via + # importlib-metadata + # importlib-resources # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/requirements/pdf.txt b/requirements/pdf.txt deleted file mode 100644 index a0a31e1197..0000000000 --- a/requirements/pdf.txt +++ /dev/null @@ -1,145 +0,0 @@ -# -# This file is autogenerated by pip-compile with python 3.8 -# To update, run: -# -# pip-compile --extra=pdf --output-file=requirements/pdf.txt -# -antlr4-python3-runtime==4.9.3 - # via omegaconf -certifi==2022.9.24 - # via requests -cffi==1.15.1 - # via cryptography -charset-normalizer==2.1.1 - # via - # pdfminer-six - # requests -click==8.1.3 - # via nltk -contourpy==1.0.6 - # via matplotlib -cryptography==38.0.3 - # via pdfminer-six -cycler==0.11.0 - # via matplotlib -effdet==0.3.0 - # via layoutparser -filelock==3.8.0 - # via huggingface-hub -fonttools==4.38.0 - # via matplotlib -huggingface-hub==0.10.1 - # via timm -idna==3.4 - # via requests -iopath==0.1.10 - # via layoutparser -joblib==1.2.0 - # via nltk -kiwisolver==1.4.4 - # via matplotlib -layoutparser[layoutmodels,tesseract]==0.3.4 - # via unstructured (setup.py) -lxml==4.9.1 - # via unstructured (setup.py) -matplotlib==3.6.1 - # via pycocotools -nltk==3.7 - # via unstructured (setup.py) -numpy==1.23.4 - # via - # contourpy - # layoutparser - # matplotlib - # opencv-python - # pandas - # pycocotools - # scipy - # torchvision -omegaconf==2.2.3 - # via effdet -opencv-python==4.6.0.66 - # via layoutparser -packaging==21.3 - # via - # huggingface-hub - # matplotlib - # pytesseract -pandas==1.5.1 - # via layoutparser -pdf2image==1.16.0 - # via layoutparser -pdfminer-six==20220524 - # via pdfplumber -pdfplumber==0.7.5 - # via layoutparser -pillow==9.3.0 - # via - # layoutparser - # matplotlib - # pdf2image - # pdfplumber - # pytesseract - # torchvision -portalocker==2.6.0 - # via iopath -pycocotools==2.0.6 - # via effdet -pycparser==2.21 - # via cffi -pyparsing==3.0.9 - # via - # matplotlib - # packaging -pytesseract==0.3.10 - # via layoutparser -python-dateutil==2.8.2 - # via - # matplotlib - # pandas -pytz==2022.5 - # via pandas -pyyaml==6.0 - # via - # huggingface-hub - # layoutparser - # omegaconf - # timm -regex==2022.10.31 - # via nltk -requests==2.28.1 - # via - # huggingface-hub - # torchvision -scipy==1.9.3 - # via layoutparser -six==1.16.0 - # via python-dateutil -timm==0.6.11 - # via effdet -torch==1.12.1 - # via - # effdet - # layoutparser - # timm - # torchvision -torchvision==0.13.1 - # via - # effdet - # layoutparser - # timm -tqdm==4.64.1 - # via - # huggingface-hub - # iopath - # nltk -typing-extensions==4.4.0 - # via - # huggingface-hub - # iopath - # torch - # torchvision -urllib3==1.26.12 - # via requests -wand==0.6.10 - # via pdfplumber diff --git a/requirements/test.txt b/requirements/test.txt index 1ca846919a..994b96ca36 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -7,21 +7,21 @@ attrs==22.1.0 # via pytest black==22.10.0 - # via -r test.in + # via -r requirements/test.in certifi==2022.9.24 # via requests charset-normalizer==2.1.1 # via requests click==8.1.3 # via - # -r test.in + # -r requirements/test.in # black coverage[toml]==6.4.4 # via - # -r test.in + # -r requirements/test.in # pytest-cov flake8==5.0.4 - # via -r test.in + # via -r requirements/test.in idna==3.4 # via # requests @@ -29,7 +29,7 @@ idna==3.4 iniconfig==1.1.1 # via pytest label-studio-sdk==0.0.15 - # via -r test.in + # via -r requirements/test.in lxml==4.9.1 # via label-studio-sdk mccabe==0.7.0 @@ -37,7 +37,7 @@ mccabe==0.7.0 multidict==6.0.2 # via yarl mypy==0.990 - # via -r test.in + # via -r requirements/test.in mypy-extensions==0.4.3 # via # black @@ -63,7 +63,7 @@ pyparsing==3.0.9 pytest==7.1.3 # via pytest-cov pytest-cov==4.0.0 - # via -r test.in + # via -r requirements/test.in pyyaml==6.0 # via vcrpy requests==2.28.1 @@ -71,15 +71,20 @@ requests==2.28.1 six==1.16.0 # via vcrpy tomli==2.0.1 - # via pytest + # via + # black + # coverage + # mypy + # pytest typing-extensions==4.3.0 # via + # black # mypy # pydantic urllib3==1.26.12 # via requests vcrpy==4.2.1 - # via -r test.in + # via -r requirements/test.in wrapt==1.14.1 # via vcrpy yarl==1.8.1 diff --git a/setup.py b/setup.py index 5b732544ba..404a7e8f31 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,6 @@ "nltk", ], extras_require={ - "pdf": ["layoutparser[layoutmodels,tesseract]"], "huggingface": ["transformers"], }, ) diff --git a/test_unstructured/documents/test_pdf.py b/test_unstructured/documents/test_pdf.py deleted file mode 100644 index 39c87bd9a7..0000000000 --- a/test_unstructured/documents/test_pdf.py +++ /dev/null @@ -1,115 +0,0 @@ -import pytest -from unittest.mock import patch - -import layoutparser as lp -from layoutparser.elements import Layout, Rectangle, TextBlock -import numpy as np -from PIL import Image - -from unstructured.documents.pdf import PDFDocument, PDFPage -import unstructured.models.layout.detectron2 as detectron2 -import unstructured.models.ocr.tesseract as tesseract - - -@pytest.fixture -def mock_image(): - return Image.new("1", (1, 1)) - - -@pytest.fixture -def mock_page_layout(): - text_rectangle = Rectangle(2, 4, 6, 8) - text_block = TextBlock(text_rectangle, text="A very repetitive narrative. " * 10, type="Text") - - title_rectangle = Rectangle(1, 2, 3, 4) - title_block = TextBlock(title_rectangle, text="A Catchy Title", type="Title") - - return Layout([text_block, title_block]) - - -def test_pdf_page_converts_images_to_array(mock_image): - page = PDFPage(number=0, image=mock_image, layout=Layout()) - assert page.image_array is None - - image_array = page._get_image_array() - assert isinstance(image_array, np.ndarray) - assert page.image_array.all() == image_array.all() - - -def test_ocr(monkeypatch): - mock_text = "The parrot flies high in the air!" - - class MockOCRAgent: - def detect(self, *args): - return mock_text - - monkeypatch.setattr(tesseract, "ocr_agent", MockOCRAgent) - monkeypatch.setattr(tesseract, "is_pytesseract_available", lambda *args: True) - - image = np.random.randint(12, 24, (40, 40)) - page = PDFPage(number=0, image=image, layout=Layout()) - rectangle = Rectangle(1, 2, 3, 4) - text_block = TextBlock(rectangle, text=None) - - assert page.ocr(text_block) == mock_text - - -class MockLayoutModel: - def __init__(self, layout): - self.layout = layout - - def detect(self, *args): - return self.layout - - -def test_get_page_elements(monkeypatch, mock_page_layout): - monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout)) - monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) - - image = np.random.randint(12, 24, (40, 40)) - page = PDFPage(number=0, image=image, layout=mock_page_layout) - - elements = page.get_elements(inplace=False) - - assert str(elements[0]) == "A Catchy Title" - assert str(elements[1]).startswith("A very repetitive narrative.") - - page.get_elements(inplace=True) - assert elements == page.elements - - -def test_get_page_elements_with_ocr(monkeypatch): - monkeypatch.setattr(PDFPage, "ocr", lambda *args: "An Even Catchier TItle") - - rectangle = Rectangle(2, 4, 6, 8) - text_block = TextBlock(rectangle, text=None, type="Title") - layout = Layout([text_block]) - - monkeypatch.setattr(detectron2, "model", MockLayoutModel(layout)) - monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) - - image = np.random.randint(12, 24, (40, 40)) - page = PDFPage(number=0, image=image, layout=layout) - page.get_elements() - - assert str(page) == "An Even Catchier TItle" - - -def test_read_pdf(monkeypatch, mock_page_layout): - image = np.random.randint(12, 24, (40, 40)) - images = [image, image] - - layouts = Layout([mock_page_layout, mock_page_layout]) - - monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout)) - monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) - - with patch.object(lp, "load_pdf", return_value=(layouts, images)): - doc = PDFDocument.from_file("fake-file.pdf") - - assert str(doc).startswith("A Catchy Title") - assert str(doc).count("A Catchy Title") == 2 # Once for each page - assert str(doc).endswith("A very repetitive narrative. ") - - pages = doc.pages - assert str(doc) == "\n\n".join([str(page) for page in pages]) diff --git a/test_unstructured/models/layout/test_detectron2.py b/test_unstructured/models/layout/test_detectron2.py deleted file mode 100644 index 7cf6d82567..0000000000 --- a/test_unstructured/models/layout/test_detectron2.py +++ /dev/null @@ -1,24 +0,0 @@ -import pytest -from unittest.mock import patch - -import unstructured.models.layout.detectron2 as detectron2 - - -class MockDetectron2LayoutModel: - def __init__(self, *args, **kwargs): - pass - - -def test_load_model(monkeypatch): - monkeypatch.setattr(detectron2, "Detectron2LayoutModel", MockDetectron2LayoutModel) - - with patch.object(detectron2, "is_detectron2_available", return_value=True): - detectron2.load_model() - - assert isinstance(detectron2.model, MockDetectron2LayoutModel) - - -def test_load_model_raises_when_not_available(): - with patch.object(detectron2, "is_detectron2_available", return_value=False): - with pytest.raises(ImportError): - detectron2.load_model() diff --git a/test_unstructured/models/ocr/test_tesseract.py b/test_unstructured/models/ocr/test_tesseract.py deleted file mode 100644 index 30a1d4c11b..0000000000 --- a/test_unstructured/models/ocr/test_tesseract.py +++ /dev/null @@ -1,24 +0,0 @@ -import pytest -from unittest.mock import patch - -import unstructured.models.ocr.tesseract as tesseract - - -class MockTesseractAgent: - def __init__(self, languages): - pass - - -def test_load_agent(monkeypatch): - monkeypatch.setattr(tesseract, "TesseractAgent", MockTesseractAgent) - - with patch.object(tesseract, "is_pytesseract_available", return_value=True): - tesseract.load_agent() - - assert isinstance(tesseract.ocr_agent, MockTesseractAgent) - - -def test_load_agent_raises_when_not_available(): - with patch.object(tesseract, "is_pytesseract_available", return_value=False): - with pytest.raises(ImportError): - tesseract.load_agent() diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 68100083db..086e684314 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.2.6-dev1" # pragma: no cover +__version__ = "0.3.0-dev1" # pragma: no cover diff --git a/unstructured/documents/pdf.py b/unstructured/documents/pdf.py deleted file mode 100644 index f984e8f325..0000000000 --- a/unstructured/documents/pdf.py +++ /dev/null @@ -1,104 +0,0 @@ -from __future__ import annotations -from typing import List, Optional, Union - -import layoutparser as lp -import numpy as np -from PIL import Image - -from unstructured.logger import get_logger -from unstructured.documents.base import Document, Page -from unstructured.documents.elements import Element, NarrativeText, Title -import unstructured.models.ocr.tesseract as tesseract -import unstructured.models.layout.detectron2 as detectron2 - -logger = get_logger() - - -class PDFDocument(Document): - """Class for handling documents that are saved as .pdf files. For .pdf files, a - document image analysis (DIA) model detects the layout of the page prior to extracting - element.""" - - def __init__(self): - print( - """ - -====================================================================== -WARNING: PDF parsing capabilities in unstructured is still experimental -====================================================================== - -""" - ) - super().__init__() - - @classmethod - def from_file(cls, filename: str): - logger.info(f"Reading PDF for file: {filename} ...") - layouts, images = lp.load_pdf(filename, load_images=True) - pages: List[Page] = list() - for i, layout in enumerate(layouts): - image = images[i] - # NOTE(robinson) - In the future, maybe we detect the page number and default - # to the index if it is not detected - page = PDFPage(number=i, image=image, layout=layout) - page.get_elements() - pages.append(page) - return cls.from_pages(pages) - - -class PDFPage(Page): - """Class for an individual PDF page.""" - - def __init__(self, number: int, image: Image, layout: lp.Layout): - self.image = image - self.image_array: Union[np.ndarray, None] = None - self.layout = layout - super().__init__(number=number) - - def get_elements(self, inplace=True) -> Optional[List[Element]]: - """Uses a layoutparser model to detect the elements on the page.""" - logger.info("Detecting page elements ...") - detectron2.load_model() - - elements: List[Element] = list() - # NOTE(mrobinson) - We'll want make this model inference step some kind of - # remote call in the future. - image_layout = detectron2.model.detect(self.image) - # NOTE(robinson) - This orders the page from top to bottom. We'll need more - # sophisticated ordering logic for more complicated layouts. - image_layout.sort(key=lambda element: element.coordinates[1], inplace=True) - for item in image_layout: - if item.type in ["Text", "Title"]: - text_blocks = self.layout.filter_by(item, center=True) - text = str() - for text_block in text_blocks: - # NOTE(robinson) - If the text attribute is None, that means the PDF isn't - # already OCR'd and we have to send the snippet out for OCRing. - if text_block.text is None: - text_block.text = self.ocr(text_block) - text = " ".join([x for x in text_blocks.get_texts() if x]) - - if item.type == "Text": - elements.append(NarrativeText(text=text)) - elif item.type == "Title": - elements.append(Title(text=text)) - - if inplace: - self.elements = elements - return None - return elements - - def ocr(self, text_block: lp.TextBlock) -> str: - """Runs a cropped text block image through and OCR agent.""" - logger.debug("Running OCR on text block ...") - tesseract.load_agent() - image_array = self._get_image_array() - padded_block = text_block.pad(left=5, right=5, top=5, bottom=5) - cropped_image = padded_block.crop_image(image_array) - return tesseract.ocr_agent.detect(cropped_image) - - def _get_image_array(self) -> Union[np.ndarray, None]: - """Converts the raw image into a numpy array.""" - if self.image_array is None: - self.image_array = np.array(self.image) - return self.image_array diff --git a/unstructured/models/layout/__init__.py b/unstructured/models/layout/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/unstructured/models/layout/detectron2.py b/unstructured/models/layout/detectron2.py deleted file mode 100644 index 2c862b729c..0000000000 --- a/unstructured/models/layout/detectron2.py +++ /dev/null @@ -1,39 +0,0 @@ -import sys - -if sys.version_info < (3, 8): - from typing_extensions import Final -else: - from typing import Final - -from layoutparser.models.detectron2.layoutmodel import ( - is_detectron2_available, - Detectron2LayoutModel, -) - -from unstructured.logger import get_logger - -logger = get_logger() - -model: Detectron2LayoutModel = None - -DETECTRON_CONFIG: Final = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config" - - -def load_model(): - """Loads the detectron2 model as a global variable to ensure that we are not loading - it multiple times.""" - global model - - if not is_detectron2_available(): - raise ImportError( - "Failed to load the Detectron2 model. Ensure that the Detectron2 " - "module is correctly installed." - ) - - if model is None: - logger.info("Loading the Detectron2 layout model ...") - model = Detectron2LayoutModel( - DETECTRON_CONFIG, - extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8], - label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}, - ) diff --git a/unstructured/models/ocr/__init__.py b/unstructured/models/ocr/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/unstructured/models/ocr/tesseract.py b/unstructured/models/ocr/tesseract.py deleted file mode 100644 index 178f48219c..0000000000 --- a/unstructured/models/ocr/tesseract.py +++ /dev/null @@ -1,22 +0,0 @@ -from layoutparser.ocr.tesseract_agent import is_pytesseract_available, TesseractAgent - -from unstructured.logger import get_logger - -ocr_agent: TesseractAgent = None - -logger = get_logger() - - -def load_agent(): - """Loads the Tesseract OCR agent as a global variable to ensure that we only load it once.""" - global ocr_agent - - if not is_pytesseract_available(): - raise ImportError( - "Failed to load Tesseract. Ensure that Tesseract is installed. Example command: \n" - " >>> sudo apt install -y tesseract-ocr" - ) - - if ocr_agent is None: - logger.info("Loading the Tesseract OCR agent ...") - ocr_agent = TesseractAgent(languages="eng")