From 53fcf4e91284ca3b6e9510439e0acf8f530bc6a6 Mon Sep 17 00:00:00 2001
From: Mallori Harrell <6825104+mallorih@users.noreply.github.com>
Date: Mon, 21 Nov 2022 11:47:29 -0600
Subject: [PATCH] chore: Remove PDF parsing code and dependencies (#75)

Remove PDF parsing code and dependencies.
---
 .github/workflows/ci.yml                      |  15 +-
 CHANGELOG.md                                  |   6 +-
 Makefile                                      |  19 +--
 README.md                                     |   9 +-
 docs/requirements.txt                         |   8 +-
 docs/source/installing.rst                    |  20 +--
 requirements/build.txt                        |   8 +-
 requirements/dev.txt                          |  24 ++-
 requirements/pdf.txt                          | 145 ------------------
 requirements/test.txt                         |  23 +--
 setup.py                                      |   1 -
 test_unstructured/documents/test_pdf.py       | 115 --------------
 .../models/layout/test_detectron2.py          |  24 ---
 .../models/ocr/test_tesseract.py              |  24 ---
 unstructured/__version__.py                   |   2 +-
 unstructured/documents/pdf.py                 | 104 -------------
 unstructured/models/layout/__init__.py        |   0
 unstructured/models/layout/detectron2.py      |  39 -----
 unstructured/models/ocr/__init__.py           |   0
 unstructured/models/ocr/tesseract.py          |  22 ---
 20 files changed, 69 insertions(+), 539 deletions(-)
 delete mode 100644 requirements/pdf.txt
 delete mode 100644 test_unstructured/documents/test_pdf.py
 delete mode 100644 test_unstructured/models/layout/test_detectron2.py
 delete mode 100644 test_unstructured/models/ocr/test_tesseract.py
 delete mode 100644 unstructured/documents/pdf.py
 delete mode 100644 unstructured/models/layout/__init__.py
 delete mode 100644 unstructured/models/layout/detectron2.py
 delete mode 100644 unstructured/models/ocr/__init__.py
 delete mode 100644 unstructured/models/ocr/tesseract.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a479416f8c..4e76dad02c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -25,8 +25,11 @@ jobs:
         path: |
           .venv
           nltk_data
-        key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
-
+        key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
+    - name: Set up Python ${{ env.PYTHON_VERSION }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
     - name: Setup virtual environment (no cache hit)
       if: steps.virtualenv-cache.outputs.cache-hit != 'true'
       run: |
@@ -43,9 +46,13 @@ jobs:
       id: virtualenv-cache
       with:
         path: .venv
-        key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
+        key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
     # NOTE(robinson) - This is a fallback in case the lint job does not find the cache.
     # We can take this out when we implement the fix in CORE-99
+    - name: Set up Python ${{ env.PYTHON_VERSION }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ env.PYTHON_VERSION }}
     - name: Setup virtual environment (no cache hit)
       if: steps.virtualenv-cache.outputs.cache-hit != 'true'
       run: |
@@ -77,7 +84,7 @@ jobs:
         path: |
           .venv
           nltk_data
-        key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
+        key: unstructured-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
     # NOTE(robinson) - This is a fallback in case the lint job does not find the cache.
     # We can take this out when we implement the fix in CORE-99
     - name: Setup virtual environment (no cache hit)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5bab662d61..daebd51d8c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,8 @@
-## 0.2.6-dev1
+## 0.3.0-dev1
+
+* Remove the local PDF parsing code along with its dependencies and tests.
+
+## 0.2.6
 
 * Small change to how _read is placed within the inheritance structure since it doesn't really apply to pdf
 * Add partitioning brick for calling the document image analysis API
diff --git a/Makefile b/Makefile
index c88c927fc5..05bbea7ec4 100644
--- a/Makefile
+++ b/Makefile
@@ -17,10 +17,10 @@ install-base: install-base-pip-packages install-nltk-models
 
 ## install:                 installs all test, dev, and experimental requirements
 .PHONY: install
-install: install-base-pip-packages install-dev install-detectron2 install-nltk-models install-test
+install: install-base-pip-packages install-dev install-nltk-models install-test
 
 .PHONY: install-ci
-install-ci: install-base-pip-packages install-pdf install-test install-nltk-models install-huggingface
+install-ci: install-base-pip-packages install-test install-nltk-models install-huggingface
 
 .PHONY: install-base-pip-packages
 install-base-pip-packages:
@@ -32,18 +32,6 @@ install-huggingface:
 	python3 -m pip install pip==${PIP_VERSION}
 	pip install -r requirements/huggingface.txt
 
-.PHONY: install-pdf
-install-pdf:
-	python3 -m pip install pip==${PIP_VERSION}
-	pip install -r requirements/pdf.txt
-	@echo "\n\n========================================================================"
-	@echo " WARNING: PDF parsing capabilities in unstructured is still experimental"
-	@echo "========================================================================\n\n"
-
-.PHONY: install-detectron2
-install-detectron2: install-pdf
-	pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
-
-.PHONE: install-nltk-models
+.PHONY: install-nltk-models
 install-nltk-models:
 	python -c "import nltk; nltk.download('punkt')"
@@ -67,12 +55,9 @@ pip-compile:
 	pip-compile -o requirements/base.txt
 	# Extra requirements for huggingface staging functions
 	pip-compile --extra huggingface -o requirements/huggingface.txt
-	# Extra requirements for parsing PDF files
-	pip-compile --extra pdf -o requirements/pdf.txt
 	# NOTE(robinson) - We want the dependencies for detectron2 in the requirements.txt, but not
 	# the detectron2 repo itself. If detectron2 is in the requirements.txt file, an order of
 	# operations issue related to the torch library causes the install to fail
-	sed 's/^detectron2 @/# detectron2 @/g' requirements/pdf.txt
 	pip-compile requirements/dev.in
 	pip-compile requirements/test.in
 	pip-compile requirements/build.in
diff --git a/README.md b/README.md
index d4a24dd420..5585cf3101 100644
--- a/README.md
+++ b/README.md
@@ -88,17 +88,16 @@ titles and narrative text.
 
 ### PDF Parsing
 
-You can use the following workflow to parse PDF documents. Note, PDF parsing is currently
-expiremental and will be refined in the coming months.
+You can use the following workflow to parse PDF documents.
 
 ```python
-from unstructured.documents.pdf import PDFDocument
+from unstructured.nlp.partition import partition_pdf
 
-doc = PDFDocument.from_file("example-docs/layout-parser-paper.pdf")
+elements = partition_pdf("example-docs/layout-parser-paper.pdf")
-print(doc)
+print(elements)
 ```
 
-At this point, `print(doc)` will print out a string representation of the PDF file. The
+At this point, `print(elements)` will print out a string representation of the PDF file. The
 first page of output looks like the following:
 
 ```
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 6d1d7604c0..fd4a7b2d69 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -8,7 +8,7 @@ alabaster==0.7.12
     # via sphinx
 babel==2.10.3
     # via sphinx
-certifi==2022.9.14
+certifi==2022.9.24
     # via requests
 charset-normalizer==2.1.1
     # via requests
@@ -38,11 +38,11 @@ requests==2.28.1
     # via sphinx
 snowballstemmer==2.2.0
     # via sphinx
-sphinx==5.2.3
+sphinx==5.3.0
     # via
     #   -r requirements/build.in
     #   sphinx-rtd-theme
-sphinx-rtd-theme==1.0.0
+sphinx-rtd-theme==1.1.1
     # via -r requirements/build.in
 sphinxcontrib-applehelp==1.0.2
     # via sphinx
@@ -58,5 +58,5 @@ sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
 urllib3==1.26.12
     # via requests
-zipp==3.9.0
+zipp==3.10.0
     # via importlib-metadata
diff --git a/docs/source/installing.rst b/docs/source/installing.rst
index 20ba803aad..f808074af9 100644
--- a/docs/source/installing.rst
+++ b/docs/source/installing.rst
@@ -3,8 +3,7 @@ Installation
 
 You can install the library by cloning the repo and running ``make install`` from the
 root directory. Developers can run ``make install-local`` to install the dev and test
-requirements alongside the base requirements. Specific parsing capabilities may require
-extra dependencies, as documented below. If you want a minimal installation without any
+requirements alongside the base requirements. If you want a minimal installation without any
 parser specific dependencies, run ``make install-base``.
 
 Logging
@@ -37,23 +36,6 @@ that with:
 		$ brew install libxml2
 		$ brew install libxslt
 
-================
-PDF Dependencies
-================
-
-Currently, PDF parsing capabilities rely on the
-`Detectron2 <https://github.com/facebookresearch/detectron2>`_ object detection model. The
-``make install-local`` command installs all of the dependencies for Detectron2. If you
-need to parse PDFs and Detectron2 is not already installed, you can install it with
-``make install-detectron2``.
-
-Also ensure that you have ``poppler`` installed on your system. On a Mac, you can run:
-
-.. code:: console
-
-		$ brew install poppler
-
-
 ========================
 Huggingface Dependencies
 ========================
diff --git a/requirements/build.txt b/requirements/build.txt
index 5f295a8ca8..fd4a7b2d69 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -20,6 +20,8 @@ idna==3.4
     # via requests
 imagesize==1.4.1
     # via sphinx
+importlib-metadata==5.0.0
+    # via sphinx
 jinja2==3.1.2
     # via sphinx
 markupsafe==2.1.1
@@ -38,10 +40,10 @@ snowballstemmer==2.2.0
     # via sphinx
 sphinx==5.3.0
     # via
-    #   -r build.in
+    #   -r requirements/build.in
     #   sphinx-rtd-theme
 sphinx-rtd-theme==1.1.1
-    # via -r build.in
+    # via -r requirements/build.in
 sphinxcontrib-applehelp==1.0.2
     # via sphinx
 sphinxcontrib-devhelp==1.0.2
@@ -56,3 +58,5 @@ sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
 urllib3==1.26.12
     # via requests
+zipp==3.10.0
+    # via importlib-metadata
diff --git a/requirements/dev.txt b/requirements/dev.txt
index 21ad663b3a..51e6e516eb 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -4,6 +4,10 @@
 #
 #    pip-compile requirements/dev.in
 #
+appnope==0.1.3
+    # via
+    #   ipykernel
+    #   ipython
 argon2-cffi==21.3.0
     # via notebook
 argon2-cffi-bindings==21.2.0
@@ -36,6 +40,10 @@ executing==1.0.0
     # via stack-data
 fastjsonschema==2.16.2
     # via nbformat
+importlib-metadata==5.0.0
+    # via nbconvert
+importlib-resources==5.10.0
+    # via jsonschema
 ipykernel==6.15.3
     # via
     #   ipywidgets
@@ -45,7 +53,7 @@ ipykernel==6.15.3
     #   qtconsole
 ipython==8.6.0
     # via
-    #   -r dev.in
+    #   -r requirements/dev.in
     #   ipykernel
     #   ipywidgets
     #   jupyter-console
@@ -64,7 +72,7 @@ jinja2==3.1.2
 jsonschema==4.16.0
     # via nbformat
 jupyter==1.0.0
-    # via -r dev.in
+    # via -r requirements/dev.in
 jupyter-client==7.3.5
     # via
     #   ipykernel
@@ -133,7 +141,9 @@ pexpect==4.8.0
 pickleshare==0.7.5
     # via ipython
 pip-tools==6.10.0
-    # via -r dev.in
+    # via -r requirements/dev.in
+pkgutil-resolve-name==1.3.10
+    # via jsonschema
 prometheus-client==0.14.1
     # via notebook
 prompt-toolkit==3.0.31
@@ -187,6 +197,10 @@ terminado==0.15.0
     # via notebook
 tinycss2==1.1.1
     # via nbconvert
+tomli==2.0.1
+    # via
+    #   build
+    #   pep517
 tornado==6.2
     # via
     #   ipykernel
@@ -216,6 +230,10 @@ wheel==0.37.1
     # via pip-tools
 widgetsnbextension==4.0.3
     # via ipywidgets
+zipp==3.10.0
+    # via
+    #   importlib-metadata
+    #   importlib-resources
 
 # The following packages are considered to be unsafe in a requirements file:
 # pip
diff --git a/requirements/pdf.txt b/requirements/pdf.txt
deleted file mode 100644
index a0a31e1197..0000000000
--- a/requirements/pdf.txt
+++ /dev/null
@@ -1,145 +0,0 @@
-#
-# This file is autogenerated by pip-compile with python 3.8
-# To update, run:
-#
-#    pip-compile --extra=pdf --output-file=requirements/pdf.txt
-#
-antlr4-python3-runtime==4.9.3
-    # via omegaconf
-certifi==2022.9.24
-    # via requests
-cffi==1.15.1
-    # via cryptography
-charset-normalizer==2.1.1
-    # via
-    #   pdfminer-six
-    #   requests
-click==8.1.3
-    # via nltk
-contourpy==1.0.6
-    # via matplotlib
-cryptography==38.0.3
-    # via pdfminer-six
-cycler==0.11.0
-    # via matplotlib
-effdet==0.3.0
-    # via layoutparser
-filelock==3.8.0
-    # via huggingface-hub
-fonttools==4.38.0
-    # via matplotlib
-huggingface-hub==0.10.1
-    # via timm
-idna==3.4
-    # via requests
-iopath==0.1.10
-    # via layoutparser
-joblib==1.2.0
-    # via nltk
-kiwisolver==1.4.4
-    # via matplotlib
-layoutparser[layoutmodels,tesseract]==0.3.4
-    # via unstructured (setup.py)
-lxml==4.9.1
-    # via unstructured (setup.py)
-matplotlib==3.6.1
-    # via pycocotools
-nltk==3.7
-    # via unstructured (setup.py)
-numpy==1.23.4
-    # via
-    #   contourpy
-    #   layoutparser
-    #   matplotlib
-    #   opencv-python
-    #   pandas
-    #   pycocotools
-    #   scipy
-    #   torchvision
-omegaconf==2.2.3
-    # via effdet
-opencv-python==4.6.0.66
-    # via layoutparser
-packaging==21.3
-    # via
-    #   huggingface-hub
-    #   matplotlib
-    #   pytesseract
-pandas==1.5.1
-    # via layoutparser
-pdf2image==1.16.0
-    # via layoutparser
-pdfminer-six==20220524
-    # via pdfplumber
-pdfplumber==0.7.5
-    # via layoutparser
-pillow==9.3.0
-    # via
-    #   layoutparser
-    #   matplotlib
-    #   pdf2image
-    #   pdfplumber
-    #   pytesseract
-    #   torchvision
-portalocker==2.6.0
-    # via iopath
-pycocotools==2.0.6
-    # via effdet
-pycparser==2.21
-    # via cffi
-pyparsing==3.0.9
-    # via
-    #   matplotlib
-    #   packaging
-pytesseract==0.3.10
-    # via layoutparser
-python-dateutil==2.8.2
-    # via
-    #   matplotlib
-    #   pandas
-pytz==2022.5
-    # via pandas
-pyyaml==6.0
-    # via
-    #   huggingface-hub
-    #   layoutparser
-    #   omegaconf
-    #   timm
-regex==2022.10.31
-    # via nltk
-requests==2.28.1
-    # via
-    #   huggingface-hub
-    #   torchvision
-scipy==1.9.3
-    # via layoutparser
-six==1.16.0
-    # via python-dateutil
-timm==0.6.11
-    # via effdet
-torch==1.12.1
-    # via
-    #   effdet
-    #   layoutparser
-    #   timm
-    #   torchvision
-torchvision==0.13.1
-    # via
-    #   effdet
-    #   layoutparser
-    #   timm
-tqdm==4.64.1
-    # via
-    #   huggingface-hub
-    #   iopath
-    #   nltk
-typing-extensions==4.4.0
-    # via
-    #   huggingface-hub
-    #   iopath
-    #   torch
-    #   torchvision
-urllib3==1.26.12
-    # via requests
-wand==0.6.10
-    # via pdfplumber
diff --git a/requirements/test.txt b/requirements/test.txt
index 1ca846919a..994b96ca36 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -7,21 +7,21 @@
 attrs==22.1.0
     # via pytest
 black==22.10.0
-    # via -r test.in
+    # via -r requirements/test.in
 certifi==2022.9.24
     # via requests
 charset-normalizer==2.1.1
     # via requests
 click==8.1.3
     # via
-    #   -r test.in
+    #   -r requirements/test.in
     #   black
 coverage[toml]==6.4.4
     # via
-    #   -r test.in
+    #   -r requirements/test.in
     #   pytest-cov
 flake8==5.0.4
-    # via -r test.in
+    # via -r requirements/test.in
 idna==3.4
     # via
     #   requests
@@ -29,7 +29,7 @@ idna==3.4
 iniconfig==1.1.1
     # via pytest
 label-studio-sdk==0.0.15
-    # via -r test.in
+    # via -r requirements/test.in
 lxml==4.9.1
     # via label-studio-sdk
 mccabe==0.7.0
@@ -37,7 +37,7 @@ mccabe==0.7.0
 multidict==6.0.2
     # via yarl
 mypy==0.990
-    # via -r test.in
+    # via -r requirements/test.in
 mypy-extensions==0.4.3
     # via
     #   black
@@ -63,7 +63,7 @@ pyparsing==3.0.9
 pytest==7.1.3
     # via pytest-cov
 pytest-cov==4.0.0
-    # via -r test.in
+    # via -r requirements/test.in
 pyyaml==6.0
     # via vcrpy
 requests==2.28.1
@@ -71,15 +71,20 @@ requests==2.28.1
 six==1.16.0
     # via vcrpy
 tomli==2.0.1
-    # via pytest
+    # via
+    #   black
+    #   coverage
+    #   mypy
+    #   pytest
 typing-extensions==4.3.0
     # via
+    #   black
     #   mypy
     #   pydantic
 urllib3==1.26.12
     # via requests
 vcrpy==4.2.1
-    # via -r test.in
+    # via -r requirements/test.in
 wrapt==1.14.1
     # via vcrpy
 yarl==1.8.1
diff --git a/setup.py b/setup.py
index 5b732544ba..404a7e8f31 100644
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,6 @@
         "nltk",
     ],
     extras_require={
-        "pdf": ["layoutparser[layoutmodels,tesseract]"],
         "huggingface": ["transformers"],
     },
 )
diff --git a/test_unstructured/documents/test_pdf.py b/test_unstructured/documents/test_pdf.py
deleted file mode 100644
index 39c87bd9a7..0000000000
--- a/test_unstructured/documents/test_pdf.py
+++ /dev/null
@@ -1,115 +0,0 @@
-import pytest
-from unittest.mock import patch
-
-import layoutparser as lp
-from layoutparser.elements import Layout, Rectangle, TextBlock
-import numpy as np
-from PIL import Image
-
-from unstructured.documents.pdf import PDFDocument, PDFPage
-import unstructured.models.layout.detectron2 as detectron2
-import unstructured.models.ocr.tesseract as tesseract
-
-
-@pytest.fixture
-def mock_image():
-    return Image.new("1", (1, 1))
-
-
-@pytest.fixture
-def mock_page_layout():
-    text_rectangle = Rectangle(2, 4, 6, 8)
-    text_block = TextBlock(text_rectangle, text="A very repetitive narrative. " * 10, type="Text")
-
-    title_rectangle = Rectangle(1, 2, 3, 4)
-    title_block = TextBlock(title_rectangle, text="A Catchy Title", type="Title")
-
-    return Layout([text_block, title_block])
-
-
-def test_pdf_page_converts_images_to_array(mock_image):
-    page = PDFPage(number=0, image=mock_image, layout=Layout())
-    assert page.image_array is None
-
-    image_array = page._get_image_array()
-    assert isinstance(image_array, np.ndarray)
-    assert page.image_array.all() == image_array.all()
-
-
-def test_ocr(monkeypatch):
-    mock_text = "The parrot flies high in the air!"
-
-    class MockOCRAgent:
-        def detect(self, *args):
-            return mock_text
-
-    monkeypatch.setattr(tesseract, "ocr_agent", MockOCRAgent)
-    monkeypatch.setattr(tesseract, "is_pytesseract_available", lambda *args: True)
-
-    image = np.random.randint(12, 24, (40, 40))
-    page = PDFPage(number=0, image=image, layout=Layout())
-    rectangle = Rectangle(1, 2, 3, 4)
-    text_block = TextBlock(rectangle, text=None)
-
-    assert page.ocr(text_block) == mock_text
-
-
-class MockLayoutModel:
-    def __init__(self, layout):
-        self.layout = layout
-
-    def detect(self, *args):
-        return self.layout
-
-
-def test_get_page_elements(monkeypatch, mock_page_layout):
-    monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout))
-    monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
-
-    image = np.random.randint(12, 24, (40, 40))
-    page = PDFPage(number=0, image=image, layout=mock_page_layout)
-
-    elements = page.get_elements(inplace=False)
-
-    assert str(elements[0]) == "A Catchy Title"
-    assert str(elements[1]).startswith("A very repetitive narrative.")
-
-    page.get_elements(inplace=True)
-    assert elements == page.elements
-
-
-def test_get_page_elements_with_ocr(monkeypatch):
-    monkeypatch.setattr(PDFPage, "ocr", lambda *args: "An Even Catchier TItle")
-
-    rectangle = Rectangle(2, 4, 6, 8)
-    text_block = TextBlock(rectangle, text=None, type="Title")
-    layout = Layout([text_block])
-
-    monkeypatch.setattr(detectron2, "model", MockLayoutModel(layout))
-    monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
-
-    image = np.random.randint(12, 24, (40, 40))
-    page = PDFPage(number=0, image=image, layout=layout)
-    page.get_elements()
-
-    assert str(page) == "An Even Catchier TItle"
-
-
-def test_read_pdf(monkeypatch, mock_page_layout):
-    image = np.random.randint(12, 24, (40, 40))
-    images = [image, image]
-
-    layouts = Layout([mock_page_layout, mock_page_layout])
-
-    monkeypatch.setattr(detectron2, "model", MockLayoutModel(mock_page_layout))
-    monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
-
-    with patch.object(lp, "load_pdf", return_value=(layouts, images)):
-        doc = PDFDocument.from_file("fake-file.pdf")
-
-        assert str(doc).startswith("A Catchy Title")
-        assert str(doc).count("A Catchy Title") == 2  # Once for each page
-        assert str(doc).endswith("A very repetitive narrative. ")
-
-        pages = doc.pages
-        assert str(doc) == "\n\n".join([str(page) for page in pages])
diff --git a/test_unstructured/models/layout/test_detectron2.py b/test_unstructured/models/layout/test_detectron2.py
deleted file mode 100644
index 7cf6d82567..0000000000
--- a/test_unstructured/models/layout/test_detectron2.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import pytest
-from unittest.mock import patch
-
-import unstructured.models.layout.detectron2 as detectron2
-
-
-class MockDetectron2LayoutModel:
-    def __init__(self, *args, **kwargs):
-        pass
-
-
-def test_load_model(monkeypatch):
-    monkeypatch.setattr(detectron2, "Detectron2LayoutModel", MockDetectron2LayoutModel)
-
-    with patch.object(detectron2, "is_detectron2_available", return_value=True):
-        detectron2.load_model()
-
-    assert isinstance(detectron2.model, MockDetectron2LayoutModel)
-
-
-def test_load_model_raises_when_not_available():
-    with patch.object(detectron2, "is_detectron2_available", return_value=False):
-        with pytest.raises(ImportError):
-            detectron2.load_model()
diff --git a/test_unstructured/models/ocr/test_tesseract.py b/test_unstructured/models/ocr/test_tesseract.py
deleted file mode 100644
index 30a1d4c11b..0000000000
--- a/test_unstructured/models/ocr/test_tesseract.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import pytest
-from unittest.mock import patch
-
-import unstructured.models.ocr.tesseract as tesseract
-
-
-class MockTesseractAgent:
-    def __init__(self, languages):
-        pass
-
-
-def test_load_agent(monkeypatch):
-    monkeypatch.setattr(tesseract, "TesseractAgent", MockTesseractAgent)
-
-    with patch.object(tesseract, "is_pytesseract_available", return_value=True):
-        tesseract.load_agent()
-
-    assert isinstance(tesseract.ocr_agent, MockTesseractAgent)
-
-
-def test_load_agent_raises_when_not_available():
-    with patch.object(tesseract, "is_pytesseract_available", return_value=False):
-        with pytest.raises(ImportError):
-            tesseract.load_agent()
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 68100083db..086e684314 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.6-dev1"  # pragma: no cover
+__version__ = "0.3.0-dev1"  # pragma: no cover
diff --git a/unstructured/documents/pdf.py b/unstructured/documents/pdf.py
deleted file mode 100644
index f984e8f325..0000000000
--- a/unstructured/documents/pdf.py
+++ /dev/null
@@ -1,104 +0,0 @@
-from __future__ import annotations
-from typing import List, Optional, Union
-
-import layoutparser as lp
-import numpy as np
-from PIL import Image
-
-from unstructured.logger import get_logger
-from unstructured.documents.base import Document, Page
-from unstructured.documents.elements import Element, NarrativeText, Title
-import unstructured.models.ocr.tesseract as tesseract
-import unstructured.models.layout.detectron2 as detectron2
-
-logger = get_logger()
-
-
-class PDFDocument(Document):
-    """Class for handling documents that are saved as .pdf files. For .pdf files, a
-    document image analysis (DIA) model detects the layout of the page prior to extracting
-    element."""
-
-    def __init__(self):
-        print(
-            """
-
-======================================================================
-WARNING: PDF parsing capabilities in unstructured is still experimental
-======================================================================
-
-"""
-        )
-        super().__init__()
-
-    @classmethod
-    def from_file(cls, filename: str):
-        logger.info(f"Reading PDF for file: {filename} ...")
-        layouts, images = lp.load_pdf(filename, load_images=True)
-        pages: List[Page] = list()
-        for i, layout in enumerate(layouts):
-            image = images[i]
-            # NOTE(robinson) - In the future, maybe we detect the page number and default
-            # to the index if it is not detected
-            page = PDFPage(number=i, image=image, layout=layout)
-            page.get_elements()
-            pages.append(page)
-        return cls.from_pages(pages)
-
-
-class PDFPage(Page):
-    """Class for an individual PDF page."""
-
-    def __init__(self, number: int, image: Image, layout: lp.Layout):
-        self.image = image
-        self.image_array: Union[np.ndarray, None] = None
-        self.layout = layout
-        super().__init__(number=number)
-
-    def get_elements(self, inplace=True) -> Optional[List[Element]]:
-        """Uses a layoutparser model to detect the elements on the page."""
-        logger.info("Detecting page elements ...")
-        detectron2.load_model()
-
-        elements: List[Element] = list()
-        # NOTE(mrobinson) - We'll want make this model inference step some kind of
-        # remote call in the future.
-        image_layout = detectron2.model.detect(self.image)
-        # NOTE(robinson) - This orders the page from top to bottom. We'll need more
-        # sophisticated ordering logic for more complicated layouts.
-        image_layout.sort(key=lambda element: element.coordinates[1], inplace=True)
-        for item in image_layout:
-            if item.type in ["Text", "Title"]:
-                text_blocks = self.layout.filter_by(item, center=True)
-                text = str()
-                for text_block in text_blocks:
-                    # NOTE(robinson) - If the text attribute is None, that means the PDF isn't
-                    # already OCR'd and we have to send the snippet out for OCRing.
-                    if text_block.text is None:
-                        text_block.text = self.ocr(text_block)
-                text = " ".join([x for x in text_blocks.get_texts() if x])
-
-                if item.type == "Text":
-                    elements.append(NarrativeText(text=text))
-                elif item.type == "Title":
-                    elements.append(Title(text=text))
-
-        if inplace:
-            self.elements = elements
-            return None
-        return elements
-
-    def ocr(self, text_block: lp.TextBlock) -> str:
-        """Runs a cropped text block image through and OCR agent."""
-        logger.debug("Running OCR on text block ...")
-        tesseract.load_agent()
-        image_array = self._get_image_array()
-        padded_block = text_block.pad(left=5, right=5, top=5, bottom=5)
-        cropped_image = padded_block.crop_image(image_array)
-        return tesseract.ocr_agent.detect(cropped_image)
-
-    def _get_image_array(self) -> Union[np.ndarray, None]:
-        """Converts the raw image into a numpy array."""
-        if self.image_array is None:
-            self.image_array = np.array(self.image)
-        return self.image_array
diff --git a/unstructured/models/layout/__init__.py b/unstructured/models/layout/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/unstructured/models/layout/detectron2.py b/unstructured/models/layout/detectron2.py
deleted file mode 100644
index 2c862b729c..0000000000
--- a/unstructured/models/layout/detectron2.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import sys
-
-if sys.version_info < (3, 8):
-    from typing_extensions import Final
-else:
-    from typing import Final
-
-from layoutparser.models.detectron2.layoutmodel import (
-    is_detectron2_available,
-    Detectron2LayoutModel,
-)
-
-from unstructured.logger import get_logger
-
-logger = get_logger()
-
-model: Detectron2LayoutModel = None
-
-DETECTRON_CONFIG: Final = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config"
-
-
-def load_model():
-    """Loads the detectron2 model as a global variable to ensure that we are not loading
-    it multiple times."""
-    global model
-
-    if not is_detectron2_available():
-        raise ImportError(
-            "Failed to load the Detectron2 model. Ensure that the Detectron2 "
-            "module is correctly installed."
-        )
-
-    if model is None:
-        logger.info("Loading the Detectron2 layout model ...")
-        model = Detectron2LayoutModel(
-            DETECTRON_CONFIG,
-            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
-            label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
-        )
diff --git a/unstructured/models/ocr/__init__.py b/unstructured/models/ocr/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/unstructured/models/ocr/tesseract.py b/unstructured/models/ocr/tesseract.py
deleted file mode 100644
index 178f48219c..0000000000
--- a/unstructured/models/ocr/tesseract.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from layoutparser.ocr.tesseract_agent import is_pytesseract_available, TesseractAgent
-
-from unstructured.logger import get_logger
-
-ocr_agent: TesseractAgent = None
-
-logger = get_logger()
-
-
-def load_agent():
-    """Loads the Tesseract OCR agent as a global variable to ensure that we only load it once."""
-    global ocr_agent
-
-    if not is_pytesseract_available():
-        raise ImportError(
-            "Failed to load Tesseract. Ensure that Tesseract is installed. Example command: \n"
-            "    >>> sudo apt install -y tesseract-ocr"
-        )
-
-    if ocr_agent is None:
-        logger.info("Loading the Tesseract OCR agent ...")
-        ocr_agent = TesseractAgent(languages="eng")