feat: new partitioning brick that calls the document image analysis A…

…PI (Unstructured-IO#68) * docs: add new feature to the CHANGELOG.md, bump the version, update __version__.py * feat: new partition to call the document image analysis API * fix: remove duplicated dependency on partition.py * fix: linting error due to line-length > 100 * test: add test to call partition_pdf brick * chore: new short example-doc pdf for speed up in test X8 * fix: add missing return statement to _read to pass check * feat: new partitioning brick to call doc parse API * docs: version update fix in CHANGELOG * refactor: no nested ifs * docs: documentation for new brick partition_pdf * refactor: made tidy * docs: minor doc refactor Co-authored-by: Sebastian Laverde <sebastian@unstructured.io>
siddartha-RE · Nov 16, 2022 · baa15d0 · baa15d0
1 parent 83e7f9d
commit baa15d0
Show file tree

Hide file tree

Showing 6 changed files with 80 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,7 @@
-## 0.2.6-dev0
+## 0.2.6-dev1
 
 * Small change to how _read is placed within the inheritance structure since it doesn't really apply to pdf
+* Add partitioning brick for calling the document image analysis API
 
 ## 0.2.5
 

diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -174,6 +174,23 @@ Examples:
 
   # Returns False because the text is more than 1% caps
   exceeds_cap_ratio(example_2, threshold=0.01)
+  
+  
+``partition_pdf``
+---------------------
+
+The ``partition_pdf`` function segments a PDF document by calling the document image analysis API.
+The ``url`` and ``token`` parameters allow users to point the function at a self-hosted
+inference API, if desired.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.nlp.partition import partition_pdf
+
+  # Returns a List[Element] present in the pages of the parsed pdf document
+  elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf")
 
 
 ########
@@ -856,4 +873,4 @@ Example:
 
 The output is a list of dictionaries, each one with two keys:
 "text" with the content of the element and 
-"entities" with an empty list.
+"entities" with an empty list.
diff --git a/example-docs/layout-parser-paper-fast.pdf b/example-docs/layout-parser-paper-fast.pdf
diff --git a/test_unstructured/nlp/test_partition.py b/test_unstructured/nlp/test_partition.py
@@ -133,3 +133,12 @@ def test_sentence_count(monkeypatch):
 def test_item_titles():
     text = "ITEM 1(A). THIS IS A TITLE"
     assert partition.sentence_count(text, 3) < 2
+
+
+def test_partition_pdf(filename="example-docs/layout-parser-paper-fast.pdf"):
+    """Partition the sample PDF and check that the first element is the paper's title."""
+    expected_title = (
+        "LayoutParser : A Uniﬁed Toolkit for Deep Learning Based Document Image Analysis"
+    )
+    first_element = partition.partition_pdf(filename)[0]
+    assert first_element["type"] == "Title"
+    assert first_element["text"] == expected_title
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.6-dev0"  # pragma: no cover
+__version__ = "0.2.6-dev1"  # pragma: no cover
diff --git a/unstructured/nlp/partition.py b/unstructured/nlp/partition.py
@@ -1,13 +1,14 @@
-"""parition.py implements logic for partining plain text documents into sections."""
-from typing import List, Optional
+"""partition.py implements logic for partitioning plain text documents into sections."""
 import sys
+import requests  # type: ignore
 
 if sys.version_info < (3, 8):
-    from typing_extensions import Final
+    from typing_extensions import Final, List, Optional
 else:
-    from typing import Final
+    from typing import Final, List, Optional
 
 from unstructured.cleaners.core import remove_punctuation
+from unstructured.documents.elements import Element, Text
 from unstructured.nlp.patterns import UNICODE_BULLETS_RE
 from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
 from unstructured.logger import get_logger
@@ -112,3 +113,48 @@ def exceeds_cap_ratio(text: str, threshold: float = 0.3) -> bool:
     capitalized = sum([word.istitle() or word.isupper() for word in tokens])
     ratio = capitalized / len(tokens)
     return ratio > threshold
+
+
+def partition_pdf(
+    filename: str = "",
+    file: Optional[bytes] = None,
+    url: str = "https://ml.unstructured.io/",
+    template: Optional[str] = "base-model",
+    token: Optional[str] = None,
+) -> List[Element]:
+    """Partitions a PDF by sending it to the document image analysis API.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path. The file is opened in binary mode when
+        ``file`` is not provided.
+    file
+        The document content to upload, e.g. open(filename, "rb"). When given, it is used
+        as the upload payload instead of opening ``filename``.
+    url
+        Base URL of the inference API; point it at a self-hosted instance if desired. A
+        trailing slash is optional.
+    template
+        A string defining the model to be used. The default "base-model" maps to the
+        layout/pdf route; any other value is used as the route itself.
+    token
+        Authentication token for a self-hosted url. When given, it is sent as a Bearer
+        token and the healthcheck request is skipped.
+
+    Returns
+    -------
+    The elements of every page in the API response, in page order, as decoded from the
+    response JSON. If the healthcheck or the request does not return status 200, a list
+    with a single Text element describing the error is returned instead.
+
+    Raises
+    ------
+    FileNotFoundError
+        If neither ``filename`` nor ``file`` is specified.
+    """
+    if not filename and not file:
+        raise FileNotFoundError("No filename nor file were specified")
+
+    # Normalize the base URL so every route is joined with exactly one slash.
+    base_url = url.rstrip("/")
+
+    # The healthcheck only runs for unauthenticated calls, and only a real healthcheck
+    # response may abort the request; authenticated calls go straight to the upload.
+    if not token and requests.get(url=f"{base_url}/healthcheck").status_code != 200:
+        return [Text(text="error: endpoint api healthcheck has failed!")]
+
+    route = "layout/pdf" if template == "base-model" else template
+    # Only send an Authorization header when there is a token to put in it.
+    headers = {"Authorization": f"Bearer {token}"} if token else {}
+
+    # Close the file afterwards only if it was opened here; a caller-supplied ``file`` is
+    # left untouched.
+    opened_file = None if file else open(filename, "rb")
+    try:
+        response = requests.post(
+            url=f"{base_url}/{route}",
+            headers=headers,
+            files={"file": (filename, file if file else opened_file)},
+        )
+    finally:
+        if opened_file is not None:
+            opened_file.close()
+
+    if response.status_code != 200:
+        return [Text(text=f"error: response status code = {response.status_code}")]
+
+    pages = response.json()["pages"]
+    return [element for page in pages for element in page["elements"]]
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.2.6-dev0" # pragma: no cover
		__version__ = "0.2.6-dev1" # pragma: no cover