feat: extract metadata from .docx, .xlsx, and .jpg (#113)

* add python-docx dependency
* added function for extracting metadata from word documents
* add openpyxl
* added get_jpg_metadata; fixed typing
* bump changelog
* added pillow to dependencies
Unstructured-IO · Dec 26, 2022 · b14f6ac · b14f6ac
1 parent e0a76ef
commit b14f6ac
Show file tree

Hide file tree

Showing 16 changed files with 330 additions and 30 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,10 @@
-## 0.3.5-dev2
+## 0.3.5-dev3
 
 * Add new pattern to recognize plain text dash bullets
 * Add test for bullet patterns
 * Fix for `partition_html` that allows for processing `div` tags that have both text and child
   elements
+* Add ability to extract document metadata from `.docx`, `.xlsx`, and `.jpg` files.
 
 ## 0.3.4
 

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with python 3.8
+# This file is autogenerated by pip-compile with python 3.10
 # To update, run:
 #
 #    pip-compile requirements/build.in
@@ -22,8 +22,6 @@ idna==3.4
     # via requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==5.0.0
-    # via sphinx
 jinja2==3.1.2
     # via sphinx
 markupsafe==2.1.1
@@ -60,5 +58,3 @@ sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
 urllib3==1.26.12
     # via requests
-zipp==3.10.0
-    # via importlib-metadata
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
@@ -234,3 +234,39 @@ for reading in a document with an XSLT stylesheet is as follows:
 If you read from a stylesheet ``HTMLDocument`` will use the ``etree.XMLParser`` by default
 instead of the ``etree.HTMLParser`` because ``HTMLDocument`` assumes you want to convert
 your raw XML to HTML.
+
+
+##################################
+Extracting Metadata from Documents
+##################################
+
+The ``unstructured`` library includes utilities for extracting metadata from
+documents. Currently, there is support for extracting metadata from ``.docx``,
+``.xlsx``, and ``.jpg`` documents. When you call these functions, the return type
+is a ``Metadata`` data class that you can convert to a dictionary by calling the
+``to_dict()`` method. If you extract metadata from a ``.jpg`` document, the output
+will include EXIF metadata in the ``exif_data`` attribute, if it is available.
+Here is an example of how to use the metadata extraction functionality:
+
+
+.. code:: python
+
+  from unstructured.file_utils.metadata import get_jpg_metadata
+
+  filename = "example-docs/example.jpg"
+  metadata = get_jpg_metadata(filename=filename)
+
+
+You can also pass in a file-like object with:
+
+.. code:: python
+
+  from unstructured.file_utils.metadata import get_jpg_metadata
+
+  filename = "example-docs/example.jpg"
+  with open(filename, "rb") as f:
+      metadata = get_jpg_metadata(file=f)
+
+
+To extract metadata from ``.docx`` or ``.xlsx``, use ``get_docx_metadata`` and
+``get_xlsx_metadata``. The interfaces are the same as ``get_jpg_metadata``.
diff --git a/example-docs/example.jpg b/example-docs/example.jpg
diff --git a/example-docs/fake-excel.xlsx b/example-docs/fake-excel.xlsx
diff --git a/example-docs/fake.docx b/example-docs/fake.docx
diff --git a/requirements/base.txt b/requirements/base.txt
@@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with python 3.8
+# This file is autogenerated by pip-compile with python 3.10
 # To update, run:
 #
 #    pip-compile --output-file=requirements/base.txt
@@ -16,6 +16,8 @@ click==8.1.3
     # via nltk
 deprecated==1.2.13
     # via argilla
+et-xmlfile==1.1.0
+    # via openpyxl
 h11==0.9.0
     # via httpcore
 httpcore==0.11.1
@@ -27,7 +29,9 @@ idna==3.4
 joblib==1.2.0
     # via nltk
 lxml==4.9.1
-    # via unstructured (setup.py)
+    # via
+    #   python-docx
+    #   unstructured (setup.py)
 monotonic==1.6
     # via argilla
 nltk==3.7
@@ -36,16 +40,22 @@ numpy==1.23.5
     # via
     #   argilla
     #   pandas
+openpyxl==3.0.10
+    # via unstructured (setup.py)
 packaging==21.3
     # via argilla
 pandas==1.5.2
     # via argilla
+pillow==9.3.0
+    # via unstructured (setup.py)
 pydantic==1.10.2
     # via argilla
 pyparsing==3.0.9
     # via packaging
 python-dateutil==2.8.2
     # via pandas
+python-docx==0.8.11
+    # via unstructured (setup.py)
 pytz==2022.6
     # via pandas
 regex==2022.10.31

diff --git a/requirements/build.txt b/requirements/build.txt
@@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with python 3.8
+# This file is autogenerated by pip-compile with python 3.10
 # To update, run:
 #
 #    pip-compile requirements/build.in
@@ -22,8 +22,6 @@ idna==3.4
     # via requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==5.0.0
-    # via sphinx
 jinja2==3.1.2
     # via sphinx
 markupsafe==2.1.1
@@ -60,5 +58,3 @@ sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
 urllib3==1.26.12
     # via requests
-zipp==3.10.0
-    # via importlib-metadata
diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with python 3.8
+# This file is autogenerated by pip-compile with python 3.10
 # To update, run:
 #
 #    pip-compile requirements/dev.in
@@ -40,10 +40,6 @@ executing==1.0.0
     # via stack-data
 fastjsonschema==2.16.2
     # via nbformat
-importlib-metadata==5.0.0
-    # via nbconvert
-importlib-resources==5.10.0
-    # via jsonschema
 ipykernel==6.15.3
     # via
     #   ipywidgets
@@ -143,8 +139,6 @@ pickleshare==0.7.5
     # via ipython
 pip-tools==6.10.0
     # via -r requirements/dev.in
-pkgutil-resolve-name==1.3.10
-    # via jsonschema
 platformdirs==2.5.4
     # via jupyter-core
 prometheus-client==0.14.1
@@ -233,10 +227,6 @@ wheel==0.37.1
     # via pip-tools
 widgetsnbextension==4.0.3
     # via ipywidgets
-zipp==3.10.0
-    # via
-    #   importlib-metadata
-    #   importlib-resources
 
 # The following packages are considered to be unsafe in a requirements file:
 # pip

diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt
@@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with python 3.8
+# This file is autogenerated by pip-compile with python 3.10
 # To update, run:
 #
 #    pip-compile --extra=huggingface --output-file=requirements/huggingface.txt
@@ -21,6 +21,8 @@ click==8.1.3
     #   sacremoses
 deprecated==1.2.13
     # via argilla
+et-xmlfile==1.1.0
+    # via openpyxl
 filelock==3.8.2
     # via
     #   huggingface-hub
@@ -44,7 +46,9 @@ joblib==1.2.0
 langdetect==1.0.9
     # via unstructured (setup.py)
 lxml==4.9.1
-    # via unstructured (setup.py)
+    # via
+    #   python-docx
+    #   unstructured (setup.py)
 monotonic==1.6
     # via argilla
 nltk==3.7
@@ -54,19 +58,25 @@ numpy==1.23.4
     #   argilla
     #   pandas
     #   transformers
+openpyxl==3.0.10
+    # via unstructured (setup.py)
 packaging==21.3
     # via
     #   argilla
     #   huggingface-hub
     #   transformers
 pandas==1.5.2
     # via argilla
+pillow==9.3.0
+    # via unstructured (setup.py)
 pydantic==1.10.2
     # via argilla
 pyparsing==3.0.9
     # via packaging
 python-dateutil==2.8.2
     # via pandas
+python-docx==0.8.11
+    # via unstructured (setup.py)
 pytz==2022.6
     # via pandas
 pyyaml==6.0

diff --git a/requirements/test.txt b/requirements/test.txt
@@ -1,5 +1,5 @@
 #
-# This file is autogenerated by pip-compile with python 3.8
+# This file is autogenerated by pip-compile with python 3.10
 # To update, run:
 #
 #    pip-compile requirements/test.in
@@ -80,7 +80,6 @@ tomli==2.0.1
     #   pytest
 typing-extensions==4.3.0
     # via
-    #   black
     #   mypy
     #   pydantic
 urllib3==1.26.12

diff --git a/setup.py b/setup.py
@@ -48,9 +48,12 @@
     version=__version__,
     entry_points={},
     install_requires=[
+        "argilla",
         "lxml",
         "nltk",
-        "argilla",
+        "openpyxl",
+        "pillow",
+        "python-docx",
         # NOTE(robinson) - The following dependencies are pinned
         # to address security scans
         "certifi>=2022.12.07",

diff --git a/test_unstructured/file_utils/test_metadata.py b/test_unstructured/file_utils/test_metadata.py
@@ -0,0 +1,107 @@
+import datetime
+import os
+import pathlib
+import pytest
+
+import docx
+import openpyxl
+
+import unstructured.file_utils.metadata as meta
+
+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+EXAMPLE_JPG_FILENAME = os.path.join(DIRECTORY, "..", "..", "example-docs", "example.jpg")
+
+
+def test_get_docx_metadata_from_filename(tmpdir):
+    filename = os.path.join(tmpdir, "test-doc.docx")
+
+    document = docx.Document()
+    document.add_paragraph("Lorem ipsum dolor sit amet.")
+    document.core_properties.author = "Mr. Miagi"
+    document.save(filename)
+
+    metadata = meta.get_docx_metadata(filename=filename)
+    assert metadata.author == "Mr. Miagi"
+    assert metadata.to_dict()["author"] == "Mr. Miagi"
+
+
+def test_get_docx_metadata_from_file(tmpdir):
+    filename = os.path.join(tmpdir, "test-doc.docx")
+
+    document = docx.Document()
+    document.add_paragraph("Lorem ipsum dolor sit amet.")
+    document.core_properties.author = "Mr. Miagi"
+    document.save(filename)
+
+    with open(filename, "rb") as f:
+        metadata = meta.get_docx_metadata(file=f)
+    assert metadata.author == "Mr. Miagi"
+
+
+def test_get_docx_metadata_raises_without_file_or_filename():
+    with pytest.raises(FileNotFoundError):
+        meta.get_docx_metadata()
+
+
+def test_get_xlsx_metadata_from_filename(tmpdir):
+    filename = os.path.join(tmpdir, "test-excel.xlsx")
+
+    workbook = openpyxl.Workbook()
+    workbook.properties.creator = "Mr. Miagi"  # expected to surface as metadata.author
+    workbook.save(filename)
+
+    metadata = meta.get_xlsx_metadata(filename=filename)
+    assert metadata.author == "Mr. Miagi"  # compare; a bare assignment verifies nothing
+
+
+def test_get_xlsx_metadata_from_file(tmpdir):
+    filename = os.path.join(tmpdir, "test-excel.xlsx")
+
+    workbook = openpyxl.Workbook()
+    workbook.properties.creator = "Mr. Miagi"  # expected to surface as metadata.author
+    workbook.save(filename)
+
+    with open(filename, "rb") as f:  # exercise the file-object code path
+        metadata = meta.get_xlsx_metadata(file=f)
+    assert metadata.author == "Mr. Miagi"  # compare; a bare assignment verifies nothing
+
+
+def test_get_xlsx_metadata_raises_without_file_or_filename():
+    with pytest.raises(FileNotFoundError):
+        meta.get_xlsx_metadata()
+
+
+def test_get_jpg_metadata_from_filename():
+    metadata = meta.get_jpg_metadata(filename=EXAMPLE_JPG_FILENAME)
+    assert metadata.modified == datetime.datetime(2003, 12, 14, 12, 1, 44)
+    assert metadata.exif_data["Make"] == "Canon"
+
+
+def test_get_jpg_metadata_from_file():
+    with open(EXAMPLE_JPG_FILENAME, "rb") as f:
+        metadata = meta.get_jpg_metadata(file=f)
+    assert metadata.modified == datetime.datetime(2003, 12, 14, 12, 1, 44)
+    assert metadata.exif_data["Make"] == "Canon"
+
+
+def test_get_jpg_metadata_raises_without_file_or_filename():
+    with pytest.raises(FileNotFoundError):
+        meta.get_jpg_metadata()
+
+
+def test_get_exif_datetime():
+    exif_data = {"DateTime": "2022:12:23 15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}
+    date = meta._get_exif_datetime(exif_data, "DateTime")
+    assert date == datetime.datetime(2022, 12, 23, 15, 49, 0)
+
+
+def test_get_exif_datetime_ignores_bad_formats():
+    exif_data = {"DateTime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}
+    date = meta._get_exif_datetime(exif_data, "DateTime")
+    assert date is None
+
+
+def test_get_exif_datetime_ignores_missing_key():
+    exif_data = {"Datetime": "2022-12-23TZ15:49:00", "DateTimeOriginal": "2020:12:14 12:00:00"}
+    date = meta._get_exif_datetime(exif_data, "DateTimeDigitized")
+    assert date is None
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.3.5-dev2"  # pragma: no cover
+__version__ = "0.3.5-dev3"  # pragma: no cover
diff --git a/unstructured/file_utils/__init__.py b/unstructured/file_utils/__init__.py
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.3.5-dev2" # pragma: no cover
		__version__ = "0.3.5-dev3" # pragma: no cover