diff --git a/CHANGELOG.md b/CHANGELOG.md
index c59655d9d1..1c0a97cd23 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,17 @@
+## 0.5.8-dev0
+
+### Enhancements
+
+* `detect_filetype` now does a final fallback to file extension.
+
+### Features
+
+* Added `partition_msg` for processing MSFT Outlook .msg files.
+
+### Fixes
+
+* `convert_file_to_text` now passes through the `source_format` and `target_format` kwargs.
+
## 0.5.7
### Enhancements
diff --git a/Makefile b/Makefile
index c0b74c846d..9dd5f518bd 100644
--- a/Makefile
+++ b/Makefile
@@ -85,7 +85,7 @@ install-unstructured-inference:
.PHONY: install-detectron2
install-detectron2:
- pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
+ pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@e2ce8dc1ab097891395d324abaffe9cf298503d1#egg=detectron2"
## install-local-inference: installs requirements for local inference
.PHONY: install-local-inference
diff --git a/README.md b/README.md
index 0e55738a28..7c60b8ec0e 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ Once pulled, you can create a container from this image and shell to it.
docker run --platform linux/amd64 -d -t --name unstructured quay.io/unstructured-io/unstructured:latest
# this will drop you into a bash shell where the Docker image is running
-docker exec -it unstructured bash
+docker exec -it unstructured bash
```
You can also build your own Docker image.
@@ -167,7 +167,7 @@ you can also uninstall the hooks with `pre-commit uninstall`.
You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below.
The following examples show how to get started with the `unstructured` library.
-You can parse **TXT**, **HTML**, **PDF**, **EML**, **EPUB**, **DOC**, **DOCX**, **PPT**, **PPTX**, **JPG**,
+You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **EPUB**, **DOC**, **DOCX**, **PPT**, **PPTX**, **JPG**,
and **PNG** documents with one line of code!
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
@@ -182,7 +182,7 @@ If you are using the `partition` brick, you may need to install additional param
instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
`partition` will always apply the default arguments. If you need
advanced features, use a document-specific brick. The `partition` brick currently works for
-`.txt`, `.doc`, `.docx`, `.ppt`, `.pptx`, `.jpg`, `.png`, `.eml`, `.html`, and `.pdf` documents.
+`.txt`, `.doc`, `.docx`, `.ppt`, `.pptx`, `.jpg`, `.png`, `.eml`, `.msg`, `.html`, and `.pdf` documents.
```python
from unstructured.partition.auto import partition
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
index a0faeb192c..38b4306fa1 100644
--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
file type and route it to the appropriate partitioning brick. All partitioning bricks
called within ``partition`` are called using the default kwargs. Use the document-type
specific bricks if you need to apply non-default settings.
-``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.eml``, ``.epub``, ``.html``, ``.pdf``,
+``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.epub``, ``.html``, ``.pdf``,
``.png``, ``.jpg``, and ``.txt`` files.
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
``.png``, and ``.jpg``.
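+
+For example, a minimal sketch that partitions a PDF with page breaks included (the filename is illustrative):
+
+.. code:: python
+
+    from unstructured.partition.auto import partition
+
+    elements = partition(filename="example-docs/layout-parser-paper.pdf", include_page_breaks=True)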
@@ -324,6 +324,21 @@ Examples:
elements = partition_email(text=text, include_headers=True)
+``partition_msg``
+-----------------
+
+The ``partition_msg`` function processes ``.msg`` files, a filetype specific
+to email exports from Microsoft Outlook.
+
+Examples:
+
+.. code:: python
+
+ from unstructured.partition.msg import partition_msg
+
+ elements = partition_msg(filename="example-docs/fake-email.msg")
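+
+``partition_msg`` also accepts a file-like object opened in binary mode:
+
+.. code:: python
+
+    with open("example-docs/fake-email.msg", "rb") as f:
+        elements = partition_msg(file=f)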
+
+
``partition_epub``
---------------------
diff --git a/docs/source/integrations.rst b/docs/source/integrations.rst
index 37238b7275..3f01950a80 100644
--- a/docs/source/integrations.rst
+++ b/docs/source/integrations.rst
@@ -1,46 +1,46 @@
Integrations
-======
-Integrate your model development pipeline with your favorite machine learning frameworks and libraries,
-and prepare your data for ingestion into downstream systems. Most of our integrations come in the form of
-`staging bricks `_,
+=============
+Integrate your model development pipeline with your favorite machine learning frameworks and libraries,
+and prepare your data for ingestion into downstream systems. Most of our integrations come in the form of
+`staging bricks `_,
which take a list of ``Element`` objects as input and return formatted dictionaries as output.
``Integration with Argilla``
---------------
+----------------------------
You can convert a list of ``Text`` elements to an `Argilla `_ ``Dataset`` using the `stage_for_argilla `_ staging brick. Specify the type of dataset to be generated using the ``argilla_task`` parameter. Valid values are ``"text_classification"``, ``"token_classification"``, and ``"text2text"``. Follow the link for more details on usage.
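+
+A minimal sketch (assuming ``elements`` is a list of ``Text`` elements produced by a partitioning brick):
+
+.. code:: python
+
+    from unstructured.staging.argilla import stage_for_argilla
+
+    argilla_dataset = stage_for_argilla(elements, argilla_task="text_classification")
+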
``Integration with Datasaur``
---------------
+------------------------------
You can format a list of ``Text`` elements as input to token based tasks in `Datasaur `_ using the `stage_for_datasaur `_ staging brick. You will obtain a list of dictionaries indexed by the keys ``"text"`` with the content of the element, and ``"entities"`` with an empty list. Follow the link to learn how to customise your entities and for more details on usage.
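+
+A minimal sketch, again assuming ``elements`` is a list of ``Text`` elements:
+
+.. code:: python
+
+    from unstructured.staging.datasaur import stage_for_datasaur
+
+    datasaur_data = stage_for_datasaur(elements)
+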
``Integration with Hugging Face``
---------------
-You can prepare ``Text`` elements for processing in Hugging Face `Transformers `_
-pipelines by splitting the elements into chunks that fit into the model's attention window using the `stage_for_transformers `_ staging brick. You can customise the transformation by defining
-the ``buffer`` and ``window_size``, the ``split_function`` and the ``chunk_separator``. if you need to operate on
+----------------------------------
+You can prepare ``Text`` elements for processing in Hugging Face `Transformers `_
+pipelines by splitting the elements into chunks that fit into the model's attention window using the `stage_for_transformers `_ staging brick. You can customise the transformation by defining
+the ``buffer`` and ``window_size``, the ``split_function``, and the ``chunk_separator``. If you need to operate on
text directly instead of ``unstructured`` ``Text`` objects, use the `chunk_by_attention_window `_ helper function. Follow the links for more details on usage.
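+
+A minimal sketch (the checkpoint name is arbitrary, and ``elements`` is assumed to be a list of ``Text`` elements):
+
+.. code:: python
+
+    from transformers import AutoTokenizer
+
+    from unstructured.staging.huggingface import stage_for_transformers
+
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+    chunks = stage_for_transformers(elements, tokenizer, buffer=10)
+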
``Integration with Labelbox``
---------------
+------------------------------
You can format your outputs for use with `LabelBox `_ using the `stage_for_label_box `_ staging brick. LabelBox accepts cloud-hosted data and does not support importing text directly. With this integration you can stage the data files in the ``output_directory`` to be uploaded to a cloud storage service (such as S3 buckets) and get a config of type ``List[Dict[str, Any]]`` that can be written to a ``.json`` file and imported into LabelBox. Follow the link to see how to generate the ``config.json`` file that can be used with LabelBox, how to upload the staged data files to an S3 bucket, and for more details on usage.
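+
+A minimal sketch (the ``output_directory`` and ``url_prefix`` values are placeholders for your own storage location, and ``elements`` is assumed to be a list of ``Text`` elements):
+
+.. code:: python
+
+    from unstructured.staging.label_box import stage_for_label_box
+
+    config = stage_for_label_box(
+        elements,
+        output_directory="tmp/",
+        url_prefix="https://my-bucket.s3.amazonaws.com/data",
+    )
+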
``Integration with Label Studio``
---------------
-You can format your outputs for upload to `Label Studio `_ using the `stage_for_label_studio `_ staging brick. After running ``stage_for_label_studio``, you can write the results
-to a JSON folder that is ready to be included in a new Label Studio project. You can also include pre-annotations and predictions
+----------------------------------
+You can format your outputs for upload to `Label Studio `_ using the `stage_for_label_studio `_ staging brick. After running ``stage_for_label_studio``, you can write the results
+to a JSON file that is ready to be included in a new Label Studio project. You can also include pre-annotations and predictions
as part of your upload.
Check our example notebook to format and upload the risk section from an SEC filing to Label Studio for a sentiment analysis labeling task `here `_ . Follow the link for more details on usage, and check `Label Studio docs `_ for a full list of options for labels and annotations.
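+
+A minimal sketch (assuming ``elements`` is a list of ``Text`` elements):
+
+.. code:: python
+
+    import json
+
+    from unstructured.staging.label_studio import stage_for_label_studio
+
+    label_studio_data = stage_for_label_studio(elements)
+
+    with open("label_studio.json", "w") as f:
+        json.dump(label_studio_data, f, indent=4)
+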
``Integration with LangChain``
---------------
-Our integration with `LangChain `_ makes it incredibly easy to combine language models with your data, no matter what form it is in. The `Unstructured.io File Loader `_ extracts the text from a variety of unstructured text files using our ``unstructured`` library. It is designed to be used as a way to load data into `LlamaIndex `_ and/or subsequently used as a Tool in a LangChain Agent. See `here `_ for more `LlamaHub `_ examples.
+--------------------------------
+Our integration with `LangChain `_ makes it incredibly easy to combine language models with your data, no matter what form it is in. The `Unstructured.io File Loader `_ extracts the text from a variety of unstructured text files using our ``unstructured`` library. It is designed to be used as a way to load data into `LlamaIndex `_ and/or subsequently used as a Tool in a LangChain Agent. See `here `_ for more `LlamaHub `_ examples.
To use ``Unstructured.io File Loader`` you will need to have LlamaIndex 🦙 (GPT Index) installed in your environment. Just ``pip install llama-index`` and then pass in a ``Path`` to a local file. Optionally, you may specify split_documents if you want each element generated by ``unstructured`` to be placed in a separate document. Here is a simple example on how to use it:
@@ -57,12 +57,12 @@ To use ``Unstructured.io File Loader`` you will need to have LlamaIndex 🦙 (GP
``Integration with Pandas``
---------------
-You can convert a list of ``Element`` objects to a Pandas dataframe with columns for
+----------------------------
+You can convert a list of ``Element`` objects to a Pandas dataframe with columns for
the text from each element and their types such as ``NarrativeText`` or ``Title`` using the `convert_to_dataframe `_ staging brick. Follow the link for more details on usage.
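+
+A minimal sketch (assuming ``elements`` is a list of ``Element`` objects):
+
+.. code:: python
+
+    from unstructured.staging.base import convert_to_dataframe
+
+    df = convert_to_dataframe(elements)
+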
``Integration with Prodigy``
---------------
-You can format your JSON or CSV outputs for use with `Prodigy `_ using the `stage_for_prodigy `_ and `stage_csv_for_prodigy `_ staging bricks. After running ``stage_for_prodigy`` |
+-----------------------------
+You can format your JSON or CSV outputs for use with `Prodigy `_ using the `stage_for_prodigy `_ and `stage_csv_for_prodigy `_ staging bricks. After running ``stage_for_prodigy`` |
``stage_csv_for_prodigy``, you can write the results to a ``.json`` | ``.jsonl`` or a ``.csv`` file that is ready to be used with Prodigy. Follow the links for more details on usage.
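+
+A minimal sketch for the JSON variant (assuming ``elements`` is a list of ``Text`` elements):
+
+.. code:: python
+
+    import json
+
+    from unstructured.staging.prodigy import stage_for_prodigy
+
+    prodigy_data = stage_for_prodigy(elements)
+
+    with open("prodigy.json", "w") as f:
+        json.dump(prodigy_data, f, indent=4)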
diff --git a/example-docs/fake-email.msg b/example-docs/fake-email.msg
new file mode 100644
index 0000000000..61ec3c4bbe
Binary files /dev/null and b/example-docs/fake-email.msg differ
diff --git a/requirements/base.txt b/requirements/base.txt
index a8f264fc7e..3fc0052f9d 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -4,6 +4,9 @@
#
# pip-compile --output-file=requirements/base.txt
#
+--extra-index-url https://pypi.ngc.nvidia.com
+--trusted-host pypi.ngc.nvidia.com
+
anyio==3.6.2
# via httpcore
argilla==1.5.0
@@ -50,12 +53,16 @@ markdown==3.4.3
# via unstructured (setup.py)
monotonic==1.6
# via argilla
+msg-parser==1.2.0
+ # via unstructured (setup.py)
nltk==3.8.1
# via unstructured (setup.py)
numpy==1.23.5
# via
# argilla
# pandas
+olefile==0.46
+ # via msg-parser
openpyxl==3.1.2
# via unstructured (setup.py)
packaging==23.0
diff --git a/setup.py b/setup.py
index e57221beb9..5126841c31 100644
--- a/setup.py
+++ b/setup.py
@@ -52,6 +52,7 @@
install_requires=[
"argilla",
"lxml",
+ "msg_parser",
"nltk",
"openpyxl",
"pandas",
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index be5e536b5e..60c08fa6ae 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -341,3 +341,17 @@ def test_auto_partition_epub_from_file():
elements = partition(file=f)
assert len(elements) > 0
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
+
+
+EXPECTED_MSG_OUTPUT = [
+ NarrativeText(text="This is a test email to use for unit tests."),
+ Title(text="Important points:"),
+ ListItem(text="Roses are red"),
+ ListItem(text="Violets are blue"),
+]
+
+
+def test_auto_partition_msg_from_filename():
+ filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
+ elements = partition(filename=filename)
+ assert elements == EXPECTED_MSG_OUTPUT
diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py
new file mode 100644
index 0000000000..02658e18cb
--- /dev/null
+++ b/test_unstructured/partition/test_msg.py
@@ -0,0 +1,60 @@
+import os
+import pathlib
+
+import msg_parser
+import pytest
+
+from unstructured.documents.elements import ListItem, NarrativeText, Title
+from unstructured.partition.msg import partition_msg
+
+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
+
+EXPECTED_MSG_OUTPUT = [
+ NarrativeText(text="This is a test email to use for unit tests."),
+ Title(text="Important points:"),
+ ListItem(text="Roses are red"),
+ ListItem(text="Violets are blue"),
+]
+
+
+def test_partition_msg_from_filename():
+ filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
+ elements = partition_msg(filename=filename)
+ assert elements == EXPECTED_MSG_OUTPUT
+
+
+class MockMsOxMessage:
+ def __init__(self, filename):
+ self.body = "Here is an email with plain text."
+
+
+def test_partition_msg_from_filename_with_text_content(monkeypatch):
+ monkeypatch.setattr(msg_parser, "MsOxMessage", MockMsOxMessage)
+ filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
+ elements = partition_msg(filename=filename)
+ assert str(elements[0]) == "Here is an email with plain text."
+
+
+def test_partition_msg_raises_with_missing_file():
+ filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "doesnt-exist.msg")
+ with pytest.raises(FileNotFoundError):
+ partition_msg(filename=filename)
+
+
+def test_partition_msg_from_file():
+ filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
+ with open(filename, "rb") as f:
+ elements = partition_msg(file=f)
+ assert elements == EXPECTED_MSG_OUTPUT
+
+
+def test_partition_msg_raises_with_both_specified():
+ filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
+ with open(filename, "rb") as f, pytest.raises(ValueError):
+ partition_msg(filename=filename, file=f)
+
+
+def test_partition_msg_raises_with_neither():
+ with pytest.raises(ValueError):
+ partition_msg()
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 2beec73a0d..111af2ce64 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.7" # pragma: no cover
+__version__ = "0.5.8-dev0" # pragma: no cover
diff --git a/unstructured/file_utils/file_conversion.py b/unstructured/file_utils/file_conversion.py
index 5ad671fe1b..a163a4a95a 100644
--- a/unstructured/file_utils/file_conversion.py
+++ b/unstructured/file_utils/file_conversion.py
@@ -9,7 +9,7 @@
def convert_file_to_text(filename: str, source_format: str, target_format: str) -> str:
"""Uses pandoc to convert the source document to a raw text string."""
try:
- text = pypandoc.convert_file(filename, "html", format="epub")
+        text = pypandoc.convert_file(filename, target_format, format=source_format)
except FileNotFoundError as err:
msg = (
"Error converting the file to text. Ensure you have the pandoc "
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
index 78aab783e5..8376b80e08 100644
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -39,6 +39,11 @@
"application/vnd.ms-powerpoint",
]
+MSG_MIME_TYPES = [
+ "application/vnd.ms-outlook",
+ "application/x-ole-storage",
+]
+
TXT_MIME_TYPES = [
"text/plain",
"message/rfc822", # ref: https://www.rfc-editor.org/rfc/rfc822
@@ -83,6 +88,7 @@ class FileType(Enum):
XLSX = 13
PPT = 14
PPTX = 15
+ MSG = 16
# Adobe Types
PDF = 20
@@ -150,6 +156,7 @@ def __lt__(self, other):
".rtf": FileType.RTF,
".json": FileType.JSON,
".epub": FileType.EPUB,
+ ".msg": FileType.MSG,
None: FileType.UNK,
}
@@ -205,6 +212,9 @@ def detect_filetype(
elif mime_type in DOC_MIME_TYPES:
return FileType.DOC
+ elif mime_type in MSG_MIME_TYPES:
+ return FileType.MSG
+
elif mime_type == "image/jpeg":
return FileType.JPG
@@ -255,7 +265,7 @@ def detect_filetype(
if file and not extension:
return _detect_filetype_from_octet_stream(file=file)
else:
- return EXT_TO_FILETYPE.get(extension)
+ return EXT_TO_FILETYPE.get(extension, FileType.UNK)
elif mime_type == "application/zip":
filetype = FileType.UNK
@@ -275,7 +285,7 @@ def detect_filetype(
f"The MIME type{f' of {filename!r}' if filename else ''} is {mime_type!r}. "
"This file type is not currently supported in unstructured.",
)
- return FileType.UNK
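+    # Fall back to detection by file extension when the MIME type is unsupported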
+ return EXT_TO_FILETYPE.get(extension, FileType.UNK)
def _detect_filetype_from_octet_stream(file: IO) -> FileType:
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
index f4dd3d4ea5..41966bbd68 100644
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@@ -9,6 +9,7 @@
from unstructured.partition.image import partition_image
from unstructured.partition.json import partition_json
from unstructured.partition.md import partition_md
+from unstructured.partition.msg import partition_msg
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx
@@ -64,6 +65,8 @@ def partition(
return partition_docx(filename=filename, file=file)
elif filetype == FileType.EML:
return partition_email(filename=filename, file=file, encoding=encoding)
+ elif filetype == FileType.MSG:
+ return partition_msg(filename=filename, file=file)
elif filetype == FileType.HTML:
return partition_html(
filename=filename,
diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py
new file mode 100644
index 0000000000..84046310ba
--- /dev/null
+++ b/unstructured/partition/msg.py
@@ -0,0 +1,41 @@
+import tempfile
+from typing import IO, List, Optional
+
+import msg_parser
+
+from unstructured.documents.elements import Element
+from unstructured.partition.common import exactly_one
+from unstructured.partition.html import partition_html
+from unstructured.partition.text import partition_text
+
+
+def partition_msg(
+ filename: Optional[str] = None,
+ file: Optional[IO] = None,
+) -> List[Element]:
+ """Partitions a MSFT Outlook .msg file
+
+ Parameters
+ ----------
+ filename
+ A string defining the target filename path.
+ file
+ A file-like object using "rb" mode --> open(filename, "rb").
+ """
+ exactly_one(filename=filename, file=file)
+
+ if filename is not None:
+ msg_obj = msg_parser.MsOxMessage(filename)
+ elif file is not None:
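+        # msg_parser expects a filename, so write the contents of the
+        # file-like object out to a temporary file before parsing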
+ tmp = tempfile.NamedTemporaryFile(delete=False)
+ tmp.write(file.read())
+ tmp.close()
+ msg_obj = msg_parser.MsOxMessage(tmp.name)
+
+ text = msg_obj.body
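+    # Route HTML bodies to the HTML partitioner; treat everything else
+    # as plain text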
+ if "" in text or "" in text:
+ elements = partition_html(text=text)
+ else:
+ elements = partition_text(text=text)
+
+ return elements