diff --git a/CHANGELOG.md b/CHANGELOG.md index c59655d9d1..1c0a97cd23 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,17 @@ +## 0.5.8-dev0 + +### Enhancements + +* `detect_filetype` now does a final fallback to file extension. + +### Features + +* Added `partition_msg` for processing MSFT Outlook .msg files. + +### Fixes + +* `convert_file_to_text` now passes through the `source_format` and `target_format` kwargs. + ## 0.5.7 ### Enhancements diff --git a/Makefile b/Makefile index c0b74c846d..9dd5f518bd 100644 --- a/Makefile +++ b/Makefile @@ -85,7 +85,7 @@ install-unstructured-inference: .PHONY: install-detectron2 install-detectron2: - pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2" + pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@e2ce8dc1ab097891395d324abaffe9cf298503d1#egg=detectron2" ## install-local-inference: installs requirements for local inference .PHONY: install-local-inference diff --git a/README.md b/README.md index 0e55738a28..7c60b8ec0e 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ Once pulled, you can create a container from this image and shell to it. docker run --platform linux/amd64 -d -t --name unstructured quay.io/unstructured-io/unstructured:latest # this will drop you into a bash shell where the Docker image is running -docker exec -it unstructured bash +docker exec -it unstructured bash ``` You can also build your own Docker image. @@ -167,7 +167,7 @@ you can also uninstall the hooks with `pre-commit uninstall`. You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below. The following examples show how to get started with the `unstructured` library. 
-You can parse **TXT**, **HTML**, **PDF**, **EML**, **EPUB**, **DOC**, **DOCX**, **PPT**, **PPTX**, **JPG**, +You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **EPUB**, **DOC**, **DOCX**, **PPT**, **PPTX**, **JPG**, and **PNG** documents with one line of code!

See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description @@ -182,7 +182,7 @@ If you are using the `partition` brick, you may need to install additional param instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection) `partition` will always apply the default arguments. If you need advanced features, use a document-specific brick. The `partition` brick currently works for -`.txt`, `.doc`, `.docx`, `.ppt`, `.pptx`, `.jpg`, `.png`, `.eml`, `.html`, and `.pdf` documents. +`.txt`, `.doc`, `.docx`, `.ppt`, `.pptx`, `.jpg`, `.png`, `.eml`, `.msg`, `.html`, and `.pdf` documents. ```python from unstructured.partition.auto import partition diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index a0faeb192c..38b4306fa1 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect file type and route it to the appropriate partitioning brick. All partitioning bricks called within ``partition`` are called using the default kwargs. Use the document-type specific bricks if you need to apply non-default settings. -``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.eml``, ``.epub``, ``.html``, ``.pdf``, +``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.epub``, ``.html``, ``.pdf``, ``.png``, ``.jpg``, and ``.txt`` files. If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``, ``.png``, and ``.jpg``. @@ -324,6 +324,21 @@ Examples: elements = partition_email(text=text, include_headers=True) +``partition_msg`` +----------------- + +The ``partition_msg`` function processes ``.msg`` files, which is a filetype specific +to email exports from Microsoft Outlook. + +Examples: + +.. 
code:: python + + from unstructured.partition.msg import partition_msg + + elements = partition_msg(filename="example-docs/fake-email.msg") + + ``partition_epub`` --------------------- diff --git a/docs/source/integrations.rst b/docs/source/integrations.rst index 37238b7275..3f01950a80 100644 --- a/docs/source/integrations.rst +++ b/docs/source/integrations.rst @@ -1,46 +1,46 @@ Integrations -====== -Integrate your model development pipeline with your favorite machine learning frameworks and libraries, -and prepare your data for ingestion into downstream systems. Most of our integrations come in the form of -`staging bricks `_, +============= +Integrate your model development pipeline with your favorite machine learning frameworks and libraries, +and prepare your data for ingestion into downstream systems. Most of our integrations come in the form of +`staging bricks `_, which take a list of ``Element`` objects as input and return formatted dictionaries as output. ``Integration with Argilla`` --------------- +---------------------------- You can convert a list of ``Text`` elements to an `Argilla `_ ``Dataset`` using the `stage_for_argilla `_ staging brick. Specify the type of dataset to be generated using the ``argilla_task`` parameter. Valid values are ``"text_classification"``, ``"token_classification"``, and ``"text2text"``. Follow the link for more details on usage. ``Integration with Datasaur`` --------------- +------------------------------ You can format a list of ``Text`` elements as input to token based tasks in `Datasaur `_ using the `stage_for_datasaur `_ staging brick. You will obtain a list of dictionaries indexed by the keys ``"text"`` with the content of the element, and ``"entities"`` with an empty list. Follow the link to learn how to customise your entities and for more details on usage. 
``Integration with Hugging Face`` --------------- -You can prepare ``Text`` elements for processing in Hugging Face `Transformers `_ -pipelines by splitting the elements into chunks that fit into the model's attention window using the `stage_for_transformers `_ staging brick. You can customise the transformation by defining -the ``buffer`` and ``window_size``, the ``split_function`` and the ``chunk_separator``. if you need to operate on +---------------------------------- +You can prepare ``Text`` elements for processing in Hugging Face `Transformers `_ +pipelines by splitting the elements into chunks that fit into the model's attention window using the `stage_for_transformers `_ staging brick. You can customise the transformation by defining +the ``buffer`` and ``window_size``, the ``split_function`` and the ``chunk_separator``. if you need to operate on text directly instead of ``unstructured`` ``Text`` objects, use the `chunk_by_attention_window `_ helper function. Follow the links for more details on usage. ``Integration with Labelbox`` --------------- +------------------------------ You can format your outputs for use with `LabelBox `_ using the `stage_for_label_box `_ staging brick. LabelBox accepts cloud-hosted data and does not support importing text directly. With this integration you can stage the data files in the ``output_directory`` to be uploaded to a cloud storage service (such as S3 buckets) and get a config of type ``List[Dict[str, Any]]`` that can be written to a ``.json`` file and imported into LabelBox. Follow the link to see how to generate the ``config.json`` file that can be used with LabelBox, how to upload the staged data files to an S3 bucket, and for more details on usage. ``Integration with Label Studio`` --------------- -You can format your outputs for upload to `Label Studio `_ using the `stage_for_label_studio `_ staging brick. 
After running ``stage_for_label_studio``, you can write the results -to a JSON folder that is ready to be included in a new Label Studio project. You can also include pre-annotations and predictions +---------------------------------- +You can format your outputs for upload to `Label Studio `_ using the `stage_for_label_studio `_ staging brick. After running ``stage_for_label_studio``, you can write the results +to a JSON folder that is ready to be included in a new Label Studio project. You can also include pre-annotations and predictions as part of your upload. Check our example notebook to format and upload the risk section from an SEC filing to Label Studio for a sentiment analysis labeling task `here `_ . Follow the link for more details on usage, and check `Label Studio docs `_ for a full list of options for labels and annotations. ``Integration with LangChain`` --------------- -Our integration with `LangChain `_ makes it incredibly easy to combine language models with your data, no matter what form it is in. The `Unstructured.io File Loader `_ extracts the text from a variety of unstructured text files using our ``unstructured`` library. It is designed to be used as a way to load data into `LlamaIndex `_ and/or subsequently used as a Tool in a LangChain Agent. See `here `_ for more `LlamaHub `_ examples. +-------------------------------- +Our integration with `LangChain `_ makes it incredibly easy to combine language models with your data, no matter what form it is in. The `Unstructured.io File Loader `_ extracts the text from a variety of unstructured text files using our ``unstructured`` library. It is designed to be used as a way to load data into `LlamaIndex `_ and/or subsequently used as a Tool in a LangChain Agent. See `here `_ for more `LlamaHub `_ examples. To use ``Unstructured.io File Loader`` you will need to have LlamaIndex 🦙 (GPT Index) installed in your environment. Just ``pip install llama-index`` and then pass in a ``Path`` to a local file. 
Optionally, you may specify split_documents if you want each element generated by ``unstructured`` to be placed in a separate document. Here is a simple example on how to use it: @@ -57,12 +57,12 @@ To use ``Unstructured.io File Loader`` you will need to have LlamaIndex 🦙 (GP ``Integration with Pandas`` --------------- -You can convert a list of ``Element`` objects to a Pandas dataframe with columns for +---------------------------- +You can convert a list of ``Element`` objects to a Pandas dataframe with columns for the text from each element and their types such as ``NarrativeText`` or ``Title`` using the `convert_to_dataframe `_ staging brick. Follow the link for more details on usage. ``Integration with Prodigy`` --------------- -You can format your JSON or CSV outputs for use with `Prodigy `_ using the `stage_for_prodigy `_ and `stage_csv_for_prodigy `_ staging bricks. After running ``stage_for_prodigy`` | +----------------------------- +You can format your JSON or CSV outputs for use with `Prodigy `_ using the `stage_for_prodigy `_ and `stage_csv_for_prodigy `_ staging bricks. After running ``stage_for_prodigy`` | ``stage_csv_for_prodigy``, you can write the results to a ``.json`` | ``.jsonl`` or a ``.csv`` file that is ready to be used with Prodigy. Follow the links for more details on usage. 
diff --git a/example-docs/fake-email.msg b/example-docs/fake-email.msg new file mode 100644 index 0000000000..61ec3c4bbe Binary files /dev/null and b/example-docs/fake-email.msg differ diff --git a/requirements/base.txt b/requirements/base.txt index a8f264fc7e..3fc0052f9d 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,6 +4,9 @@ # # pip-compile --output-file=requirements/base.txt # +--extra-index-url https://pypi.ngc.nvidia.com +--trusted-host pypi.ngc.nvidia.com + anyio==3.6.2 # via httpcore argilla==1.5.0 @@ -50,12 +53,16 @@ markdown==3.4.3 # via unstructured (setup.py) monotonic==1.6 # via argilla +msg-parser==1.2.0 + # via unstructured (setup.py) nltk==3.8.1 # via unstructured (setup.py) numpy==1.23.5 # via # argilla # pandas +olefile==0.46 + # via msg-parser openpyxl==3.1.2 # via unstructured (setup.py) packaging==23.0 diff --git a/setup.py b/setup.py index e57221beb9..5126841c31 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,7 @@ install_requires=[ "argilla", "lxml", + "msg_parser", "nltk", "openpyxl", "pandas", diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index be5e536b5e..60c08fa6ae 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -341,3 +341,17 @@ def test_auto_partition_epub_from_file(): elements = partition(file=f) assert len(elements) > 0 assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") + + +EXPECTED_MSG_OUTPUT = [ + NarrativeText(text="This is a test email to use for unit tests."), + Title(text="Important points:"), + ListItem(text="Roses are red"), + ListItem(text="Violets are blue"), +] + + +def test_auto_partition_msg_from_filename(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + elements = partition(filename=filename) + assert elements == EXPECTED_MSG_OUTPUT diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py new file 
mode 100644 index 0000000000..02658e18cb --- /dev/null +++ b/test_unstructured/partition/test_msg.py @@ -0,0 +1,60 @@ +import os +import pathlib + +import msg_parser +import pytest + +from unstructured.documents.elements import ListItem, NarrativeText, Title +from unstructured.partition.msg import partition_msg + +DIRECTORY = pathlib.Path(__file__).parent.resolve() +EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs") + +EXPECTED_MSG_OUTPUT = [ + NarrativeText(text="This is a test email to use for unit tests."), + Title(text="Important points:"), + ListItem(text="Roses are red"), + ListItem(text="Violets are blue"), +] + + +def test_partition_msg_from_filename(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + elements = partition_msg(filename=filename) + assert elements == EXPECTED_MSG_OUTPUT + + +class MockMsOxMessage: + def __init__(self, filename): + self.body = "Here is an email with plain text." + + +def test_partition_msg_from_filename_with_text_content(monkeypatch): + monkeypatch.setattr(msg_parser, "MsOxMessage", MockMsOxMessage) + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + elements = partition_msg(filename=filename) + assert str(elements[0]) == "Here is an email with plain text." 
+ + +def test_partition_msg_raises_with_missing_file(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "doesnt-exist.msg") + with pytest.raises(FileNotFoundError): + partition_msg(filename=filename) + + +def test_partition_msg_from_file(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + with open(filename, "rb") as f: + elements = partition_msg(file=f) + assert elements == EXPECTED_MSG_OUTPUT + + +def test_partition_msg_raises_with_both_specified(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") + with open(filename, "rb") as f, pytest.raises(ValueError): + partition_msg(filename=filename, file=f) + + +def test_partition_msg_raises_with_neither(): + with pytest.raises(ValueError): + partition_msg() diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2beec73a0d..111af2ce64 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.5.7" # pragma: no cover +__version__ = "0.5.8-dev0" # pragma: no cover diff --git a/unstructured/file_utils/file_conversion.py b/unstructured/file_utils/file_conversion.py index 5ad671fe1b..a163a4a95a 100644 --- a/unstructured/file_utils/file_conversion.py +++ b/unstructured/file_utils/file_conversion.py @@ -9,7 +9,7 @@ def convert_file_to_text(filename: str, source_format: str, target_format: str) -> str: """Uses pandoc to convert the source document to a raw text string.""" try: - text = pypandoc.convert_file(filename, "html", format="epub") + text = pypandoc.convert_file(filename, source_format, format=target_format) except FileNotFoundError as err: msg = ( "Error converting the file to text. 
Ensure you have the pandoc " diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 78aab783e5..8376b80e08 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -39,6 +39,11 @@ "application/vnd.ms-powerpoint", ] +MSG_MIME_TYPES = [ + "application/vnd.ms-outlook", + "application/x-ole-storage", +] + TXT_MIME_TYPES = [ "text/plain", "message/rfc822", # ref: https://www.rfc-editor.org/rfc/rfc822 @@ -83,6 +88,7 @@ class FileType(Enum): XLSX = 13 PPT = 14 PPTX = 15 + MSG = 16 # Adobe Types PDF = 20 @@ -150,6 +156,7 @@ def __lt__(self, other): ".rtf": FileType.RTF, ".json": FileType.JSON, ".epub": FileType.EPUB, + ".msg": FileType.MSG, None: FileType.UNK, } @@ -205,6 +212,9 @@ def detect_filetype( elif mime_type in DOC_MIME_TYPES: return FileType.DOC + elif mime_type in MSG_MIME_TYPES: + return FileType.MSG + elif mime_type == "image/jpeg": return FileType.JPG @@ -255,7 +265,7 @@ def detect_filetype( if file and not extension: return _detect_filetype_from_octet_stream(file=file) else: - return EXT_TO_FILETYPE.get(extension) + return EXT_TO_FILETYPE.get(extension, FileType.UNK) elif mime_type == "application/zip": filetype = FileType.UNK @@ -275,7 +285,7 @@ def detect_filetype( f"The MIME type{f' of {filename!r}' if filename else ''} is {mime_type!r}. 
" "This file type is not currently supported in unstructured.", ) - return FileType.UNK + return EXT_TO_FILETYPE.get(extension, FileType.UNK) def _detect_filetype_from_octet_stream(file: IO) -> FileType: diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index f4dd3d4ea5..41966bbd68 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -9,6 +9,7 @@ from unstructured.partition.image import partition_image from unstructured.partition.json import partition_json from unstructured.partition.md import partition_md +from unstructured.partition.msg import partition_msg from unstructured.partition.pdf import partition_pdf from unstructured.partition.ppt import partition_ppt from unstructured.partition.pptx import partition_pptx @@ -64,6 +65,8 @@ def partition( return partition_docx(filename=filename, file=file) elif filetype == FileType.EML: return partition_email(filename=filename, file=file, encoding=encoding) + elif filetype == FileType.MSG: + return partition_msg(filename=filename, file=file) elif filetype == FileType.HTML: return partition_html( filename=filename, diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py new file mode 100644 index 0000000000..84046310ba --- /dev/null +++ b/unstructured/partition/msg.py @@ -0,0 +1,41 @@ +import tempfile +from typing import IO, List, Optional + +import msg_parser + +from unstructured.documents.elements import Element +from unstructured.partition.common import exactly_one +from unstructured.partition.html import partition_html +from unstructured.partition.text import partition_text + + +def partition_msg( + filename: Optional[str] = None, + file: Optional[IO] = None, +) -> List[Element]: + """Partitions a MSFT Outlook .msg file + + Parameters + ---------- + filename + A string defining the target filename path. + file + A file-like object using "rb" mode --> open(filename, "rb"). 
+ """ + exactly_one(filename=filename, file=file) + + if filename is not None: + msg_obj = msg_parser.MsOxMessage(filename) + elif file is not None: + tmp = tempfile.NamedTemporaryFile(delete=False) + tmp.write(file.read()) + tmp.close() + msg_obj = msg_parser.MsOxMessage(tmp.name) + + text = msg_obj.body + if "<html>" in text or "</div>" in text: + elements = partition_html(text=text) + else: + elements = partition_text(text=text) + + return elements