microsoft · afourney · Mar 6, 2025 · Mar 3, 2025 · Mar 4, 2025 · Mar 4, 2025
diff --git a/.gitattributes b/.gitattributes
@@ -1 +1,2 @@
-tests/test_files/** linguist-vendored
+packages/markitdown/tests/test_files/** linguist-vendored
+packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored
diff --git a/README.md b/README.md
@@ -7,9 +7,11 @@
 > [!IMPORTANT]
 > Breaking changes between 0.0.1 to 0.0.2:
 > * Dependencies are now organized into optional feature-groups (further details below). Use `pip install markitdown[all]` to have backward-compatible behavior. 
+> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
 
-MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
-It supports:
+MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
+
+At present, MarkItDown supports:
 
 - PDF
 - PowerPoint
@@ -23,6 +25,17 @@ It supports:
 - Youtube URLs
 - ... and more!
 
+## Why Markdown?
+
+Markdown is extremely close to plain text, with minimal markup or formatting, but still
+provides a way to represent important document structure. Mainstream LLMs, such as
+OpenAI's GPT-4o, natively "_speak_" Markdown, and often incorporate Markdown into their
+responses unprompted. This suggests that they have been trained on vast amounts of
+Markdown-formatted text, and understand it well. As a side benefit, Markdown conventions
+are also highly token-efficient.
+
+## Installation
+
 To install MarkItDown, use pip: `pip install markitdown[all]`. Alternatively, you can install it from the source:
 
 ```bash

diff --git a/packages/markitdown-sample-plugin/README.md b/packages/markitdown-sample-plugin/README.md
@@ -10,23 +10,38 @@ This project shows how to create a sample plugin for MarkItDown. The most import
 Next, implement your custom DocumentConverter:
 
 ```python
-from typing import Union
-from markitdown import DocumentConverter, DocumentConverterResult
+from typing import BinaryIO, Any
+from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo
 
 class RtfConverter(DocumentConverter):
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not an RTF file 
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".rtf":
-            return None
-
-	# Implement the conversion logic here ...
-
-        # Return the result
-        return DocumentConverterResult(
-            title=title,
-            text_content=text_content,
-        )
+
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+
+        # Implement logic to check if the file stream is an RTF file
+        # ...
+        raise NotImplementedError()
+
+
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+
+        # Implement logic to convert the file stream to Markdown
+        # ...
+        raise NotImplementedError()
 ```
 
 Next, make sure your package implements and exports the following:
@@ -71,10 +86,10 @@ Once the plugin package is installed, verify that it is available to MarkItDown
 markitdown --list-plugins
 ```
 
-To use the plugin for a conversion use the `--use-plugins` flag. For example, to convert a PDF:
+To use the plugin for a conversion, use the `--use-plugins` flag. For example, to convert an RTF file:
 
 ```bash
-markitdown --use-plugins path-to-file.pdf
+markitdown --use-plugins path-to-file.rtf
 ```
 
 In Python, plugins can be enabled as follows:
@@ -83,7 +98,7 @@ In Python, plugins can be enabled as follows:
 from markitdown import MarkItDown
 
 md = MarkItDown(enable_plugins=True) 
-result = md.convert("path-to-file.pdf")
+result = md.convert("path-to-file.rtf")
 print(result.text_content)
 ```
 

diff --git a/packages/markitdown-sample-plugin/pyproject.toml b/packages/markitdown-sample-plugin/pyproject.toml
@@ -24,7 +24,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-  "markitdown",
+  "markitdown>=0.0.2a2",
   "striprtf",
 ]
 

diff --git a/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py b/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.1a2"
+__version__ = "0.0.1a3"
diff --git a/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py b/packages/markitdown-sample-plugin/src/markitdown_sample_plugin/_plugin.py
@@ -1,12 +1,26 @@
-from typing import Union
+import locale
+from typing import BinaryIO, Any
 from striprtf.striprtf import rtf_to_text
 
-from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult
+from markitdown import (
+    MarkItDown,
+    DocumentConverter,
+    DocumentConverterResult,
+    StreamInfo,
+)
+
 
 __plugin_interface_version__ = (
     1  # The version of the plugin interface that this plugin uses
 )
 
+ACCEPTED_MIME_TYPE_PREFIXES = [
+    "text/rtf",
+    "application/rtf",
+]
+
+ACCEPTED_FILE_EXTENSIONS = [".rtf"]
+
 
 def register_converters(markitdown: MarkItDown, **kwargs):
     """
@@ -22,18 +36,41 @@ class RtfConverter(DocumentConverter):
     Converts an RTF file to in the simplest possible way.
     """
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
-        # Bail if not a RTF
-        extension = kwargs.get("file_extension", "")
-        if extension.lower() != ".rtf":
-            return None
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
+    def accepts(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> bool:
+        mimetype = (stream_info.mimetype or "").lower()
+        extension = (stream_info.extension or "").lower()
+
+        if extension in ACCEPTED_FILE_EXTENSIONS:
+            return True
+
+        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
+            if mimetype.startswith(prefix):
+                return True
+
+        return False
 
-        # Read the RTF file
-        with open(local_path, "r") as f:
-            rtf = f.read()
+    def convert(
+        self,
+        file_stream: BinaryIO,
+        stream_info: StreamInfo,
+        **kwargs: Any,
+    ) -> DocumentConverterResult:
+        # Read the file stream into a str using the provided charset encoding, or using the system default
+        encoding = stream_info.charset or locale.getpreferredencoding()
+        stream_data = file_stream.read().decode(encoding)
 
         # Return the result
         return DocumentConverterResult(
             title=None,
-            text_content=rtf_to_text(rtf),
+            markdown=rtf_to_text(stream_data),
         )
diff --git a/packages/markitdown-sample-plugin/tests/test_sample_plugin.py b/packages/markitdown-sample-plugin/tests/test_sample_plugin.py
@@ -2,7 +2,7 @@
 import os
 import pytest
 
-from markitdown import MarkItDown
+from markitdown import MarkItDown, StreamInfo
 from markitdown_sample_plugin import RtfConverter
 
 TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
@@ -15,18 +15,22 @@
 
 def test_converter() -> None:
     """Tests the RTF converter dirctly."""
-    converter = RtfConverter()
-    result = converter.convert(
-        os.path.join(TEST_FILES_DIR, "test.rtf"), file_extension=".rtf"
-    )
+    with open(os.path.join(TEST_FILES_DIR, "test.rtf"), "rb") as file_stream:
+        converter = RtfConverter()
+        result = converter.convert(
+            file_stream=file_stream,
+            stream_info=StreamInfo(
+                mimetype="text/rtf", extension=".rtf", filename="test.rtf"
+            ),
+        )
 
-    for test_string in RTF_TEST_STRINGS:
-        assert test_string in result.text_content
+        for test_string in RTF_TEST_STRINGS:
+            assert test_string in result.text_content
 
 
 def test_markitdown() -> None:
     """Tests that MarkItDown correctly loads the plugin."""
-    md = MarkItDown()
+    md = MarkItDown(enable_plugins=True)
     result = md.convert(os.path.join(TEST_FILES_DIR, "test.rtf"))
 
     for test_string in RTF_TEST_STRINGS:

diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
@@ -26,7 +26,7 @@ classifiers = [
 dependencies = [
   "beautifulsoup4",
   "requests",
-  "markdownify~=0.14.1",
+  "markdownify",
   "puremagic",
   "pathvalidate",
   "charset-normalizer",
@@ -78,11 +78,14 @@ extra-dependencies = [
 ]
 
 [tool.hatch.envs.types]
+features = ["all"]
 extra-dependencies = [
+  "openai",
   "mypy>=1.0.0",
 ]
+
 [tool.hatch.envs.types.scripts]
-check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
+check = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/markitdown tests}"
 
 [tool.coverage.run]
 source_pkgs = ["markitdown", "tests"]

diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.0.2a1"
+__version__ = "0.0.2a2"
diff --git a/packages/markitdown/src/markitdown/__init__.py b/packages/markitdown/src/markitdown/__init__.py
@@ -3,15 +3,20 @@
 # SPDX-License-Identifier: MIT
 
 from .__about__ import __version__
-from ._markitdown import MarkItDown
+from ._markitdown import (
+    MarkItDown,
+    PRIORITY_SPECIFIC_FILE_FORMAT,
+    PRIORITY_GENERIC_FILE_FORMAT,
+)
+from ._base_converter import DocumentConverterResult, DocumentConverter
+from ._stream_info import StreamInfo
 from ._exceptions import (
     MarkItDownException,
     MissingDependencyException,
     FailedConversionAttempt,
     FileConversionException,
     UnsupportedFormatException,
 )
-from .converters import DocumentConverter, DocumentConverterResult
 
 __all__ = [
     "__version__",
@@ -23,4 +28,7 @@
     "FailedConversionAttempt",
     "FileConversionException",
     "UnsupportedFormatException",
+    "StreamInfo",
+    "PRIORITY_SPECIFIC_FILE_FORMAT",
+    "PRIORITY_GENERIC_FILE_FORMAT",
 ]