Skip to content
Merged
1 change: 0 additions & 1 deletion packages/markitdown-mcp/src/markitdown_mcp/__main__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import sys
from typing import Any
from mcp.server.fastmcp import FastMCP
from starlette.applications import Starlette
from mcp.server.sse import SseServerTransport
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python3 -m pytest
import os
import pytest

from markitdown import MarkItDown, StreamInfo
from markitdown_sample_plugin import RtfConverter
Expand Down
9 changes: 4 additions & 5 deletions packages/markitdown/src/markitdown/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import argparse
import sys
import codecs
import locale
from textwrap import dedent
from importlib.metadata import entry_points
from .__about__ import __version__
Expand Down Expand Up @@ -34,13 +33,13 @@ def main():
OR

markitdown < example.pdf

OR to save to a file use

markitdown example.pdf -o example.md

OR

markitdown example.pdf > example.md
"""
).strip(),
Expand Down
5 changes: 1 addition & 4 deletions packages/markitdown/src/markitdown/_base_converter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
import os
import tempfile
from warnings import warn
from typing import Any, Union, BinaryIO, Optional, List
from typing import Any, BinaryIO, Optional
from ._stream_info import StreamInfo


Expand Down
7 changes: 2 additions & 5 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
import copy
import mimetypes
import os
import re
import sys
import shutil
import tempfile
import warnings
import traceback
import io
from dataclasses import dataclass
Expand Down Expand Up @@ -547,7 +544,7 @@ def _convert(
# Sanity check -- make sure the cur_pos is still the same
assert (
cur_pos == file_stream.tell()
), f"File stream position should NOT change between guess iterations"
), "File stream position should NOT change between guess iterations"

_kwargs = {k: v for k, v in kwargs.items()}

Expand Down Expand Up @@ -614,7 +611,7 @@ def _convert(

# Nothing can handle it!
raise UnsupportedFormatException(
f"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
"Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
)

def register_page_converter(self, converter: DocumentConverter) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def do_fname(self, elm):
if FUNC.get(t):
latex_chars.append(FUNC[t])
else:
raise NotImplemented("Not support func %s" % t)
raise NotImplementedError("Not support func %s" % t)
else:
latex_chars.append(t)
t = BLANK.join(latex_chars)
Expand Down Expand Up @@ -316,7 +316,7 @@ def do_limlow(self, elm):
t_dict = self.process_children_dict(elm, include=("e", "lim"))
latex_s = LIM_FUNC.get(t_dict["e"])
if not latex_s:
raise NotImplemented("Not support lim %s" % t_dict["e"])
raise NotImplementedError("Not support lim %s" % t_dict["e"])
else:
return latex_s.format(lim=t_dict.get("lim"))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
updated_content = _pre_process_math(content)
# In the future, if there are more pre-processing steps, they can be added here
zip_output.writestr(name, updated_content)
except:
except Exception:
# If there is an error in processing the content, write the original content
zip_output.writestr(name, content)
else:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import io
from typing import Any, BinaryIO, Optional
from typing import Any, BinaryIO

from ._exiftool import exiftool_metadata
from ._transcribe_audio import transcribe_audio
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import io
import re
import base64
import binascii
from urllib.parse import parse_qs, urlparse
from typing import Any, BinaryIO, Optional
from typing import Any, BinaryIO
from bs4 import BeautifulSoup

from .._base_converter import DocumentConverter, DocumentConverterResult
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import sys
import csv
import io
from typing import BinaryIO, Any
from charset_normalizer import from_bytes
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo

Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import sys
import re
import os
from typing import BinaryIO, Any, List, Optional, Union
from typing import BinaryIO, Any, List
from enum import Enum

from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
from .._exceptions import MissingDependencyException

# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from ._html_converter import HtmlConverter
from ..converter_utils.docx.pre_process import pre_process_docx
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import BinaryIO, Any, Dict, List

from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._base_converter import DocumentConverterResult
from .._stream_info import StreamInfo

ACCEPTED_MIME_TYPE_PREFIXES = [
Expand Down
4 changes: 0 additions & 4 deletions packages/markitdown/src/markitdown/converters/_exiftool.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import json
import subprocess
import locale
import sys
import shutil
import os
import warnings
from typing import BinaryIO, Any, Union


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ def convert(
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Parse and convert the notebook
result = None

encoding = stream_info.charset or "utf-8"
notebook_content = file_stream.read().decode(encoding=encoding)
return self._convert(json.loads(notebook_content))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import BinaryIO, Any, Union
from typing import BinaryIO, Union
import base64
import mimetypes
from .._stream_info import StreamInfo
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from typing import BinaryIO, Any


from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# Save reporting of any exceptions for later
_dependency_exc_info = None
try:
import mammoth
import mammoth # noqa: F401
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import io
import re
import bs4
from typing import Any, BinaryIO, Optional
from typing import Any, BinaryIO

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
_xlsx_dependency_exc_info = None
try:
import pandas as pd
import openpyxl
import openpyxl # noqa: F401
except ImportError:
_xlsx_dependency_exc_info = sys.exc_info()

_xls_dependency_exc_info = None
try:
import pandas as pd
import xlrd
import pandas as pd # noqa: F811
import xlrd # noqa: F401
except ImportError:
_xls_dependency_exc_info = sys.exc_info()

Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import sys
import json
import time
import io
import re
import bs4
from typing import Any, BinaryIO, Optional, Dict, List, Union
from typing import Any, BinaryIO, Dict, List, Union
from urllib.parse import parse_qs, urlparse, unquote

from .._base_converter import DocumentConverter, DocumentConverterResult
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import sys
import zipfile
import io
import os
Expand Down
5 changes: 2 additions & 3 deletions packages/markitdown/tests/test_cli_misc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python3 -m pytest
import subprocess
import pytest
from markitdown import __version__

# This file contains CLI tests that are not directly tested by the FileTestVectors.
Expand All @@ -24,8 +23,8 @@ def test_invalid_flag() -> None:
assert result.returncode != 0, f"CLI exited with error: {result.stderr}"
assert (
"unrecognized arguments" in result.stderr
), f"Expected 'unrecognized arguments' to appear in STDERR"
assert "SYNTAX" in result.stderr, f"Expected 'SYNTAX' to appear in STDERR"
), "Expected 'unrecognized arguments' to appear in STDERR"
assert "SYNTAX" in result.stderr, "Expected 'SYNTAX' to appear in STDERR"


if __name__ == "__main__":
Expand Down
10 changes: 0 additions & 10 deletions packages/markitdown/tests/test_cli_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,6 @@
FileTestVector,
)

from markitdown import (
MarkItDown,
UnsupportedFormatException,
FileConversionException,
StreamInfo,
)

skip_remote = (
True if os.environ.get("GITHUB_ACTIONS") else False
) # Don't run these tests in CI
Expand Down Expand Up @@ -140,8 +133,6 @@ def test_convert_url(shared_tmp_dir, test_vector):
"""Test the CLI conversion of a file fetched from a remote URL."""
# Note: shared_tmp_dir is not used here, but is needed to match the signature

markitdown = MarkItDown()

time.sleep(1) # Ensure we don't hit rate limits
result = subprocess.run(
["python", "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename],
Expand Down Expand Up @@ -191,7 +182,6 @@ def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:


if __name__ == "__main__":
import sys
import tempfile

"""Runs this file's tests from the command line."""
Expand Down
3 changes: 0 additions & 3 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import os
import re
import shutil
import openai
import pytest

from markitdown._uri_utils import parse_data_uri, file_uri_to_path
Expand Down Expand Up @@ -253,8 +252,6 @@ def test_file_uris() -> None:


def test_docx_comments() -> None:
markitdown = MarkItDown()

# Test DOCX processing, with comments and setting style_map on init
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
result = markitdown_with_style_map.convert(
Expand Down
5 changes: 0 additions & 5 deletions packages/markitdown/tests/test_module_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import os
import time
import pytest
import codecs
import base64

from pathlib import Path
Expand All @@ -14,8 +13,6 @@

from markitdown import (
MarkItDown,
UnsupportedFormatException,
FileConversionException,
StreamInfo,
)

Expand Down Expand Up @@ -203,8 +200,6 @@ def test_convert_stream_keep_data_uris(test_vector):


if __name__ == "__main__":
import sys

"""Runs this file's tests from the command line."""

# General tests
Expand Down