Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: remove processor tests that are in megaparse & remove specific U… #3501

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ markers = [
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
"base: these tests require quivr-core with extra `base` to be installed",
"tika: these tests require a tika server to be running",
"unstructured: these tests require `unstructured` dependency",
"megaparse: these tests require `megaparse` dependency",
]

[[tool.mypy.overrides]]
Expand Down
32 changes: 0 additions & 32 deletions core/quivr_core/processor/implementations/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,10 @@

import tiktoken
from langchain_community.document_loaders import (
BibtexLoader,
CSVLoader,
Docx2txtLoader,
NotebookLoader,
PythonLoader,
UnstructuredEPubLoader,
UnstructuredExcelLoader,
UnstructuredHTMLLoader,
UnstructuredMarkdownLoader,
UnstructuredODTLoader,
UnstructuredPDFLoader,
UnstructuredPowerPointLoader,
)
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.text import TextLoader
Expand Down Expand Up @@ -99,31 +91,7 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]:
DOCXProcessor = _build_processor(
"DOCXProcessor", Docx2txtLoader, [FileExtension.docx, FileExtension.doc]
)
XLSXProcessor = _build_processor(
"XLSXProcessor", UnstructuredExcelLoader, [FileExtension.xlsx, FileExtension.xls]
)
PPTProcessor = _build_processor(
"PPTProcessor", UnstructuredPowerPointLoader, [FileExtension.pptx]
)
MarkdownProcessor = _build_processor(
"MarkdownProcessor",
UnstructuredMarkdownLoader,
[FileExtension.md, FileExtension.mdx, FileExtension.markdown],
)
EpubProcessor = _build_processor(
"EpubProcessor", UnstructuredEPubLoader, [FileExtension.epub]
)
BibTexProcessor = _build_processor("BibTexProcessor", BibtexLoader, [FileExtension.bib])
ODTProcessor = _build_processor(
"ODTProcessor", UnstructuredODTLoader, [FileExtension.odt]
)
HTMLProcessor = _build_processor(
"HTMLProcessor", UnstructuredHTMLLoader, [FileExtension.html]
)
PythonProcessor = _build_processor("PythonProcessor", PythonLoader, [FileExtension.py])
NotebookProcessor = _build_processor(
"NotebookProcessor", NotebookLoader, [FileExtension.ipynb]
)
UnstructuredPDFProcessor = _build_processor(
"UnstructuredPDFProcessor", UnstructuredPDFLoader, [FileExtension.pdf]
)
Empty file.
33 changes: 0 additions & 33 deletions core/tests/processor/community/test_markdown_processor.py

This file was deleted.

Empty file.
Binary file removed core/tests/processor/docx/demo.docx
Binary file not shown.
33 changes: 0 additions & 33 deletions core/tests/processor/docx/test_docx.py

This file was deleted.

Empty file.
Binary file removed core/tests/processor/epub/page-blanche.epub
Binary file not shown.
Binary file removed core/tests/processor/epub/sway.epub
Binary file not shown.
51 changes: 0 additions & 51 deletions core/tests/processor/epub/test_epub_processor.py

This file was deleted.

Empty file.
1 change: 0 additions & 1 deletion core/tests/processor/odt/bad_odt.odt

This file was deleted.

Binary file removed core/tests/processor/odt/sample.odt
Binary file not shown.
42 changes: 0 additions & 42 deletions core/tests/processor/odt/test_odt.py

This file was deleted.

22 changes: 13 additions & 9 deletions core/tests/processor/pdf/test_unstructured_pdf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@
import pytest
from quivr_core.files.file import FileExtension, QuivrFile

unstructured = pytest.importorskip("unstructured")
megaparse = pytest.importorskip("megaparse")

all_but_pdf = list(filter(lambda ext: ext != ".pdf", list(FileExtension)))


@pytest.mark.unstructured
@pytest.mark.megaparse
@pytest.mark.asyncio
async def test_unstructured_pdf_processor():
from quivr_core.processor.implementations.default import UnstructuredPDFProcessor
async def test_megaparse_pdf_processor():
from quivr_core.processor.implementations.megaparse_processor import (
MegaparseProcessor,
)

p = Path("./tests/processor/pdf/sample.pdf")
f = QuivrFile(
Expand All @@ -23,16 +25,18 @@ async def test_unstructured_pdf_processor():
file_extension=FileExtension.pdf,
file_sha1="123",
)
processor = UnstructuredPDFProcessor()
processor = MegaparseProcessor()
result = await processor.process_file(f)
assert len(result) > 0


@pytest.mark.unstructured
@pytest.mark.megaparse
@pytest.mark.parametrize("ext", all_but_pdf)
@pytest.mark.asyncio
async def test_unstructured_pdf_processor_fail(ext):
from quivr_core.processor.implementations.default import UnstructuredPDFProcessor
async def test_megaparse_pdf_processor_fail(ext):
from quivr_core.processor.implementations.megaparse_processor import (
MegaparseProcessor,
)

p = Path("./tests/processor/pdf/sample.pdf")
f = QuivrFile(
Expand All @@ -43,6 +47,6 @@ async def test_unstructured_pdf_processor_fail(ext):
file_extension=ext,
file_sha1="123",
)
processor = UnstructuredPDFProcessor()
processor = MegaparseProcessor()
with pytest.raises(ValueError):
await processor.process_file(f)
2 changes: 1 addition & 1 deletion core/tests/processor/test_txt_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest
from quivr_core.storage.file import FileExtension, QuivrFile

unstructured = pytest.importorskip("unstructured")
megaparse = pytest.importorskip("megaparse")


@pytest.fixture
Expand Down
Loading