Skip to content

Commit

Permalink
Fix/add_megaparse_lib_ (#3476)
Browse files Browse the repository at this point in the history
  • Loading branch information
chloedia authored Nov 14, 2024
1 parent da97b2c commit 175a1cd
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 23 deletions.
2 changes: 1 addition & 1 deletion core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ dependencies = [
"transformers[sentencepiece]>=4.44.2",
"faiss-cpu>=1.8.0.post1",
"rapidfuzz>=3.10.1",
"megaparse-sdk>=0.1.2",
"markupsafe>=2.1.5",
"megaparse[all]== 0.0.43",
]
readme = "README.md"
requires-python = ">= 3.11"
Expand Down
59 changes: 37 additions & 22 deletions core/quivr_core/processor/implementations/megaparse_processor.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import logging
import os

import tiktoken
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
from megaparse_sdk import MegaParseSDK
from megaparse.core.megaparse import MegaParse
from megaparse.core.parser.unstructured_parser import UnstructuredParser

from quivr_core.config import MegaparseConfig
from quivr_core.files.file import QuivrFile
Expand Down Expand Up @@ -74,26 +74,41 @@ def processor_metadata(self):
}

async def process_file_inner(self, file: QuivrFile) -> list[Document]:
# Parse `file` with MegaParse and return the parsed text as one or more
# LangChain `Document` objects.
#
# NOTE(review): this span looks like a rendered diff without +/- markers, so
# two implementations appear interleaved: an SDK/upload variant and a local
# parser variant. Confirm against the commit before treating it as runnable.
#
# --- SDK/upload variant ---
# NOTE(review): os.getenv returns None when MEGAPARSE_API_KEY is unset, and
# str(None) is the literal string "None", so a missing key is not detected here.
api_key = str(os.getenv("MEGAPARSE_API_KEY"))
megaparse = MegaParseSDK(api_key)
logger.info(f"Uploading file {file.path} to MegaParse")
# Parsing options copied field by field from the processor configuration;
# they are forwarded as keyword arguments to the upload call below.
data = {
"method": self.megaparse_config.method,
"strategy": self.megaparse_config.strategy,
"check_table": self.megaparse_config.check_table,
"parsing_instruction": self.megaparse_config.parsing_instruction,
"model_name": self.megaparse_config.model_name,
}
response = await megaparse.file.upload(
file_path=str(file.path),
**data,
)
# --- Local parser variant ---
# The parser receives whatever self.megaparse_config.model_dump() returns as
# keyword arguments (presumably every field of a pydantic model; verify that
# UnstructuredParser accepts all of them).
parser = UnstructuredParser(**self.megaparse_config.model_dump())
megaparse = MegaParse(parser)
response = await megaparse.aload(file.path)
# Logs the complete `response` value at INFO level.
# NOTE(review): this is the same value used as page_content below, so the log
# line may be as large as the parsed document; confirm this is intended.
logger.info(f"File : {response}")
document = Document(
# The two page_content lines belong to different variants: one reads the
# "result" entry of the response, the other uses the response directly.
page_content=response["result"],
page_content=response,
)
# Split only when len(response) exceeds the configured chunk size; otherwise
# the single unsplit document is returned without chunk_size metadata.
# NOTE(review): in the variant that reads response["result"], len(response)
# measures the response object rather than the text; confirm intent.
if len(response) > self.splitter_config.chunk_size:
docs = self.text_splitter.split_documents([document])
for doc in docs:
# Replaces the chunk's metadata dict with a single "chunk_size" entry holding
# the length of self.enc.encode(...) (presumably a tiktoken token count).
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
return docs
return [document]

# Unconditional variant: always split, and tag every chunk with its encoded
# length, again replacing any metadata the splitter attached.
docs = self.text_splitter.split_documents([document])
for doc in docs:
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
return docs

# async def process_file_inner(self, file: QuivrFile) -> list[Document]:
# api_key = str(os.getenv("MEGAPARSE_API_KEY"))
# megaparse = MegaParseSDK(api_key)
# logger.info(f"Uploading file {file.path} to MegaParse")
# data = {
# "method": self.megaparse_config.method,
# "strategy": self.megaparse_config.strategy,
# "check_table": self.megaparse_config.check_table,
# "parsing_instruction": self.megaparse_config.parsing_instruction,
# "model_name": self.megaparse_config.model_name,
# }
# response = await megaparse.file.upload(
# file_path=str(file.path),
# **data,
# )
# document = Document(
# page_content=response["result"],
# )
# if len(response) > self.splitter_config.chunk_size:
# docs = self.text_splitter.split_documents([document])
# for doc in docs:
# doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
# return docs
# return [document]

0 comments on commit 175a1cd

Please sign in to comment.