Skip to content

Commit

Permalink
fix: added chunk_size in tika processor (#3466)
Browse files (browse the repository at this point in the history)
  • Loading branch information
AmineDiro authored Nov 9, 2024
1 parent 190d971 commit 063bbd3
Showing 1 changed file with 4 additions and 0 deletions.
4 changes: 4 additions & 0 deletions core/quivr_core/processor/implementations/tika_processor.py
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
import tiktoken
import logging
import os
from typing import AsyncIterable
Expand Down Expand Up @@ -39,6 +40,7 @@ def __init__(
self.max_retries = max_retries
self._client = httpx.AsyncClient(timeout=timeout)

self.enc = tiktoken.get_encoding("cl100k_base")
self.splitter_config = splitter_config

if splitter:
Expand Down Expand Up @@ -73,5 +75,7 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]:
txt = await self._send_parse_tika(f)
document = Document(page_content=txt)
docs = self.text_splitter.split_documents([document])
for doc in docs:
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}

return docs

0 comments on commit 063bbd3

Please sign in to comment.