Skip to content

Commit

Permalink
feat: kms-migration (#3446)
Browse files — browse the repository at this point in the history
# Description

- Necessary changes for KMS v0.1
  • Loading branch information
AmineDiro authored Nov 1, 2024
1 parent 6415c75 commit 1356d87
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 36 deletions.
2 changes: 1 addition & 1 deletion core/quivr_core/files/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,9 @@ def __init__(
id: UUID,
original_filename: str,
path: Path,
brain_id: UUID,
file_sha1: str,
file_extension: FileExtension | str,
brain_id: UUID | None = None,
file_size: int | None = None,
metadata: dict[str, Any] | None = None,
) -> None:
Expand Down
20 changes: 9 additions & 11 deletions core/quivr_core/processor/implementations/megaparse_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import tiktoken
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
from megaparse import MegaParse

from quivr_core.config import MegaparseConfig
from quivr_core.files.file import QuivrFile
Expand Down Expand Up @@ -55,14 +56,11 @@ def processor_metadata(self):
}

async def process_file_inner(self, file: QuivrFile) -> list[Document]:
# mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
# document: Document = await mega_parse.aload()
# if len(document.page_content) > self.splitter_config.chunk_size:
# docs = self.text_splitter.split_documents([document])
# for doc in docs:
# # if "Production Fonts (maximum)" in doc.page_content:
# # print('Doc: ', doc.page_content)
# doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
# return docs
# return [document]
return []
mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore
document: Document = await mega_parse.aload()
if len(document.page_content) > self.splitter_config.chunk_size:
docs = self.text_splitter.split_documents([document])
for doc in docs:
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
return docs
return [document]
3 changes: 2 additions & 1 deletion core/quivr_core/processor/implementations/tika_processor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import os
from typing import AsyncIterable

import httpx
Expand Down Expand Up @@ -28,7 +29,7 @@ class TikaProcessor(ProcessorBase):

def __init__(
self,
tika_url: str = "http://localhost:9998/tika",
tika_url: str = os.getenv("TIKA_SERVER_URL", "http://localhost:9998/tika"),
splitter: TextSplitter | None = None,
splitter_config: SplitterConfig = SplitterConfig(),
timeout: float = 5.0,
Expand Down
2 changes: 0 additions & 2 deletions core/quivr_core/processor/processor_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from abc import ABC, abstractmethod
from importlib.metadata import PackageNotFoundError, version
from typing import Any
from uuid import uuid4

from langchain_core.documents import Document

Expand Down Expand Up @@ -43,7 +42,6 @@ async def process_file(self, file: QuivrFile) -> list[Document]:
"utf-8"
)
doc.metadata = {
"id": uuid4(),
"chunk_index": idx,
"quivr_core_version": qvr_version,
**file.metadata,
Expand Down
14 changes: 7 additions & 7 deletions core/quivr_core/processor/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,13 @@ def defaults_to_proc_entries(

# TODO(@aminediro): Megaparse should register itself
# Append Megaparse
_append_proc_mapping(
mapping=base_processors,
file_ext=FileExtension.pdf,
cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
priority=None,
)
# _append_proc_mapping(
# mapping=base_processors,
# file_ext=FileExtension.pdf,
# cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor",
# errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor",
# priority=None,
# )
return base_processors


Expand Down
5 changes: 3 additions & 2 deletions core/quivr_core/rag/entities/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,11 @@ class ChatMessage(BaseModelV1):


class KnowledgeStatus(str, Enum):
PROCESSING = "PROCESSING"
UPLOADED = "UPLOADED"
ERROR = "ERROR"
RESERVED = "RESERVED"
PROCESSING = "PROCESSING"
PROCESSED = "PROCESSED"
UPLOADED = "UPLOADED"


class Source(BaseModel):
Expand Down
21 changes: 9 additions & 12 deletions core/quivr_core/rag/quivr_rag_langgraph.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,38 @@
import asyncio
import logging
from typing import (
Annotated,
Any,
AsyncGenerator,
Dict,
List,
Optional,
Sequence,
Tuple,
TypedDict,
Dict,
Any,
Type,
TypedDict,
)
from uuid import uuid4
import asyncio

# TODO(@aminediro): this is the only dependency to langchain package, we should remove it
import openai
from langchain.retrievers import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.document_compressors import JinaRerank
from langchain_core.callbacks import Callbacks
from langchain_core.documents import BaseDocumentCompressor, Document
from langchain_core.messages import BaseMessage
from langchain_core.messages.ai import AIMessageChunk
from langchain_core.vectorstores import VectorStore
from langchain_core.prompts.base import BasePromptTemplate
from langgraph.graph import START, END, StateGraph
from langchain_core.vectorstores import VectorStore
from langgraph.graph import END, START, StateGraph
from langgraph.graph.message import add_messages
from langgraph.types import Send


from pydantic import BaseModel, Field
import openai

from quivr_core.rag.entities.chat import ChatHistory
from quivr_core.rag.entities.config import DefaultRerankers, NodeConfig, RetrievalConfig
from quivr_core.llm import LLMEndpoint
from quivr_core.llm_tools.llm_tools import LLMToolFactory
from quivr_core.rag.entities.chat import ChatHistory
from quivr_core.rag.entities.config import DefaultRerankers, NodeConfig, RetrievalConfig
from quivr_core.rag.entities.models import (
ParsedRAGChunkResponse,
QuivrKnowledge,
Expand Down

0 comments on commit 1356d87

Please sign in to comment.