backend/onyx/agents/agent_search/dr/sub_agents/basic_search/dr_basic_search_3_reduce.py

@@ -5,12 +5,12 @@

from onyx.agents.agent_search.dr.sub_agents.states import SubAgentMainState
from onyx.agents.agent_search.dr.sub_agents.states import SubAgentUpdate
-from onyx.agents.agent_search.dr.utils import chunks_or_sections_to_search_docs
from onyx.agents.agent_search.shared_graph_utils.utils import (
    get_langgraph_node_log_string,
)
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
from onyx.context.search.models import SavedSearchDoc
+from onyx.context.search.models import SearchDoc
@cubic-dev-local cubic-dev-local bot Sep 25, 2025

two-agent-filter: Group imports from the same module into a single statement to reduce duplication and improve readability.
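
As a sketch, the grouped form the comment asks for would look like this (illustrative only; the exact import list depends on what the module actually uses):

# Before: one import per name
# from onyx.context.search.models import SavedSearchDoc
# from onyx.context.search.models import SearchDoc
# After: a single grouped import
from onyx.context.search.models import SavedSearchDoc, SearchDoc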

DEV MODE: This violation would have been filtered out by screening filters. Failing filters: commentPurpose, functionalImpact, objectivity.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: Stylistic-only import grouping suggestion with no functional or maintainability impact; per criteria, filter out low-importance style issues.

Prompt for AI agents: Address the comment above on backend/onyx/agents/agent_search/dr/sub_agents/basic_search/dr_basic_search_3_reduce.py at line 13.

<file context>
@@ -5,12 +5,12 @@
 )
 from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
 from onyx.context.search.models import SavedSearchDoc
+from onyx.context.search.models import SearchDoc
 from onyx.server.query_and_chat.streaming_models import SectionEnd
 from onyx.utils.logger import setup_logger
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: General AI Review Agent

@cubic-dev-local cubic-dev-local bot Sep 25, 2025

single-agent-filter: Group imports from the same module into a single statement to reduce duplication and improve readability.

DEV MODE: This violation would have been filtered out by screening filters. Failing filters: commentPurpose, functionalImpact, objectivity.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: Technically correct but purely stylistic (import grouping). No functional, performance, or maintainability impact in this context; not worth reporting per criteria.

Prompt for AI agents: Address the comment above on backend/onyx/agents/agent_search/dr/sub_agents/basic_search/dr_basic_search_3_reduce.py at line 13.

<file context>
@@ -5,12 +5,12 @@
 )
 from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
 from onyx.context.search.models import SavedSearchDoc
+from onyx.context.search.models import SearchDoc
 from onyx.server.query_and_chat.streaming_models import SectionEnd
 from onyx.utils.logger import setup_logger
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: General AI Review Agent

@cubic-dev-local cubic-dev-local bot Sep 25, 2025

two-agent-filter: Duplicate imports from the same module on separate lines. Combine into a single import for clarity and style consistency.

DEV MODE: This violation would have been filtered out by screening filters. Failing filters: functionalImpact, objectivity.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: Purely stylistic duplicate import from same module; no functional or maintainability impact. Filter out per selective criteria.

Prompt for AI agents: Address the comment above on backend/onyx/agents/agent_search/dr/sub_agents/basic_search/dr_basic_search_3_reduce.py at line 13.

<file context>
@@ -5,12 +5,12 @@
 )
 from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
 from onyx.context.search.models import SavedSearchDoc
+from onyx.context.search.models import SearchDoc
 from onyx.server.query_and_chat.streaming_models import SectionEnd
 from onyx.utils.logger import setup_logger
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: General AI Review Agent

@cubic-dev-local cubic-dev-local bot Sep 25, 2025

single-agent-filter: Duplicate imports from the same module on separate lines. Combine into a single import for clarity and style consistency.

DEV MODE: This violation would have been filtered out by screening filters. Failing filters: functionalImpact, objectivity.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: Separate imports of different names from the same module are valid and common in Python. This is purely stylistic with no functional or maintainability impact; per guidelines, such low-impact style issues should be filtered out.

Prompt for AI agents: Address the comment above on backend/onyx/agents/agent_search/dr/sub_agents/basic_search/dr_basic_search_3_reduce.py at line 13.

<file context>
@@ -5,12 +5,12 @@
 )
 from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
 from onyx.context.search.models import SavedSearchDoc
+from onyx.context.search.models import SearchDoc
 from onyx.server.query_and_chat.streaming_models import SectionEnd
 from onyx.utils.logger import setup_logger
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: General AI Review Agent


from onyx.server.query_and_chat.streaming_models import SectionEnd
from onyx.utils.logger import setup_logger

@@ -47,7 +47,7 @@ def is_reducer(
doc_list.append(x)

# Convert InferenceSections to SavedSearchDocs
-search_docs = chunks_or_sections_to_search_docs(doc_list)
+search_docs = SearchDoc.chunks_or_sections_to_search_docs(doc_list)
retrieved_saved_search_docs = [
SavedSearchDoc.from_search_doc(search_doc, db_doc_id=0)
for search_doc in search_docs
4 changes: 2 additions & 2 deletions backend/onyx/agents/agent_search/dr/utils.py
@@ -13,7 +13,7 @@
)
from onyx.context.search.models import InferenceSection
from onyx.context.search.models import SavedSearchDoc
-from onyx.context.search.utils import chunks_or_sections_to_search_docs
+from onyx.context.search.models import SearchDoc
from onyx.tools.tool_implementations.web_search.web_search_tool import (
WebSearchTool,
)
@@ -266,7 +266,7 @@ def convert_inference_sections_to_search_docs(
is_internet: bool = False,
) -> list[SavedSearchDoc]:
# Convert InferenceSections to SavedSearchDocs
-    search_docs = chunks_or_sections_to_search_docs(inference_sections)
+    search_docs = SearchDoc.chunks_or_sections_to_search_docs(inference_sections)
@cubic-dev-local cubic-dev-local bot Sep 25, 2025

two-agent-filter: Conversion logic from InferenceSection to SavedSearchDoc with default db_doc_id=0 duplicates functionality in backend/onyx/agents/agent_search/dr/sub_agents/basic_search/dr_basic_search_3_reduce.py and backend/onyx/chat/chat_utils.py. This logic should be centralized into a shared utility function.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: Low-impact and partially inaccurate. Code already uses centralized classmethods (SearchDoc.chunks_or_sections_to_search_docs and SavedSearchDoc.from_search_doc). Remaining repetition is a trivial list comprehension with db_doc_id=0; not substantive duplication to justify a shared helper.
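
For what it's worth, the shared utility the comment asks for would be small — a sketch under the assumption that it simply wraps the two existing classmethods (the helper name is hypothetical):

from onyx.context.search.models import InferenceSection
from onyx.context.search.models import SavedSearchDoc
from onyx.context.search.models import SearchDoc


def inference_sections_to_saved_search_docs(
    sections: list[InferenceSection],
    db_doc_id: int = 0,
) -> list[SavedSearchDoc]:
    # Convert to SearchDocs, then wrap each one as a SavedSearchDoc
    search_docs = SearchDoc.chunks_or_sections_to_search_docs(sections)
    return [
        SavedSearchDoc.from_search_doc(doc, db_doc_id=db_doc_id)
        for doc in search_docs
    ]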

Prompt for AI agents: Address the comment above on backend/onyx/agents/agent_search/dr/utils.py at line 269.

<file context>
@@ -266,7 +266,7 @@ def convert_inference_sections_to_search_docs(
 ) -&gt; list[SavedSearchDoc]:
     # Convert InferenceSections to SavedSearchDocs
-    search_docs = chunks_or_sections_to_search_docs(inference_sections)
+    search_docs = SearchDoc.chunks_or_sections_to_search_docs(inference_sections)
     for search_doc in search_docs:
         search_doc.is_internet = is_internet
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Unmapped Agent in getAgentNameFromViolationSource (undefined)

@cubic-dev-local cubic-dev-local bot Sep 25, 2025

single-agent-filter: Conversion logic from InferenceSection to SavedSearchDoc with default db_doc_id=0 duplicates functionality in backend/onyx/agents/agent_search/dr/sub_agents/basic_search/dr_basic_search_3_reduce.py and backend/onyx/chat/chat_utils.py. This logic should be centralized into a shared utility function.

Libraries consulted:

Prompt for AI agents: Address the comment above on backend/onyx/agents/agent_search/dr/utils.py at line 269.

<file context>
@@ -266,7 +266,7 @@ def convert_inference_sections_to_search_docs(
 ) -&gt; list[SavedSearchDoc]:
     # Convert InferenceSections to SavedSearchDocs
-    search_docs = chunks_or_sections_to_search_docs(inference_sections)
+    search_docs = SearchDoc.chunks_or_sections_to_search_docs(inference_sections)
     for search_doc in search_docs:
         search_doc.is_internet = is_internet
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Unmapped Agent in getAgentNameFromViolationSource (undefined)


for search_doc in search_docs:
search_doc.is_internet = is_internet

2 changes: 1 addition & 1 deletion backend/onyx/agents/agent_search/orchestration/states.py
@@ -1,6 +1,6 @@
from pydantic import BaseModel

-from onyx.chat.prompt_builder.answer_prompt_builder import PromptSnapshot
+from onyx.chat.prompt_builder.schemas import PromptSnapshot
from onyx.tools.message import ToolCallSummary
from onyx.tools.models import SearchToolOverrideKwargs
from onyx.tools.models import ToolCallFinalResult
3 changes: 2 additions & 1 deletion backend/onyx/background/celery/tasks/docprocessing/tasks.py
@@ -86,7 +86,6 @@
from onyx.file_store.document_batch_storage import get_document_batch_storage
from onyx.httpx.httpx_pool import HttpxPool
from onyx.indexing.embedder import DefaultIndexingEmbedder
-from onyx.indexing.indexing_pipeline import run_indexing_pipeline
from onyx.natural_language_processing.search_nlp_models import EmbeddingModel
from onyx.natural_language_processing.search_nlp_models import (
InformationContentClassificationModel,
@@ -1268,6 +1267,8 @@ def _docprocessing_task(
tenant_id: str,
batch_num: int,
) -> None:
+    from onyx.indexing.indexing_pipeline import run_indexing_pipeline
@cubic-dev-local cubic-dev-local bot Sep 25, 2025

two-agent-filter: Heartbeat timeout is mutated per-loop and persists across attempts, delaying failure detection for others.

Libraries consulted:

Prompt for AI agents: Address the comment above on backend/onyx/background/celery/tasks/docprocessing/tasks.py at line 1270.

<file context>
@@ -1268,6 +1267,8 @@ def _docprocessing_task(
     tenant_id: str,
     batch_num: int,
 ) -&gt; None:
+    from onyx.indexing.indexing_pipeline import run_indexing_pipeline
+
     start_time = time.monotonic()
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent

@cubic-dev-local cubic-dev-local bot Sep 25, 2025

single-agent-filter: Heartbeat timeout is mutated per-loop and persists across attempts, delaying failure detection for others.

Libraries consulted:

Prompt for AI agents: Address the comment above on backend/onyx/background/celery/tasks/docprocessing/tasks.py at line 1270.

<file context>
@@ -1268,6 +1267,8 @@ def _docprocessing_task(
     tenant_id: str,
     batch_num: int,
 ) -&gt; None:
+    from onyx.indexing.indexing_pipeline import run_indexing_pipeline
+
     start_time = time.monotonic()
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent



start_time = time.monotonic()

if tenant_id:
6 changes: 4 additions & 2 deletions backend/onyx/background/indexing/run_docfetching.py
@@ -28,7 +28,6 @@
from onyx.connectors.connector_runner import ConnectorRunner
from onyx.connectors.exceptions import ConnectorValidationError
from onyx.connectors.exceptions import UnexpectedValidationError
-from onyx.connectors.factory import instantiate_connector
from onyx.connectors.interfaces import CheckpointedConnector
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import ConnectorStopSignal
@@ -66,7 +65,6 @@
from onyx.httpx.httpx_pool import HttpxPool
from onyx.indexing.embedder import DefaultIndexingEmbedder
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
-from onyx.indexing.indexing_pipeline import run_indexing_pipeline
from onyx.natural_language_processing.search_nlp_models import (
InformationContentClassificationModel,
)
@@ -100,6 +98,8 @@ def _get_connector_runner(
are the complete list of existing documents of the connector. If the task
of type LOAD_STATE, the list will be considered complete and otherwise incomplete.
"""
+    from onyx.connectors.factory import instantiate_connector
@cubic-dev-ai cubic-dev-ai bot Sep 24, 2025

Local import placed outside try in _get_connector_runner allows ImportError to escape and leaves index attempt stuck (not marked failed).
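
A minimal sketch of the fix implied here — the function body below is paraphrased, not the real one, but the point is that the lazy import moves inside the existing try so an ImportError takes the same failure path as any other connector-setup error:

from onyx.utils.logger import setup_logger

logger = setup_logger()


def _get_connector_runner(db_session, attempt):  # signature abbreviated
    task = attempt.connector_credential_pair.connector.input_type
    try:
        # Inside the try: an ImportError is now caught here, so the
        # attempt can be marked failed instead of the error escaping.
        from onyx.connectors.factory import instantiate_connector

        connector = instantiate_connector(db_session, attempt)  # illustrative call
    except Exception:
        logger.exception("Unable to instantiate connector.")
        raise
    return connector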

Prompt for AI agents: Address the comment above on backend/onyx/background/indexing/run_docfetching.py at line 101.

<file context>
@@ -100,6 +98,8 @@ def _get_connector_runner(
     are the complete list of existing documents of the connector. If the task
     of type LOAD_STATE, the list will be considered complete and otherwise incomplete.
     &quot;&quot;&quot;
+    from onyx.connectors.factory import instantiate_connector
+
     task = attempt.connector_credential_pair.connector.input_type
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent

@cubic-dev-local cubic-dev-local bot Sep 25, 2025

two-agent-filter: Early exceptions before try/except in connector_document_extraction cause stuck IN_PROGRESS attempts and MemoryTracer not stopped due to lazy import in _get_connector_runner.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: Technically valid but redundant; subsumed by the broader issue that connector_document_extraction’s outer try/finally starts too late. Addressing that covers this and other early exceptions.

Prompt for AI agents: Address the comment above on backend/onyx/background/indexing/run_docfetching.py at line 101.

<file context>
@@ -100,6 +98,8 @@ def _get_connector_runner(
     are the complete list of existing documents of the connector. If the task
     of type LOAD_STATE, the list will be considered complete and otherwise incomplete.
     &quot;&quot;&quot;
+    from onyx.connectors.factory import instantiate_connector
+
     task = attempt.connector_credential_pair.connector.input_type
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent

@cubic-dev-local cubic-dev-local bot Sep 25, 2025

single-agent-filter: Early exceptions before try/except in connector_document_extraction cause stuck IN_PROGRESS attempts and MemoryTracer not stopped due to lazy import in _get_connector_runner.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: Filter as overlapping with #1 and contains a minor inaccuracy: MemoryTracer starts after get_document_batch_storage, so a failure there wouldn’t leak the tracer. The core issue (early exceptions before the try/finally) is better captured by #1.

Prompt for AI agents: Address the comment above on backend/onyx/background/indexing/run_docfetching.py at line 101.

<file context>
@@ -100,6 +98,8 @@ def _get_connector_runner(
     are the complete list of existing documents of the connector. If the task
     of type LOAD_STATE, the list will be considered complete and otherwise incomplete.
     &quot;&quot;&quot;
+    from onyx.connectors.factory import instantiate_connector
+
     task = attempt.connector_credential_pair.connector.input_type
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent

@cubic-staging cubic-staging bot Sep 26, 2025

Lazy import placed outside error handling; ImportError would bypass the existing try/except, leading to unhandled failure instead of graceful connector pause/logging.

Prompt for AI agents: Address the comment above on backend/onyx/background/indexing/run_docfetching.py at line 101.

<file context>
@@ -100,6 +98,8 @@ def _get_connector_runner(
     are the complete list of existing documents of the connector. If the task
     of type LOAD_STATE, the list will be considered complete and otherwise incomplete.
     &quot;&quot;&quot;
+    from onyx.connectors.factory import instantiate_connector
+
     task = attempt.connector_credential_pair.connector.input_type
</file context>

[internal] Confidence score: 8/10

[internal] Posted by: General AI Review Agent

@cubic-staging cubic-staging bot Sep 26, 2025

Import of instantiate_connector outside try/except causes unhandled setup exceptions, leaving attempts stuck IN_PROGRESS and skipping CCPair pause.

Prompt for AI agents: Address the comment above on backend/onyx/background/indexing/run_docfetching.py at line 101.

<file context>
@@ -100,6 +98,8 @@ def _get_connector_runner(
     are the complete list of existing documents of the connector. If the task
     of type LOAD_STATE, the list will be considered complete and otherwise incomplete.
     &quot;&quot;&quot;
+    from onyx.connectors.factory import instantiate_connector
+
     task = attempt.connector_credential_pair.connector.input_type
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent

@cubic-staging cubic-staging bot Sep 26, 2025

Import inside _get_connector_runner occurs outside its try/except, so import-time errors bypass pause logic and, since caller’s try is later, leave attempt IN_PROGRESS.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: The import sits outside the local try/except, but the stated impact is wrong. The watchdog (docfetching_proxy_task/process_job_result) marks attempts FAILED on early errors, so attempts won’t be stuck IN_PROGRESS. Pausing on import errors (dependency/code issues) is also undesirable. Low impact; filter.

Prompt for AI agents: Address the comment above on backend/onyx/background/indexing/run_docfetching.py at line 101.

<file context>
@@ -100,6 +98,8 @@ def _get_connector_runner(
     are the complete list of existing documents of the connector. If the task
     of type LOAD_STATE, the list will be considered complete and otherwise incomplete.
     &quot;&quot;&quot;
+    from onyx.connectors.factory import instantiate_connector
+
     task = attempt.connector_credential_pair.connector.input_type
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent



task = attempt.connector_credential_pair.connector.input_type

try:
@@ -283,6 +283,8 @@ def _run_indexing(
2. Embed and index these documents into the chosen datastore (vespa)
3. Updates Postgres to record the indexed documents + the outcome of this run
"""
+    from onyx.indexing.indexing_pipeline import run_indexing_pipeline
@cubic-staging cubic-staging bot Sep 26, 2025

Import is outside the function’s exception handling; an ImportError would escape the try/except that manages indexing failures.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: Although the import precedes the try/except in _run_indexing, this function is explicitly marked as legacy/for comparison and has no call sites in this file. Impact is likely negligible; no MemoryTracer is started before the import. Given medium sensitivity and desire to avoid false positives, this is too low-impact/uncertain to report.

Prompt for AI agents: Address the comment above on backend/onyx/background/indexing/run_docfetching.py at line 286.

<file context>
@@ -283,6 +283,8 @@ def _run_indexing(
     2. Embed and index these documents into the chosen datastore (vespa)
     3. Updates Postgres to record the indexed documents + the outcome of this run
     &quot;&quot;&quot;
+    from onyx.indexing.indexing_pipeline import run_indexing_pipeline
+
     start_time = time.monotonic()  # jsut used for logging
</file context>

[internal] Confidence score: 8/10

[internal] Posted by: General AI Review Agent



start_time = time.monotonic() # jsut used for logging

with get_session_with_current_tenant() as db_session_temp:
5 changes: 0 additions & 5 deletions backend/onyx/chat/prompt_builder/answer_prompt_builder.py
@@ -4,7 +4,6 @@
from langchain_core.messages import BaseMessage
from langchain_core.messages import HumanMessage
from langchain_core.messages import SystemMessage
-from pydantic import BaseModel
from pydantic.v1 import BaseModel as BaseModel__v1

from onyx.chat.models import PromptConfig
@@ -196,10 +195,6 @@ def build(self) -> list[BaseMessage]:


# Stores some parts of a prompt builder as needed for tool calls
-class PromptSnapshot(BaseModel):
-    raw_message_history: list[PreviousMessage]
-    raw_user_query: str
-    built_prompt: list[BaseMessage]


# TODO: rename this? AnswerConfig maybe?
10 changes: 10 additions & 0 deletions backend/onyx/chat/prompt_builder/schemas.py
@@ -0,0 +1,10 @@
+from langchain_core.messages import BaseMessage
+from pydantic import BaseModel
+
+from onyx.llm.models import PreviousMessage
@cubic-staging cubic-staging bot Sep 26, 2025

Module-level import of onyx.llm.models triggers heavy LangChain/LLM dependencies at import time, undermining lazy-loading and increasing memory footprint. Use TYPE_CHECKING and forward references to avoid runtime import, or move this type to a lighter module.
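
A sketch of the TYPE_CHECKING pattern the comment suggests. One caveat worth noting: Pydantic resolves field annotations at runtime, so a TYPE_CHECKING-only import leaves the model "not fully defined" until model_rebuild() is called with the real type in scope — it defers the heavy import but adds a rebuild step (the helper below, including the underscore-prefixed _types_namespace argument, is an assumption-laden sketch, not the PR's code):

from __future__ import annotations

from typing import TYPE_CHECKING

from langchain_core.messages import BaseMessage
from pydantic import BaseModel

if TYPE_CHECKING:
    # Only imported for type checking; not loaded at runtime
    from onyx.llm.models import PreviousMessage


class PromptSnapshot(BaseModel):
    raw_message_history: list[PreviousMessage]
    raw_user_query: str
    built_prompt: list[BaseMessage]


def _rebuild_prompt_snapshot() -> None:
    # Deferred: perform the heavy import only when the model is first
    # needed, then resolve the forward reference.
    from onyx.llm.models import PreviousMessage

    PromptSnapshot.model_rebuild(
        _types_namespace={"PreviousMessage": PreviousMessage}
    )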

Prompt for AI agents: Address the comment above on backend/onyx/chat/prompt_builder/schemas.py at line 4.

<file context>
@@ -0,0 +1,10 @@
+from langchain_core.messages import BaseMessage
+from pydantic import BaseModel
+
+from onyx.llm.models import PreviousMessage
+
+
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: General AI Review Agent




+class PromptSnapshot(BaseModel):
`PromptSnapshot` includes LangChain `BaseMessage` instances but does not enable pydantic's `arbitrary_types_allowed`, risking validation/runtime errors when constructing the model. (Based on your team's feedback about LangChain using pydantic v1 and the need to allow arbitrary types when nesting LangChain models inside pydantic v2 models, as seen in existing patterns.)

Prompt for AI agents: Address the comment above on backend/onyx/chat/prompt_builder/schemas.py at line 7.

<file context>
@@ -0,0 +1,10 @@
+from onyx.llm.models import PreviousMessage
+
+
+class PromptSnapshot(BaseModel):
+    raw_message_history: list[PreviousMessage]
+    raw_user_query: str
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: General AI Review Agent

+    raw_message_history: list[PreviousMessage]
+    raw_user_query: str
+    built_prompt: list[BaseMessage]
@cubic-dev-local cubic-dev-local bot Sep 25, 2025

two-agent-filter: Pydantic v2 model includes list[BaseMessage] (external type) without enabling arbitrary_types_allowed; this can raise validation errors when instantiating PromptSnapshot with langchain messages.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: LangChain BaseMessage is a Pydantic model; list[BaseMessage] fields validate without arbitrary_types_allowed in Pydantic v2. No concrete evidence of instantiation errors, and similar config in other files likely targets non-Pydantic types. High risk of false positive.

Libraries consulted: Pydantic v2 arbitrary_types_allowed, LangChain BaseMessage messages, Pydantic, Langchain, Python_langchain

Prompt for AI agents: Address the comment above on backend/onyx/chat/prompt_builder/schemas.py at line 10.

<file context>
@@ -0,0 +1,10 @@
+class PromptSnapshot(BaseModel):
+    raw_message_history: list[PreviousMessage]
+    raw_user_query: str
+    built_prompt: list[BaseMessage]
</file context>

[internal] Confidence score: 8/10

[internal] Posted by: General AI Review Agent

@cubic-dev-local cubic-dev-local bot Sep 25, 2025

single-agent-filter: Pydantic v2 model includes list[BaseMessage] (external type) without enabling arbitrary_types_allowed; this can raise validation errors when instantiating PromptSnapshot with langchain messages.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: Uncertain. While the repo pins Pydantic v2 and uses arbitrary_types_allowed in several models, v2 generally supports arbitrary external types without explicit config, and documentation evidence here doesn’t conclusively show that a BaseModel field of list[BaseMessage] will fail validation. Without a reproducible error or usage context showing failure, this is too speculative to report.

Libraries consulted: Pydantic v2 arbitrary types allowed, Pydantic

Prompt for AI agents: Address the comment above on backend/onyx/chat/prompt_builder/schemas.py at line 10.

<file context>
@@ -0,0 +1,10 @@
+class PromptSnapshot(BaseModel):
+    raw_message_history: list[PreviousMessage]
+    raw_user_query: str
+    built_prompt: list[BaseMessage]
</file context>

[internal] Confidence score: 8/10

[internal] Posted by: General AI Review Agent

@cubic-staging cubic-staging bot Sep 26, 2025

Pydantic v2 model lacks arbitrary_types_allowed while using non-Pydantic type BaseMessage, causing schema/instantiation errors.
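
If that diagnosis held, the conventional fix is a one-line config, sketched below — though note that the GPT-5 filter verdicts elsewhere in this thread conclude BaseMessage is itself a Pydantic model in recent LangChain versions, which would make the setting unnecessary (it is harmless either way):

from langchain_core.messages import BaseMessage
from pydantic import BaseModel, ConfigDict

from onyx.llm.models import PreviousMessage


class PromptSnapshot(BaseModel):
    # Permit field types Pydantic cannot build a schema for
    model_config = ConfigDict(arbitrary_types_allowed=True)

    raw_message_history: list[PreviousMessage]
    raw_user_query: str
    built_prompt: list[BaseMessage]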

Prompt for AI agents: Address the comment above on backend/onyx/chat/prompt_builder/schemas.py at line 10.

<file context>
@@ -0,0 +1,10 @@
+class PromptSnapshot(BaseModel):
+    raw_message_history: list[PreviousMessage]
+    raw_user_query: str
+    built_prompt: list[BaseMessage]
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent

@cubic-staging cubic-staging bot Sep 26, 2025

Missing arbitrary_types_allowed for field using external type BaseMessage; PromptSnapshot will fail validation on instantiation.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: Likely false positive. LangChain’s BaseMessage and subclasses are Pydantic models in recent versions, so list[BaseMessage] validates without arbitrary_types_allowed. Other files enabling arbitrary types don’t prove it’s needed here, and no failing instantiation is shown. High risk of false positive; filter it out.

Libraries consulted: pydantic arbitrary_types_allowed v2, langchain_core BaseMessage, Pydantic, Langchain

Prompt for AI agents: Address the comment above on backend/onyx/chat/prompt_builder/schemas.py at line 10.

<file context>
@@ -0,0 +1,10 @@
+class PromptSnapshot(BaseModel):
+    raw_message_history: list[PreviousMessage]
+    raw_user_query: str
+    built_prompt: list[BaseMessage]
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent

@cubic-staging cubic-staging bot Sep 26, 2025

Pydantic model lacks arbitrary_types_allowed for list[BaseMessage], causing schema/validation error

Prompt for AI agents: Address the comment above on backend/onyx/chat/prompt_builder/schemas.py at line 10.

<file context>
@@ -0,0 +1,10 @@
+class PromptSnapshot(BaseModel):
+    raw_message_history: list[PreviousMessage]
+    raw_user_query: str
+    built_prompt: list[BaseMessage]
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent


2 changes: 1 addition & 1 deletion backend/onyx/chat/tool_handling/tool_response_handler.py
@@ -7,7 +7,7 @@
from onyx.chat.models import ResponsePart
from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder
from onyx.chat.prompt_builder.answer_prompt_builder import LLMCall
-from onyx.chat.prompt_builder.answer_prompt_builder import PromptSnapshot
+from onyx.chat.prompt_builder.schemas import PromptSnapshot
from onyx.llm.interfaces import LLM
from onyx.tools.force import ForceUseTool
from onyx.tools.message import build_tool_message
39 changes: 39 additions & 0 deletions backend/onyx/context/search/models.py
@@ -1,3 +1,4 @@
+from collections.abc import Sequence
from datetime import datetime
from typing import Any

@@ -355,6 +356,44 @@ class SearchDoc(BaseModel):
secondary_owners: list[str] | None = None
is_internet: bool = False

+    @classmethod
+    def chunks_or_sections_to_search_docs(
+        cls,
+        items: "Sequence[InferenceChunk | InferenceSection] | None",
@cubic-dev-ai cubic-dev-ai bot Sep 24, 2025

Invalid type annotation: mixing a string literal with | None will raise a TypeError at import. Use a real Sequence[...] | None type or quote the entire annotation.

Prompt for AI agents: Address the comment above on backend/onyx/context/search/models.py at line 362.

<file context>
@@ -355,6 +356,44 @@ class SearchDoc(BaseModel):
+    @classmethod
+    def chunks_or_sections_to_search_docs(
+        cls,
+        items: &quot;Sequence[InferenceChunk | InferenceSection] | None&quot;,
+    ) -&gt; list[&quot;SearchDoc&quot;]:
+        &quot;&quot;&quot;Convert a sequence of InferenceChunk or InferenceSection objects to SearchDoc objects.&quot;&quot;&quot;
</file context>

[internal] Confidence score: 10/10

[internal] Posted by: General AI Review Agent

Suggested change:
-        items: "Sequence[InferenceChunk | InferenceSection] | None",
+        items: Sequence[InferenceChunk | InferenceSection] | None,

@cubic-staging cubic-staging bot Sep 26, 2025

Stringified type annotation is unnecessary here; use a real annotation so the imported Sequence is actually used and annotations remain introspectable, preventing potential unused-import warnings.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: String annotations here are acceptable and consistent with the forward reference in the return type. No functional or maintainability issue is demonstrated; the unused-import warning concern is speculative. This is a stylistic nit, so it should be filtered out.

Prompt for AI agents: Address the comment above on backend/onyx/context/search/models.py at line 362.

<file context>
@@ -355,6 +356,44 @@ class SearchDoc(BaseModel):
+    @classmethod
+    def chunks_or_sections_to_search_docs(
+        cls,
+        items: &quot;Sequence[InferenceChunk | InferenceSection] | None&quot;,
+    ) -&gt; list[&quot;SearchDoc&quot;]:
+        &quot;&quot;&quot;Convert a sequence of InferenceChunk or InferenceSection objects to SearchDoc objects.&quot;&quot;&quot;
</file context>

[internal] Confidence score: 8/10

[internal] Posted by: General AI Review Agent

Suggested change:
-        items: "Sequence[InferenceChunk | InferenceSection] | None",
+        items: Sequence[InferenceChunk | InferenceSection] | None,

) -> list["SearchDoc"]:
"""Convert a sequence of InferenceChunk or InferenceSection objects to SearchDoc objects."""
if not items:
return []

search_docs = [
cls(
document_id=(
chunk := (
item.center_chunk
if isinstance(item, InferenceSection)
else item
)
).document_id,
chunk_ind=chunk.chunk_id,
semantic_identifier=chunk.semantic_identifier or "Unknown",
link=chunk.source_links[0] if chunk.source_links else None,
@cubic-dev-ai cubic-dev-ai bot Sep 24, 2025

Potential KeyError: source_links is a dict; accessing [0] assumes key 0 exists. Use .get(0) to safely retrieve the first link by that key or return None.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: Access pattern source_links[0] appears to be a project-wide invariant when source_links is set; low likelihood of KeyError and widely used elsewhere. Not worth reporting.
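
For reference, the defensive variants proposed across these comments look like this (a sketch; whether either is needed turns on the project-wide invariant that key 0 exists whenever source_links is non-empty):

# Option 1: keep the key-0 convention but fail soft
link = chunk.source_links.get(0) if chunk.source_links else None

# Option 2: take the first value regardless of its key
link = next(iter(chunk.source_links.values()), None) if chunk.source_links else None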

Prompt for AI agents: Address the comment above on backend/onyx/context/search/models.py at line 379.

<file context>
@@ -355,6 +356,44 @@ class SearchDoc(BaseModel):
+                ).document_id,
+                chunk_ind=chunk.chunk_id,
+                semantic_identifier=chunk.semantic_identifier or &quot;Unknown&quot;,
+                link=chunk.source_links[0] if chunk.source_links else None,
+                blurb=chunk.blurb,
+                source_type=chunk.source_type,
</file context>

[internal] Confidence score: 8/10

[internal] Posted by: General AI Review Agent

two-agent-filter: Directly accessing chunk.source_links[0] assumes key 0 exists on a dict[int, str]; if source_links is present but lacks key 0, this raises KeyError. Use a safe first-value retrieval or get(0). (Based on your team's feedback about consolidating conversion into SearchDoc to avoid brittle assumptions, this influenced checking for safer link extraction.)

Libraries consulted:

Prompt for AI agents: Address the comment above on backend/onyx/context/search/models.py at line 379.

<file context>
@@ -355,6 +356,44 @@ class SearchDoc(BaseModel):
+                ).document_id,
+                chunk_ind=chunk.chunk_id,
+                semantic_identifier=chunk.semantic_identifier or "Unknown",
+                link=chunk.source_links[0] if chunk.source_links else None,
+                blurb=chunk.blurb,
+                source_type=chunk.source_type,
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: General AI Review Agent

single-agent-filter: Directly accessing chunk.source_links[0] assumes key 0 exists on a dict[int, str]; if source_links is present but lacks key 0, this raises KeyError. Use a safe first-value retrieval or get(0). (Based on your team's feedback about consolidating conversion into SearchDoc to avoid brittle assumptions, this influenced checking for safer link extraction.)

Libraries consulted:

Prompt for AI agents: Address the comment above on backend/onyx/context/search/models.py at line 379.

<file context>
@@ -355,6 +356,44 @@ class SearchDoc(BaseModel):
+                ).document_id,
+                chunk_ind=chunk.chunk_id,
+                semantic_identifier=chunk.semantic_identifier or "Unknown",
+                link=chunk.source_links[0] if chunk.source_links else None,
+                blurb=chunk.blurb,
+                source_type=chunk.source_type,
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: General AI Review Agent

@cubic-dev-local cubic-dev-local bot Sep 25, 2025

two-agent-filter: Directly indexing source_links[0] can raise KeyError when the dict exists but lacks key 0; use .get(0) for safety.

Libraries consulted:

Prompt for AI agents: Address the comment above on backend/onyx/context/search/models.py at line 379.

<file context>
@@ -355,6 +356,44 @@ class SearchDoc(BaseModel):
+                ).document_id,
+                chunk_ind=chunk.chunk_id,
+                semantic_identifier=chunk.semantic_identifier or &quot;Unknown&quot;,
+                link=chunk.source_links[0] if chunk.source_links else None,
+                blurb=chunk.blurb,
+                source_type=chunk.source_type,
</file context>

[internal] Confidence score: 8/10

[internal] Posted by: General AI Review Agent

@cubic-dev-local cubic-dev-local bot Sep 25, 2025

single-agent-filter: Directly indexing source_links[0] can raise KeyError when the dict exists but lacks key 0; use .get(0) for safety.

Libraries consulted:

Prompt for AI agents: Address the comment above on backend/onyx/context/search/models.py at line 379.

<file context>
@@ -355,6 +356,44 @@ class SearchDoc(BaseModel):
+                ).document_id,
+                chunk_ind=chunk.chunk_id,
+                semantic_identifier=chunk.semantic_identifier or &quot;Unknown&quot;,
+                link=chunk.source_links[0] if chunk.source_links else None,
+                blurb=chunk.blurb,
+                source_type=chunk.source_type,
</file context>

[internal] Confidence score: 8/10

[internal] Posted by: General AI Review Agent

@cubic-dev-local cubic-dev-local bot Sep 25, 2025

two-agent-filter: Dict index by fixed key 0 can raise KeyError when source_links lacks key 0.

Libraries consulted:

Prompt for AI agents: Address the comment above on backend/onyx/context/search/models.py at line 379.

<file context>
@@ -355,6 +356,44 @@ class SearchDoc(BaseModel):
+                ).document_id,
+                chunk_ind=chunk.chunk_id,
+                semantic_identifier=chunk.semantic_identifier or &quot;Unknown&quot;,
+                link=chunk.source_links[0] if chunk.source_links else None,
+                blurb=chunk.blurb,
+                source_type=chunk.source_type,
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent

@cubic-dev-local cubic-dev-local bot Sep 25, 2025

single-agent-filter: Dict index by fixed key 0 can raise KeyError when source_links lacks key 0.

Libraries consulted:

Prompt for AI agents: Address the comment above on backend/onyx/context/search/models.py at line 379.

<file context>
@@ -355,6 +356,44 @@ class SearchDoc(BaseModel):
+                ).document_id,
+                chunk_ind=chunk.chunk_id,
+                semantic_identifier=chunk.semantic_identifier or &quot;Unknown&quot;,
+                link=chunk.source_links[0] if chunk.source_links else None,
+                blurb=chunk.blurb,
+                source_type=chunk.source_type,
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent

@cubic-staging cubic-staging bot Sep 26, 2025

Direct indexing chunk.source_links[0] can raise KeyError when dict lacks key 0, causing runtime failure.

Prompt for AI agents: Address the comment above on backend/onyx/context/search/models.py at line 379.

<file context>
@@ -355,6 +356,44 @@ class SearchDoc(BaseModel):
+                ).document_id,
+                chunk_ind=chunk.chunk_id,
+                semantic_identifier=chunk.semantic_identifier or &quot;Unknown&quot;,
+                link=chunk.source_links[0] if chunk.source_links else None,
+                blurb=chunk.blurb,
+                source_type=chunk.source_type,
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent

@cubic-staging cubic-staging bot Sep 26, 2025

KeyError risk: source_links is dict, code assumes key 0 exists and indexes like a list

Prompt for AI agents: Address the comment above on backend/onyx/context/search/models.py at line 379.

<file context>
@@ -355,6 +356,44 @@ class SearchDoc(BaseModel):
+                ).document_id,
+                chunk_ind=chunk.chunk_id,
+                semantic_identifier=chunk.semantic_identifier or &quot;Unknown&quot;,
+                link=chunk.source_links[0] if chunk.source_links else None,
+                blurb=chunk.blurb,
+                source_type=chunk.source_type,
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent

Fix with Cubic

Copy link

@cubic-staging cubic-staging bot Sep 26, 2025

Indexing source_links by key 0 can raise KeyError; source_links is a dict[int, str], not guaranteed to contain 0.

Prompt for AI agents
Address the following comment on backend/onyx/context/search/models.py at line 379:

<comment>Indexing source_links by key 0 can raise KeyError; source_links is a dict[int, str], not guaranteed to contain 0.</comment>

<file context>
@@ -355,6 +356,44 @@ class SearchDoc(BaseModel):
+                ).document_id,
+                chunk_ind=chunk.chunk_id,
+                semantic_identifier=chunk.semantic_identifier or "Unknown",
+                link=chunk.source_links[0] if chunk.source_links else None,
+                blurb=chunk.blurb,
+                source_type=chunk.source_type,
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent

Suggested change
-                link=chunk.source_links[0] if chunk.source_links else None,
+                link=(next(iter(chunk.source_links.values())) if chunk.source_links else None),
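For reference, here is a minimal sketch of a defensive accessor (a hypothetical helper, not part of this PR). It assumes source_links maps int character offsets to URLs and picks the link at the smallest offset:

def first_source_link(source_links: dict[int, str] | None) -> str | None:
    # No links recorded for this chunk.
    if not source_links:
        return None
    # Keys are offsets into the chunk text, so the smallest offset is the
    # natural "first" link; this avoids assuming that key 0 exists, which
    # is exactly what raises the KeyError flagged above.
    return source_links[min(source_links)]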

blurb=chunk.blurb,
source_type=chunk.source_type,
boost=chunk.boost,
hidden=chunk.hidden,
metadata=chunk.metadata,
score=chunk.score,
match_highlights=chunk.match_highlights,
updated_at=chunk.updated_at,
primary_owners=chunk.primary_owners,
secondary_owners=chunk.secondary_owners,
is_internet=False,
)
for item in items
]

return search_docs

def model_dump(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]: # type: ignore
initial_dict = super().model_dump(*args, **kwargs) # type: ignore
initial_dict["updated_at"] = (
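Assuming the new classmethod keeps the signature of the deleted utils helper (see the backend/onyx/context/search/utils.py removal below), call sites would migrate roughly as in this sketch, where items stands in for any sequence of chunks or sections:

from onyx.context.search.models import SearchDoc

# Before: free function in onyx.context.search.utils
# search_docs = chunks_or_sections_to_search_docs(items)

# After: classmethod on the model itself, no separate utils import needed
search_docs = SearchDoc.chunks_or_sections_to_search_docs(items)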
34 changes: 0 additions & 34 deletions backend/onyx/context/search/utils.py
@@ -118,40 +118,6 @@ def inference_section_from_chunks(
)


def chunks_or_sections_to_search_docs(
items: Sequence[InferenceChunk | InferenceSection] | None,
) -> list[SearchDoc]:
if not items:
return []

search_docs = [
SearchDoc(
document_id=(
chunk := (
item.center_chunk if isinstance(item, InferenceSection) else item
)
).document_id,
chunk_ind=chunk.chunk_id,
semantic_identifier=chunk.semantic_identifier or "Unknown",
link=chunk.source_links[0] if chunk.source_links else None,
blurb=chunk.blurb,
source_type=chunk.source_type,
boost=chunk.boost,
hidden=chunk.hidden,
metadata=chunk.metadata,
score=chunk.score,
match_highlights=chunk.match_highlights,
updated_at=chunk.updated_at,
primary_owners=chunk.primary_owners,
secondary_owners=chunk.secondary_owners,
is_internet=False,
)
for item in items
]

return search_docs


def remove_stop_words_and_punctuation(keywords: list[str]) -> list[str]:
try:
# Re-tokenize using the NLTK tokenizer for better matching
5 changes: 2 additions & 3 deletions backend/onyx/db/chat.py
@@ -34,7 +34,6 @@
from onyx.context.search.models import RetrievalDocs
from onyx.context.search.models import SavedSearchDoc
from onyx.context.search.models import SearchDoc as ServerSearchDoc
from onyx.context.search.utils import chunks_or_sections_to_search_docs
from onyx.db.models import AgentSearchMetrics
from onyx.db.models import AgentSubQuery
from onyx.db.models import AgentSubQuestion
@@ -57,7 +56,7 @@
from onyx.server.query_and_chat.models import ChatMessageDetail
from onyx.server.query_and_chat.models import SubQueryDetail
from onyx.server.query_and_chat.models import SubQuestionDetail
from onyx.tools.tool_runner import ToolCallFinalResult
from onyx.tools.models import ToolCallFinalResult
from onyx.utils.logger import setup_logger
from onyx.utils.special_types import JSON_ro

@@ -1147,7 +1146,7 @@ def log_agent_sub_question_results(
db_session.add(sub_query_object)
db_session.commit()

search_docs = chunks_or_sections_to_search_docs(
search_docs = ServerSearchDoc.chunks_or_sections_to_search_docs(
sub_query.retrieved_documents
)
for doc in search_docs:
26 changes: 20 additions & 6 deletions backend/onyx/file_processing/extract_file_text.py
@@ -15,14 +15,12 @@
from typing import Any
from typing import IO
from typing import NamedTuple
from typing import Optional
from typing import TYPE_CHECKING
from zipfile import BadZipFile

import chardet
import openpyxl
from markitdown import FileConversionException
from markitdown import MarkItDown
from markitdown import StreamInfo
from markitdown import UnsupportedFormatException
from PIL import Image
from pypdf import PdfReader
from pypdf.errors import PdfStreamError
@@ -37,6 +35,8 @@
from onyx.utils.file_types import WORD_PROCESSING_MIME_TYPE
from onyx.utils.logger import setup_logger

if TYPE_CHECKING:
from markitdown import MarkItDown

@cubic-dev-ai cubic-dev-ai bot Sep 24, 2025

Lazy import of markitdown lacks ImportError handling; docx/pptx processing will crash if the dependency is missing. Add a graceful fallback.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: markitdown is a declared dependency; ImportError is unlikely. Outer try/except in _extract_text_and_images prevents crashes; extract_file_text’s raising is intentional. Not high-impact.

Prompt for AI agents
Address the following comment on backend/onyx/file_processing/extract_file_text.py at line 39:

<comment>Lazy import of `markitdown` lacks ImportError handling; docx/pptx processing will crash if the dependency is missing. Add a graceful fallback.

        DEV MODE: This violation would have been filtered out by GPT-5.
Reasoning:
• **GPT-5**: markitdown is a declared dependency; ImportError is unlikely. Outer try/except in _extract_text_and_images prevents crashes; extract_file_text’s raising is intentional. Not high-impact.</comment>

<file context>
@@ -37,6 +35,8 @@
 from onyx.utils.logger import setup_logger
 
+if TYPE_CHECKING:
+    from markitdown import MarkItDown
 logger = setup_logger()
 
</file context>

[internal] Confidence score: 7/10

[internal] Posted by: General AI Review Agent


@cubic-staging cubic-staging bot Sep 26, 2025

Unhandled ImportError in get_markitdown_converter causes docx/pptx processing to raise RuntimeError via extract_file_text instead of graceful fallback.

Prompt for AI agents
Address the following comment on backend/onyx/file_processing/extract_file_text.py at line 39:

<comment>Unhandled ImportError in get_markitdown_converter causes docx/pptx processing to raise RuntimeError via extract_file_text instead of graceful fallback.</comment>

<file context>
@@ -37,6 +35,8 @@
 from onyx.utils.logger import setup_logger
 
+if TYPE_CHECKING:
+    from markitdown import MarkItDown
 logger = setup_logger()
 
</file context>

[internal] Confidence score: 9/10

[internal] Posted by: Functional Bugs Agent

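A sketch of the graceful fallback these comments ask for. It is hedged: it changes the function's contract so callers must handle None, and the name and log message are illustrative rather than part of this PR:

def get_markitdown_converter_or_none() -> Optional["MarkItDown"]:
    global _MARKITDOWN_CONVERTER
    try:
        from markitdown import MarkItDown
    except ImportError:
        # Dependency missing: let the docx/pptx paths fall back to
        # plain-text extraction instead of crashing.
        logger.warning("markitdown is not installed; skipping rich conversion")
        return None
    if _MARKITDOWN_CONVERTER is None:
        _MARKITDOWN_CONVERTER = MarkItDown(enable_plugins=False)
    return _MARKITDOWN_CONVERTER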

logger = setup_logger()

# NOTE(rkuo): Unify this with upload_files_for_chat and file_valiation.py
@@ -85,16 +85,18 @@
"image/webp",
]

_MARKITDOWN_CONVERTER: MarkItDown | None = None
_MARKITDOWN_CONVERTER: Optional["MarkItDown"] = None

KNOWN_OPENPYXL_BUGS = [
"Value must be either numerical or a string containing a wildcard",
"File contains no valid workbook part",
]


def get_markitdown_converter() -> MarkItDown:
def get_markitdown_converter() -> "MarkItDown":
global _MARKITDOWN_CONVERTER
from markitdown import MarkItDown

if _MARKITDOWN_CONVERTER is None:
_MARKITDOWN_CONVERTER = MarkItDown(enable_plugins=False)
return _MARKITDOWN_CONVERTER
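The hunk above lands on a common pattern: quote the annotation, import the real class only under TYPE_CHECKING, and defer the runtime import to first call. A generic sketch of that pattern, where heavy_library and HeavyClass are hypothetical names:

from typing import Optional, TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; costs nothing at runtime.
    from heavy_library import HeavyClass

_INSTANCE: Optional["HeavyClass"] = None

def get_instance() -> "HeavyClass":
    global _INSTANCE
    from heavy_library import HeavyClass  # deferred until first use
    if _INSTANCE is None:
        _INSTANCE = HeavyClass()
    return _INSTANCE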
@@ -358,6 +360,12 @@ def docx_to_text_and_images(
The images list returned is empty in this case.
"""
md = get_markitdown_converter()
from markitdown import (

@cubic-dev-ai cubic-dev-ai bot Sep 24, 2025

Runtime import of markitdown exceptions/classes lacks ImportError handling; docx extraction will error instead of falling back.

    DEV MODE: This violation would have been filtered out by GPT-5.

Reasoning:
GPT-5: Lazy import in docx_to_text_and_images would be caught by outer handler; with required dependency, ImportError concerns aren’t actionable.

Prompt for AI agents
Address the following comment on backend/onyx/file_processing/extract_file_text.py at line 363:

<comment>Runtime import of `markitdown` exceptions/classes lacks ImportError handling; docx extraction will error instead of falling back.

        DEV MODE: This violation would have been filtered out by GPT-5.
Reasoning:
• **GPT-5**: Lazy import in docx_to_text_and_images would be caught by outer handler; with required dependency, ImportError concerns aren’t actionable.</comment>

<file context>
@@ -358,6 +360,12 @@ def docx_to_text_and_images(
     The images list returned is empty in this case.
    """
     md = get_markitdown_converter()
+    from markitdown import (
+        StreamInfo,
+        FileConversionException,
</file context>

[internal] Confidence score: 7/10

[internal] Posted by: General AI Review Agent


StreamInfo,
FileConversionException,
UnsupportedFormatException,
)

try:
doc = md.convert(
to_bytesio(file), stream_info=StreamInfo(mimetype=WORD_PROCESSING_MIME_TYPE)
@@ -394,6 +402,12 @@ def docx_to_text_and_images(

def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
md = get_markitdown_converter()
from markitdown import (
StreamInfo,
FileConversionException,
UnsupportedFormatException,
)

stream_info = StreamInfo(
mimetype=PRESENTATION_MIME_TYPE, filename=file_name or None, extension=".pptx"
)
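The pptx body is collapsed past this point, but given the imports above it presumably mirrors the docx path. A hedged sketch of that shape, assuming markitdown's convert result exposes text_content and that degrading to an empty string is the desired fallback:

def pptx_to_text_sketch(file: IO[Any], file_name: str = "") -> str:
    md = get_markitdown_converter()
    stream_info = StreamInfo(
        mimetype=PRESENTATION_MIME_TYPE, filename=file_name or None, extension=".pptx"
    )
    try:
        doc = md.convert(to_bytesio(file), stream_info=stream_info)
        return doc.text_content
    except (FileConversionException, UnsupportedFormatException) as e:
        # Degrade instead of crashing on unsupported or corrupt files.
        logger.warning(f"markitdown failed to convert pptx {file_name}: {e}")
        return ""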