Skip to content

Commit

Permalink
initial
Browse files Browse the repository at this point in the history
  • Loading branch information
root authored and root committed Feb 20, 2025
1 parent b354103 commit 2fa6b28
Show file tree
Hide file tree
Showing 8 changed files with 84 additions and 19 deletions.
Binary file modified .langgraph_api/.langgraph_checkpoint.1.pckl
Binary file not shown.
Binary file modified .langgraph_api/.langgraph_checkpoint.2.pckl
Binary file not shown.
Binary file modified .langgraph_api/.langgraph_ops.pckl
Binary file not shown.
Binary file modified .langgraph_api/.langgraph_retry_counter.pckl
Binary file not shown.
52 changes: 51 additions & 1 deletion src/retrieval_graph/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
"""

from typing import Any, Literal, TypedDict, cast
import json
from index_graph.configuration import IndexConfiguration
from shared import retrieval

from langchain_core.messages import BaseMessage
from langchain_core.runnables import RunnableConfig
from langgraph.graph import END, START, StateGraph

from retrieval_graph.configuration import AgentConfiguration
from retrieval_graph.researcher_graph.graph import graph as researcher_graph
from retrieval_graph.state import AgentState, InputState, Router
Expand All @@ -20,6 +22,8 @@
from retrieval_graph.tools import TOOLS
from langchain_core.messages import AIMessage
from langgraph.prebuilt import ToolNode
from shared.state import reduce_docs
from langchain_core.documents import Document


async def analyze_and_route_query(
Expand Down Expand Up @@ -249,6 +253,50 @@ def route_model_output(state: AgentState) -> Literal["__end__", "tools"]:
# Otherwise we execute the requested actions
return "tools"

def convert_to_documents(doc_string: str) -> list[Document]:
    """Convert a JSON-encoded list of document dicts into Document objects.

    Args:
        doc_string (str): A JSON string encoding a list of dicts, each with
            'url' and 'content' keys (e.g. serialized web-search results).

    Returns:
        list[Document]: One Document per entry, with `page_content` taken
            from 'content' and the 'url' recorded as the 'source' metadata.

    Raises:
        json.JSONDecodeError: If `doc_string` is not valid JSON.
        KeyError: If an entry lacks a 'content' or 'url' key.
    """
    doc_dicts = json.loads(doc_string)
    return [
        Document(page_content=entry["content"], metadata={"source": entry["url"]})
        for entry in doc_dicts
    ]


async def index_docs(
    state: AgentState, *, config: RunnableConfig
) -> dict[str, str]:
    """Index the web-search results carried in the conversation state.

    Parses the JSON search results stored in the third message of the state
    (``state.messages[2].content``) into Document objects and adds them to
    the configured retriever's index, then signals that the docs can be
    cleared from the state.

    NOTE(review): the hard-coded ``messages[2]`` index assumes the search
    results are always the third message in the conversation — confirm
    against the graph's message ordering before relying on this.

    Args:
        state (AgentState): Current agent state; ``messages[2].content`` must
            be a JSON string of dicts with 'url' and 'content' keys.
        config (RunnableConfig): Configuration used to build the retriever.

    Returns:
        dict[str, str]: ``{"docs": "delete"}``, signalling deletion of the
        docs from the state.

    Raises:
        ValueError: If no config is provided.
    """
    if not config:
        raise ValueError("Configuration required to run index_docs.")

    # The previously parsed IndexConfiguration was never used; the retriever
    # factory receives the raw RunnableConfig directly.
    with retrieval.make_retriever(config) as retriever:
        await retriever.aadd_documents(
            convert_to_documents(state.messages[2].content)
        )

    return {"docs": "delete"}


# Define the graph
builder = StateGraph(AgentState, input=InputState, config_schema=AgentConfiguration)
Expand All @@ -263,6 +311,7 @@ def route_model_output(state: AgentState) -> Literal["__end__", "tools"]:
"respond",
route_model_output,
)
builder.add_node(index_docs)

builder.add_edge(START, "analyze_and_route_query")
builder.add_conditional_edges("analyze_and_route_query", route_query)
Expand All @@ -271,6 +320,7 @@ def route_model_output(state: AgentState) -> Literal["__end__", "tools"]:
builder.add_edge("ask_for_more_info", END)
builder.add_edge("respond_to_general_query", END)
builder.add_edge("respond", END)
builder.add_edge("tools", "index_docs")
builder.add_edge("tools", "respond")

# Compile into a graph object that you can invoke and deploy.
Expand Down
36 changes: 23 additions & 13 deletions src/retrieval_graph/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,26 +46,36 @@
You do not need to specify where you want to research for all steps of the plan, but it's sometimes helpful."""

RESPONSE_SYSTEM_PROMPT = """\
You are an expert problem-solver, tasked with answering any question \
about any knowledge.
You are an expert information retriever responsible for generating a comprehensive and informative answer based solely on the provided search results (URL and content). Follow these strict guidelines:
If the provided <context> contains relevant information:
DO NOT search the internet.
Generate a precise and informative answer using only the <context>.
Maintain an unbiased and journalistic tone while combining search results into a coherent response.
Do NOT ramble. Adjust response length based on the question:
If the answer requires one sentence, keep it short.
If more detail is required (up to five paragraphs), provide the necessary depth.
Use source URLs as citations, placing them immediately after the referenced text (not all at the end).
If multiple sources discuss different entities under the same name, provide separate answers for each.
Use bullet points for clarity and readability, citing sources within the bullet points where applicable.If multiple sources discuss different entities under the same name, provide separate answers for each.
Use bullet points for clarity and readability, citing sources within the bullet points where applicable.
If the <context> does NOT contain relevant information:
Call the web_search_tool tool to find the necessary data. This tool searches the web for information. Use it if the context is not enough.
Generate an answer based on the retrieved web search results following the same formatting rules (concise, unbiased, cited properly).
If neither <context> nor web search provides a definitive answer:
Do NOT make up information.
Do NOT say you don't know or not sure or not enough information, use the web_search_tool tool to get information.
If applicable, ask the user for more details to refine the search.
Anything between the following `context` html blocks is retrieved from a knowledge \
bank, not part of the conversation with the user.
<context>
{context}
<context/>
If the provided search results are not relevant to the question, say that you will search the internet and do that\
If there is nothing in the context relevant to the question at hand, say that you will search the internet and do that.\
You do have access to the internet to search for information via tool call called web_search_tool , use it \
DO NOT prompt for more, just search the web\
"""

# Researcher graph
Expand Down
8 changes: 7 additions & 1 deletion src/retrieval_graph/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
These tools are intended as free examples to get started. For production use,
consider implementing more robust and specialized tools tailored to your needs.
"""
from index_graph.graph import graph as index_graph

from typing import Any, Callable, List, Optional, cast

Expand All @@ -28,7 +29,12 @@ async def web_search_tool(
configuration = Configuration.from_runnable_config(config)
wrapped = TavilySearchResults(max_results=10)
result = await wrapped.ainvoke({"query": query})
return cast(list[dict[str, Any]], result)
result = cast(list[dict[str, Any]], result)
print(f"search {result}")
# if result:
# # Send the search results to the index graph
# await index_graph.invoke({"documents": result})
return result


TOOLS: List[Callable[..., Any]] = [web_search_tool]
Expand Down
7 changes: 3 additions & 4 deletions src/shared/retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,7 @@ def make_elastic_retriever(
embedding=embedding_model,
)
# Debugging statements
print(f"search_kwargs: {configuration.search_kwargs}")
yield vstore.as_retriever(search_kwargs=configuration.search_kwargs)
yield vstore.as_retriever(search_kwargs={"key":"value"})


@contextmanager
Expand All @@ -83,7 +82,7 @@ def make_pinecone_retriever(
vstore = PineconeVectorStore.from_existing_index(
os.environ["PINECONE_INDEX_NAME"], embedding=embedding_model
)
yield vstore.as_retriever(search_kwargs=configuration.search_kwargs)
yield vstore.as_retriever(search_kwargs={"key":"value"})


@contextmanager
Expand All @@ -98,7 +97,7 @@ def make_mongodb_retriever(
namespace="langgraph_retrieval_agent.default",
embedding=embedding_model,
)
yield vstore.as_retriever(search_kwargs=configuration.search_kwargs)
yield vstore.as_retriever(search_kwargs={"key":"value"})


@contextmanager
Expand Down

0 comments on commit 2fa6b28

Please sign in to comment.