Dev #537

Merged · 17 commits · Jul 12, 2024

17 changes: 15 additions & 2 deletions README.md
@@ -40,7 +40,7 @@ DIFFBOT_API_KEY="your-diffbot-key"

if you only want OpenAI:
```env
LLM_MODELS="gpt-3.5,gpt-4o"
LLM_MODELS="diffbot,openai-gpt-3.5,openai-gpt-4o"
OPENAI_API_KEY="your-openai-key"
```

@@ -70,6 +70,18 @@ GOOGLE_CLIENT_ID="xxxx"

You can of course combine all (local, youtube, wikipedia, s3 and gcs) or remove any you don't want/need.

### Chat Modes

By default, all of the chat modes are available: vector, graph+vector and graph.
If no mode is specified in the CHAT_MODES variable, all modes will be available:
```env
CHAT_MODES=""
```

If, however, you want to expose only specific modes (for example only vector, or vector and graph+vector), you can do that by specifying them in the env:
```env
CHAT_MODES="vector,graph+vector"
```
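
As a rough sketch (not code from this PR; the helper name and the fallback list are assumptions), the variable could be resolved like this on the application side:

```python
import os

# All chat modes the app knows about, per the README text above.
ALL_CHAT_MODES = ["vector", "graph+vector", "graph"]

def resolve_chat_modes() -> list[str]:
    """Return the chat modes to expose; an empty or unset CHAT_MODES means all of them."""
    raw = os.environ.get("CHAT_MODES", "").strip()
    if not raw:
        return ALL_CHAT_MODES
    return [mode.strip() for mode in raw.split(",") if mode.strip()]
```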

#### Running Backend and Frontend separately (dev environment)
Alternatively, you can run the backend and frontend separately:
@@ -134,7 +146,8 @@ Allow unauthenticated request : Yes
| BACKEND_API_URL | Optional | http://localhost:8000 | URL for backend API |
| BLOOM_URL | Optional | https://workspace-preview.neo4j.io/workspace/explore?connectURL={CONNECT_URL}&search=Show+me+a+graph&featureGenAISuggestions=true&featureGenAISuggestionsInternal=true | URL for Bloom visualization |
| REACT_APP_SOURCES | Optional | local,youtube,wiki,s3 | List of input sources that will be available |
| LLM_MODELS | Optional | diffbot,gpt-3.5,gpt-4o | Models available for selection on the frontend, used for entities extraction and Q&A Chatbot |
| LLM_MODELS | Optional | diffbot,openai-gpt-3.5,openai-gpt-4o | Models available for selection on the frontend, used for entities extraction and Q&A Chatbot |
| CHAT_MODES | Optional | vector,graph+vector,graph | Chat modes available for Q&A |
| ENV | Optional | DEV | Environment variable for the app |
| TIME_PER_CHUNK | Optional | 4 | Time per chunk for processing |
| CHUNK_SIZE | Optional | 5242880 | Size of each chunk of file for upload |
4 changes: 2 additions & 2 deletions backend/src/QA_integration_new.py
@@ -322,7 +322,7 @@ def QA_RAG(graph, model, question, document_names,session_id, mode):
if mode == "graph":
graph_chain, qa_llm,model_version = create_graph_chain(model,graph)
graph_response = get_graph_response(graph_chain,question)
ai_response = AIMessage(content=graph_response["response"])
ai_response = AIMessage(content=graph_response["response"]) if graph_response["response"] else AIMessage(content="Something went wrong")
messages.append(ai_response)
summarize_and_log(history, messages, qa_llm)

@@ -342,7 +342,7 @@ def QA_RAG(graph, model, question, document_names,session_id, mode):
elif mode == "vector":
retrieval_query = VECTOR_SEARCH_QUERY
else:
retrieval_query = VECTOR_GRAPH_SEARCH_QUERY
retrieval_query = VECTOR_GRAPH_SEARCH_QUERY.format(no_of_entites=VECTOR_GRAPH_SEARCH_ENTITY_LIMIT)

llm, doc_retriever, model_version = setup_chat(model, graph, session_id, document_names,retrieval_query)
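
The graph-mode change above guards against an empty graph response before it is appended to the chat history. A minimal standalone sketch of that pattern (the `to_ai_message` helper is hypothetical, not part of this PR):

```python
from langchain_core.messages import AIMessage

def to_ai_message(graph_response: dict) -> AIMessage:
    # Substitute a generic fallback when the graph chain returns nothing,
    # so an empty answer never reaches the conversation history.
    content = graph_response.get("response")
    return AIMessage(content=content) if content else AIMessage(content="Something went wrong")
```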

126 changes: 93 additions & 33 deletions backend/src/shared/constants.py
@@ -111,38 +111,102 @@
# """


# VECTOR_GRAPH_SEARCH_QUERY = """
# WITH node as chunk, score
# // find the document of the chunk
# MATCH (chunk)-[:PART_OF]->(d:Document)
# // fetch entities
# CALL { WITH chunk
# // entities connected to the chunk
# // todo only return entities that are actually in the chunk, remember we connect all extracted entities to all chunks
# MATCH (chunk)-[:HAS_ENTITY]->(e)

# // depending on match to query embedding either 1 or 2 step expansion
# WITH CASE WHEN true // vector.similarity.cosine($embedding, e.embedding ) <= 0.95
# THEN
# collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,1}(:!Chunk&!Document) RETURN path }
# ELSE
# collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document) RETURN path }
# END as paths

# RETURN collect{ unwind paths as p unwind relationships(p) as r return distinct r} as rels,
# collect{ unwind paths as p unwind nodes(p) as n return distinct n} as nodes
# }
# // aggregate chunk-details and de-duplicate nodes and relationships
# WITH d, collect(DISTINCT {chunk: chunk, score: score}) AS chunks, avg(score) as avg_score, apoc.coll.toSet(apoc.coll.flatten(collect(rels))) as rels,

# // TODO sort by relevancy (embeddding comparision?) cut off after X (e.g. 25) nodes?
# apoc.coll.toSet(apoc.coll.flatten(collect(
# [r in rels |[startNode(r),endNode(r)]]),true)) as nodes

# // generate metadata and text components for chunks, nodes and relationships
# WITH d, avg_score,
# [c IN chunks | c.chunk.text] AS texts,
# [c IN chunks | {id: c.chunk.id, score: c.score}] AS chunkdetails,
# apoc.coll.sort([n in nodes |

# coalesce(apoc.coll.removeAll(labels(n),['__Entity__'])[0],"") +":"+
# n.id + (case when n.description is not null then " ("+ n.description+")" else "" end)]) as nodeTexts,
# apoc.coll.sort([r in rels
# // optional filter if we limit the node-set
# // WHERE startNode(r) in nodes AND endNode(r) in nodes
# |
# coalesce(apoc.coll.removeAll(labels(startNode(r)),['__Entity__'])[0],"") +":"+
# startNode(r).id +
# " " + type(r) + " " +
# coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" +
# endNode(r).id
# ]) as relTexts

# // combine texts into response-text
# WITH d, avg_score,chunkdetails,
# "Text Content:\n" +
# apoc.text.join(texts,"\n----\n") +
# "\n----\nEntities:\n"+
# apoc.text.join(nodeTexts,"\n") +
# "\n----\nRelationships:\n"+
# apoc.text.join(relTexts,"\n")

# as text
# RETURN text, avg_score as score, {length:size(text), source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} AS metadata
# """

VECTOR_GRAPH_SEARCH_ENTITY_LIMIT = 25

VECTOR_GRAPH_SEARCH_QUERY = """
WITH node as chunk, score
// find the document of the chunk
MATCH (chunk)-[:PART_OF]->(d:Document)

// aggregate chunk-details
WITH d, collect(DISTINCT {{chunk: chunk, score: score}}) AS chunks, avg(score) as avg_score
// fetch entities
CALL { WITH chunk
CALL {{ WITH chunks
UNWIND chunks as chunkScore
WITH chunkScore.chunk as chunk
// entities connected to the chunk
// todo only return entities that are actually in the chunk, remember we connect all extracted entities to all chunks
MATCH (chunk)-[:HAS_ENTITY]->(e)

// todo sort by relevancy (embeddding comparision?) cut off after X (e.g. 25) nodes?
OPTIONAL MATCH (chunk)-[:HAS_ENTITY]->(e)
WITH e, count(*) as numChunks
ORDER BY numChunks DESC LIMIT {no_of_entites}
// depending on match to query embedding either 1 or 2 step expansion
WITH CASE WHEN true // vector.similarity.cosine($embedding, e.embedding ) <= 0.95
THEN
collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,1}(:!Chunk&!Document) RETURN path }
collect {{ OPTIONAL MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){{0,1}}(:!Chunk&!Document) RETURN path }}
ELSE
collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document) RETURN path }
END as paths

RETURN collect{ unwind paths as p unwind relationships(p) as r return distinct r} as rels,
collect{ unwind paths as p unwind nodes(p) as n return distinct n} as nodes
}
// aggregate chunk-details and de-duplicate nodes and relationships
WITH d, collect(DISTINCT {chunk: chunk, score: score}) AS chunks, avg(score) as avg_score, apoc.coll.toSet(apoc.coll.flatten(collect(rels))) as rels,

// TODO sort by relevancy (embeddding comparision?) cut off after X (e.g. 25) nodes?
apoc.coll.toSet(apoc.coll.flatten(collect(
[r in rels |[startNode(r),endNode(r)]]),true)) as nodes
collect {{ OPTIONAL MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){{0,2}}(:!Chunk&!Document) RETURN path }}
END as paths, e
WITH apoc.coll.toSet(apoc.coll.flatten(collect(distinct paths))) as paths, collect(distinct e) as entities
// de-duplicate nodes and relationships across chunks
RETURN collect{{ unwind paths as p unwind relationships(p) as r return distinct r}} as rels,
collect{{ unwind paths as p unwind nodes(p) as n return distinct n}} as nodes, entities
}}

// generate metadata and text components for chunks, nodes and relationships
WITH d, avg_score,
[c IN chunks | c.chunk.text] AS texts,
[c IN chunks | {id: c.chunk.id, score: c.score}] AS chunkdetails,
[c IN chunks | {{id: c.chunk.id, score: c.score}}] AS chunkdetails,
apoc.coll.sort([n in nodes |

coalesce(apoc.coll.removeAll(labels(n),['__Entity__'])[0],"") +":"+
@@ -154,24 +218,20 @@
coalesce(apoc.coll.removeAll(labels(startNode(r)),['__Entity__'])[0],"") +":"+
startNode(r).id +
" " + type(r) + " " +
coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" +
endNode(r).id
coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" + endNode(r).id
]) as relTexts

, entities
// combine texts into response-text
WITH d, avg_score,chunkdetails,
"Text Content:\n" +
apoc.text.join(texts,"\n----\n") +
"\n----\nEntities:\n"+
apoc.text.join(nodeTexts,"\n") +
"\n----\nRelationships:\n"+
apoc.text.join(relTexts,"\n")

as text
RETURN text, avg_score as score, {length:size(text), source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} AS metadata
"""



WITH d, avg_score,chunkdetails,
"Text Content:\\n" +
apoc.text.join(texts,"\\n----\\n") +
"\\n----\\nEntities:\\n"+
apoc.text.join(nodeTexts,"\\n") +
"\\n----\\nRelationships:\\n" +
apoc.text.join(relTexts,"\\n")

as text,entities

RETURN text, avg_score as score, {{length:size(text), source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails}} AS metadata
"""
3 changes: 2 additions & 1 deletion docker-compose.yml
@@ -51,13 +51,14 @@ services:
args:
- BACKEND_API_URL=${BACKEND_API_URL-http://localhost:8000}
- REACT_APP_SOURCES=${REACT_APP_SOURCES-local,youtube,wiki,s3}
- LLM_MODELS=${LLM_MODELS-diffbot,gpt-3.5,gpt-4o}
- LLM_MODELS=${LLM_MODELS-diffbot,openai-gpt-3.5,openai-gpt-4o}
- GOOGLE_CLIENT_ID=${GOOGLE_CLIENT_ID-""}
- BLOOM_URL=${BLOOM_URL-https://workspace-preview.neo4j.io/workspace/explore?connectURL={CONNECT_URL}&search=Show+me+a+graph&featureGenAISuggestions=true&featureGenAISuggestionsInternal=true}
- TIME_PER_CHUNK=${TIME_PER_CHUNK-4}
- TIME_PER_PAGE=${TIME_PER_PAGE-50}
- CHUNK_SIZE=${CHUNK_SIZE-5242880}
- ENV=${ENV-DEV}
- CHAT_MODES=${CHAT_MODES-""}
volumes:
- ./frontend:/app
- /app/node_modules
3 changes: 2 additions & 1 deletion example.env
@@ -28,9 +28,10 @@ ENTITY_EMBEDDING=True
BACKEND_API_URL="http://localhost:8000"
BLOOM_URL="https://workspace-preview.neo4j.io/workspace/explore?connectURL={CONNECT_URL}&search=Show+me+a+graph&featureGenAISuggestions=true&featureGenAISuggestionsInternal=true"
REACT_APP_SOURCES="local,youtube,wiki,s3,web"
LLM_MODELS="diffbot,gpt-3.5,gpt-4o" # ",ollama_llama3"
LLM_MODELS="diffbot,openai-gpt-3.5,openai-gpt-4o" # ",ollama_llama3"
ENV="DEV"
TIME_PER_CHUNK=4
TIME_PER_PAGE=50
CHUNK_SIZE=5242880
GOOGLE_CLIENT_ID=""
CHAT_MODES=""
2 changes: 1 addition & 1 deletion frontend/README.md
@@ -1,6 +1,6 @@
# Neo4j Knowledge Graph Builder

Reactjs Responsive app for building an knowledge graph using [Neo4j Needle](https://www.neo4j.design/).
Reactjs app for building a knowledge graph using [Neo4j Needle](https://www.neo4j.design/).

## Features
- 🚀 Responsive: Adapts to different screen sizes for optimal user experience.
1 change: 0 additions & 1 deletion frontend/src/assets/images/web-search-darkmode-final.svg

This file was deleted.
