
added new retrieval query #533

Merged: 1 commit, merged on Jul 11, 2024
4 changes: 2 additions & 2 deletions backend/src/QA_integration_new.py
@@ -322,7 +322,7 @@ def QA_RAG(graph, model, question, document_names,session_id, mode):
if mode == "graph":
    graph_chain, qa_llm,model_version = create_graph_chain(model,graph)
    graph_response = get_graph_response(graph_chain,question)
-   ai_response = AIMessage(content=graph_response["response"])
+   ai_response = AIMessage(content=graph_response["response"]) if graph_response["response"] else AIMessage(content="Something went wrong")
    messages.append(ai_response)
    summarize_and_log(history, messages, qa_llm)
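Note: the graph-mode change above adds a fallback so that an empty or missing LLM answer never lands in the chat history as an empty AIMessage. A minimal standalone sketch of the same guard (the helper function is illustrative, not part of the PR; it assumes the langchain_core AIMessage class used by this backend):

    from langchain_core.messages import AIMessage

    def to_ai_message(graph_response: dict) -> AIMessage:
        # Falsy content (None or "") falls back to a placeholder message
        # instead of appending an empty AIMessage to the history.
        content = graph_response.get("response")
        return AIMessage(content=content) if content else AIMessage(content="Something went wrong")

    print(to_ai_message({"response": "Found 3 related documents"}).content)  # Found 3 related documents
    print(to_ai_message({"response": ""}).content)                           # Something went wrong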

@@ -342,7 +342,7 @@ def QA_RAG(graph, model, question, document_names,session_id, mode):
elif mode == "vector":
    retrieval_query = VECTOR_SEARCH_QUERY
else:
-   retrieval_query = VECTOR_GRAPH_SEARCH_QUERY
+   retrieval_query = VECTOR_GRAPH_SEARCH_QUERY.format(no_of_entites=VECTOR_GRAPH_SEARCH_ENTITY_LIMIT)

llm, doc_retriever, model_version = setup_chat(model, graph, session_id, document_names,retrieval_query)
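Note: the vector-graph branch now injects the entity cap into the Cypher template at call time via str.format. That is why the rewritten VECTOR_GRAPH_SEARCH_QUERY in constants.py doubles every literal brace ({{ and }}): only the {no_of_entites} placeholder should be substituted. A small sketch of the mechanism on a toy template (not the full query):

    VECTOR_GRAPH_SEARCH_ENTITY_LIMIT = 25  # value introduced in constants.py by this PR

    # Doubled braces survive .format() as literal Cypher braces; the
    # placeholder keeps the project's "no_of_entites" spelling.
    template = "CALL {{ MATCH (c)-[:HAS_ENTITY]->(e) RETURN e LIMIT {no_of_entites} }}"
    print(template.format(no_of_entites=VECTOR_GRAPH_SEARCH_ENTITY_LIMIT))
    # CALL { MATCH (c)-[:HAS_ENTITY]->(e) RETURN e LIMIT 25 }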

126 changes: 93 additions & 33 deletions backend/src/shared/constants.py
@@ -111,38 +111,102 @@
# """


# VECTOR_GRAPH_SEARCH_QUERY = """
# WITH node as chunk, score
# // find the document of the chunk
# MATCH (chunk)-[:PART_OF]->(d:Document)
# // fetch entities
# CALL { WITH chunk
# // entities connected to the chunk
# // todo only return entities that are actually in the chunk, remember we connect all extracted entities to all chunks
# MATCH (chunk)-[:HAS_ENTITY]->(e)

# // depending on match to query embedding either 1 or 2 step expansion
# WITH CASE WHEN true // vector.similarity.cosine($embedding, e.embedding ) <= 0.95
# THEN
# collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,1}(:!Chunk&!Document) RETURN path }
# ELSE
# collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document) RETURN path }
# END as paths

# RETURN collect{ unwind paths as p unwind relationships(p) as r return distinct r} as rels,
# collect{ unwind paths as p unwind nodes(p) as n return distinct n} as nodes
# }
# // aggregate chunk-details and de-duplicate nodes and relationships
# WITH d, collect(DISTINCT {chunk: chunk, score: score}) AS chunks, avg(score) as avg_score, apoc.coll.toSet(apoc.coll.flatten(collect(rels))) as rels,

# // TODO sort by relevancy (embeddding comparision?) cut off after X (e.g. 25) nodes?
# apoc.coll.toSet(apoc.coll.flatten(collect(
# [r in rels |[startNode(r),endNode(r)]]),true)) as nodes

# // generate metadata and text components for chunks, nodes and relationships
# WITH d, avg_score,
# [c IN chunks | c.chunk.text] AS texts,
# [c IN chunks | {id: c.chunk.id, score: c.score}] AS chunkdetails,
# apoc.coll.sort([n in nodes |

# coalesce(apoc.coll.removeAll(labels(n),['__Entity__'])[0],"") +":"+
# n.id + (case when n.description is not null then " ("+ n.description+")" else "" end)]) as nodeTexts,
# apoc.coll.sort([r in rels
# // optional filter if we limit the node-set
# // WHERE startNode(r) in nodes AND endNode(r) in nodes
# |
# coalesce(apoc.coll.removeAll(labels(startNode(r)),['__Entity__'])[0],"") +":"+
# startNode(r).id +
# " " + type(r) + " " +
# coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" +
# endNode(r).id
# ]) as relTexts

# // combine texts into response-text
# WITH d, avg_score,chunkdetails,
# "Text Content:\n" +
# apoc.text.join(texts,"\n----\n") +
# "\n----\nEntities:\n"+
# apoc.text.join(nodeTexts,"\n") +
# "\n----\nRelationships:\n"+
# apoc.text.join(relTexts,"\n")

# as text
# RETURN text, avg_score as score, {length:size(text), source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} AS metadata
# """

+VECTOR_GRAPH_SEARCH_ENTITY_LIMIT = 25

VECTOR_GRAPH_SEARCH_QUERY = """
WITH node as chunk, score
// find the document of the chunk
MATCH (chunk)-[:PART_OF]->(d:Document)

+// aggregate chunk-details
+WITH d, collect(DISTINCT {{chunk: chunk, score: score}}) AS chunks, avg(score) as avg_score
// fetch entities
-CALL { WITH chunk
+CALL {{ WITH chunks
+UNWIND chunks as chunkScore
+WITH chunkScore.chunk as chunk
// entities connected to the chunk
// todo only return entities that are actually in the chunk, remember we connect all extracted entities to all chunks
-MATCH (chunk)-[:HAS_ENTITY]->(e)

+// todo sort by relevancy (embeddding comparision?) cut off after X (e.g. 25) nodes?
+OPTIONAL MATCH (chunk)-[:HAS_ENTITY]->(e)
+WITH e, count(*) as numChunks
+ORDER BY numChunks DESC LIMIT {no_of_entites}
// depending on match to query embedding either 1 or 2 step expansion
WITH CASE WHEN true // vector.similarity.cosine($embedding, e.embedding ) <= 0.95
THEN
-collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,1}(:!Chunk&!Document) RETURN path }
+collect {{ OPTIONAL MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){{0,1}}(:!Chunk&!Document) RETURN path }}
ELSE
-collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document) RETURN path }
-END as paths
-
-RETURN collect{ unwind paths as p unwind relationships(p) as r return distinct r} as rels,
-collect{ unwind paths as p unwind nodes(p) as n return distinct n} as nodes
-}
-// aggregate chunk-details and de-duplicate nodes and relationships
-WITH d, collect(DISTINCT {chunk: chunk, score: score}) AS chunks, avg(score) as avg_score, apoc.coll.toSet(apoc.coll.flatten(collect(rels))) as rels,
-
-// TODO sort by relevancy (embeddding comparision?) cut off after X (e.g. 25) nodes?
-apoc.coll.toSet(apoc.coll.flatten(collect(
-[r in rels |[startNode(r),endNode(r)]]),true)) as nodes
+collect {{ OPTIONAL MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){{0,2}}(:!Chunk&!Document) RETURN path }}
+END as paths, e
+WITH apoc.coll.toSet(apoc.coll.flatten(collect(distinct paths))) as paths, collect(distinct e) as entities
+// de-duplicate nodes and relationships across chunks
+RETURN collect{{ unwind paths as p unwind relationships(p) as r return distinct r}} as rels,
+collect{{ unwind paths as p unwind nodes(p) as n return distinct n}} as nodes, entities
+}}

// generate metadata and text components for chunks, nodes and relationships
WITH d, avg_score,
[c IN chunks | c.chunk.text] AS texts,
-[c IN chunks | {id: c.chunk.id, score: c.score}] AS chunkdetails,
+[c IN chunks | {{id: c.chunk.id, score: c.score}}] AS chunkdetails,
apoc.coll.sort([n in nodes |

coalesce(apoc.coll.removeAll(labels(n),['__Entity__'])[0],"") +":"+
@@ -154,24 +218,20 @@
coalesce(apoc.coll.removeAll(labels(startNode(r)),['__Entity__'])[0],"") +":"+
startNode(r).id +
" " + type(r) + " " +
-coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" +
-endNode(r).id
+coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" + endNode(r).id
]) as relTexts

+, entities
// combine texts into response-text
-WITH d, avg_score,chunkdetails,
-"Text Content:\n" +
-apoc.text.join(texts,"\n----\n") +
-"\n----\nEntities:\n"+
-apoc.text.join(nodeTexts,"\n") +
-"\n----\nRelationships:\n"+
-apoc.text.join(relTexts,"\n")
-
-as text
-RETURN text, avg_score as score, {length:size(text), source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} AS metadata
-"""
+WITH d, avg_score,chunkdetails,
+"Text Content:\\n" +
+apoc.text.join(texts,"\\n----\\n") +
+"\\n----\\nEntities:\\n"+
+apoc.text.join(nodeTexts,"\\n") +
+"\\n----\\nRelationships:\\n" +
+apoc.text.join(relTexts,"\\n")
+
+as text,entities
+
+RETURN text, avg_score as score, {{length:size(text), source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails}} AS metadata
+"""