Graph visualization improvement and chunk text details #860

Merged · 2 commits · Nov 12, 2024
16 changes: 11 additions & 5 deletions README.md
@@ -31,6 +31,11 @@ If you are using Neo4j Desktop, you will not be able to use the docker-compose b
### Local deployment
#### Running through docker-compose
By default only OpenAI and Diffbot are enabled since Gemini requires extra GCP configurations.
The models available in each environment are controlled by the VITE_LLM_MODELS_PROD variable, so you can configure the set of models you need. Example:
```env
VITE_LLM_MODELS_PROD="openai_gpt_4o,openai_gpt_4o_mini,diffbot,gemini_1.5_flash"
```

In your root folder, create a .env file with your OPENAI and DIFFBOT keys (if you want to use both):
```env
@@ -40,13 +45,13 @@ DIFFBOT_API_KEY="your-diffbot-key"

if you only want OpenAI:
```env
VITE_LLM_MODELS="diffbot,openai-gpt-3.5,openai-gpt-4o"
VITE_LLM_MODELS_PROD="diffbot,openai-gpt-3.5,openai-gpt-4o"
OPENAI_API_KEY="your-openai-key"
```

if you only want Diffbot:
```env
VITE_LLM_MODELS="diffbot"
VITE_LLM_MODELS_PROD="diffbot"
DIFFBOT_API_KEY="your-diffbot-key"
```

@@ -72,15 +77,15 @@ You can of course combine all (local, youtube, wikipedia, s3 and gcs) or remove

### Chat Modes

By default, all of the chat modes will be available: vector, graph+vector and graph.
By default, all of the chat modes will be available: vector, graph_vector, graph, fulltext, graph_vector_fulltext, entity_vector and global_vector.
If no mode is specified in the chat modes variable, all modes will be available:
```env
VITE_CHAT_MODES=""
```

If, however, you want to make only the vector mode or only the graph mode available, you can do that by specifying the mode in the env:
```env
VITE_CHAT_MODES="vector,graph+vector"
VITE_CHAT_MODES="vector,graph"
```

#### Running Backend and Frontend separately (dev environment)
@@ -144,18 +149,19 @@ Allow unauthenticated request : Yes
| VITE_BACKEND_API_URL | Optional | http://localhost:8000 | URL for backend API |
| VITE_BLOOM_URL | Optional | https://workspace-preview.neo4j.io/workspace/explore?connectURL={CONNECT_URL}&search=Show+me+a+graph&featureGenAISuggestions=true&featureGenAISuggestionsInternal=true | URL for Bloom visualization |
| VITE_REACT_APP_SOURCES | Mandatory | local,youtube,wiki,s3 | List of input sources that will be available |
| VITE_LLM_MODELS | Mandatory | diffbot,openai-gpt-3.5,openai-gpt-4o | Models available for selection on the frontend, used for entity extraction and Q&A |
| VITE_CHAT_MODES | Mandatory | vector,graph+vector,graph,hybrid | Chat modes available for Q&A |
| VITE_ENV | Mandatory | DEV or PROD | Environment variable for the app |
| VITE_TIME_PER_PAGE | Optional | 50 | Time per page for processing |
| VITE_CHUNK_SIZE | Optional | 5242880 | Size of each chunk of file for upload |
| VITE_GOOGLE_CLIENT_ID | Optional | | Client ID for Google authentication |
| VITE_LLM_MODELS_PROD | Optional | openai_gpt_4o,openai_gpt_4o_mini,diffbot,gemini_1.5_flash | To distinguish models based on the environment (PROD or DEV) |
| GCS_FILE_CACHE | Optional | False | If set to True, will save the files to process into GCS. If set to False, will save the files locally |
| ENTITY_EMBEDDING | Optional | False | If set to True, It will add embeddings for each entity in database |
| LLM_MODEL_CONFIG_ollama_<model_name> | Optional | | Set ollama config as - model_name,model_local_url for local deployments |
| RAGAS_EMBEDDING_MODEL | Optional | openai | Embedding model used by the Ragas evaluation framework |
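
Pulling a few of these together, a minimal frontend `.env` for local development might look like the following sketch, built from the defaults in the table above:

```env
VITE_BACKEND_API_URL="http://localhost:8000"
VITE_REACT_APP_SOURCES="local,youtube,wiki,s3"
VITE_LLM_MODELS_PROD="openai_gpt_4o,openai_gpt_4o_mini,diffbot,gemini_1.5_flash"
VITE_CHAT_MODES=""
VITE_ENV="DEV"
```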



## For local LLMs (Ollama)
1. Pull the Docker image of Ollama:
```bash
docker pull ollama/ollama
```
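
Once Ollama is running, point the app at a local model via the `LLM_MODEL_CONFIG_ollama_<model_name>` variable from the environment table above, in the `model_name,model_local_url` format. A sketch, assuming a local `llama3` model on Ollama's default port (both placeholders):

```env
LLM_MODEL_CONFIG_ollama_llama3="llama3,http://localhost:11434"
```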
100 changes: 91 additions & 9 deletions backend/score.py
@@ -12,14 +12,14 @@
from langchain_google_vertexai import ChatVertexAI
from src.api_response import create_api_response
from src.graphDB_dataAccess import graphDBdataAccess
from src.graph_query import get_graph_results
from src.graph_query import get_graph_results, get_chunktext_results
from src.chunkid_entities import get_entities_from_chunkids
from src.post_processing import create_vector_fulltext_indexes, create_entity_embedding
from sse_starlette.sse import EventSourceResponse
from src.communities import create_communities
from src.neighbours import get_neighbour_nodes
import json
from typing import List, Mapping
from typing import List, Mapping, Union
from starlette.middleware.sessions import SessionMiddleware
import google_auth_oauthlib.flow
from google.oauth2.credentials import Credentials
@@ -33,8 +33,10 @@
from Secweb.ContentSecurityPolicy import ContentSecurityPolicy
from Secweb.XContentTypeOptions import XContentTypeOptions
from Secweb.XFrameOptions import XFrame

from fastapi.middleware.gzip import GZipMiddleware
from src.ragas_eval import *
from starlette.types import ASGIApp, Message, Receive, Scope, Send
import gzip

logger = CustomLogger()
CHUNK_DIR = os.path.join(os.path.dirname(__file__), "chunks")
@@ -49,14 +51,42 @@ def healthy():

def sick():
    return False

class CustomGZipMiddleware:
    """Gzip responses only for requests whose path starts with one of the configured prefixes."""
    def __init__(
        self,
        app: ASGIApp,
        paths: List[str],
        minimum_size: int = 1000,
        compresslevel: int = 5
    ):
        self.app = app
        self.paths = paths
        self.minimum_size = minimum_size
        self.compresslevel = compresslevel

    async def __call__(self, scope: Scope, receive: Receive, send: Send):
        if scope["type"] != "http":
            return await self.app(scope, receive, send)

        path = scope["path"]
        should_compress = any(path.startswith(gzip_path) for gzip_path in self.paths)

        if not should_compress:
            return await self.app(scope, receive, send)

        # Delegate to Starlette's GZipMiddleware for the matching paths
        gzip_middleware = GZipMiddleware(
            app=self.app,
            minimum_size=self.minimum_size,
            compresslevel=self.compresslevel
        )
        await gzip_middleware(scope, receive, send)

app = FastAPI()
# SecWeb(app=app, Option={'referrer': False, 'xframe': False})
# app.add_middleware(HSTS, Option={'max-age': 4})
# app.add_middleware(ContentSecurityPolicy, Option={'default-src': ["'self'"], 'base-uri': ["'self'"], 'block-all-mixed-content': []}, script_nonce=False, style_nonce=False, report_only=False)
# app.add_middleware(XContentTypeOptions)
# app.add_middleware(XFrame, Option={'X-Frame-Options': 'DENY'})

app.add_middleware(ContentSecurityPolicy, Option={'default-src': ["'self'"], 'base-uri': ["'self'"], 'block-all-mixed-content': []}, script_nonce=False, style_nonce=False, report_only=False)
app.add_middleware(XContentTypeOptions)
app.add_middleware(XFrame, Option={'X-Frame-Options': 'DENY'})
#app.add_middleware(GZipMiddleware, minimum_size=1000, compresslevel=5)
app.add_middleware(CustomGZipMiddleware, minimum_size=1000, compresslevel=5, paths=["/sources_list", "/url/scan", "/extract", "/chat_bot", "/chunk_entities", "/get_neighbours", "/graph_query", "/schema", "/populate_graph_schema", "/get_unconnected_nodes_list", "/get_duplicate_nodes", "/fetch_chunktext"])
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
@@ -818,5 +848,57 @@ async def calculate_metric(question: str = Form(),
    finally:
        gc.collect()

@app.post("/fetch_chunktext")
async def fetch_chunktext(
    uri: str = Form(),
    database: str = Form(),
    userName: str = Form(),
    password: str = Form(),
    document_name: str = Form(),
    page_no: int = Form(1)
):
    try:
        payload_json_obj = {
            'api_name': 'fetch_chunktext',
            'db_url': uri,
            'userName': userName,
            'database': database,
            'document_name': document_name,
            'page_no': page_no,
            'logging_time': formatted_time(datetime.now(timezone.utc))
        }
        logger.log_struct(payload_json_obj, "INFO")
        start = time.time()
        result = await asyncio.to_thread(
            get_chunktext_results,
            uri=uri,
            username=userName,
            password=password,
            database=database,
            document_name=document_name,
            page_no=page_no
        )
        end = time.time()
        elapsed_time = end - start
        json_obj = {
            'api_name': 'fetch_chunktext',
            'db_url': uri,
            'document_name': document_name,
            'page_no': page_no,
            'logging_time': formatted_time(datetime.now(timezone.utc)),
            'elapsed_api_time': f'{elapsed_time:.2f}'
        }
        logger.log_struct(json_obj, "INFO")
        return create_api_response('Success', data=result, message=f"Total elapsed API time {elapsed_time:.2f}")
    except Exception as e:
        job_status = "Failed"
        message = "Unable to get chunk text response"
        error_message = str(e)
        logging.exception(f'Exception in fetch_chunktext: {error_message}')
        return create_api_response(job_status, message=message, error=error_message)
    finally:
        gc.collect()


if __name__ == "__main__":
uvicorn.run(app)
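
The new endpoint can be exercised with a plain HTTP client. A minimal sketch, assuming a backend on http://localhost:8000, placeholder Neo4j credentials and document name, and that `create_api_response` returns a JSON envelope with `data` and `message` keys:

```python
# Hypothetical client call to the new /fetch_chunktext endpoint.
import requests

resp = requests.post(
    "http://localhost:8000/fetch_chunktext",
    data={                                # form-encoded, matching the Form() params
        "uri": "neo4j://localhost:7687",  # placeholder connection details
        "userName": "neo4j",
        "password": "password",
        "database": "neo4j",
        "document_name": "example.pdf",   # placeholder document
        "page_no": 1,
    },
)
resp.raise_for_status()
# The path is registered with CustomGZipMiddleware, so large responses
# should arrive compressed (requests decompresses transparently).
print(resp.headers.get("Content-Encoding"))
payload = resp.json()
for chunk in payload["data"]["pageitems"]:
    print(chunk["position"], chunk["pagenumber"], chunk["text"][:80])
```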
49 changes: 33 additions & 16 deletions backend/src/communities.py
@@ -107,24 +107,38 @@
STORE_COMMUNITY_SUMMARIES = """
UNWIND $data AS row
MERGE (c:__Community__ {id:row.community})
SET c.summary = row.summary
SET c.summary = row.summary,
c.title = row.title
"""


COMMUNITY_SYSTEM_TEMPLATE = "Given input triples, generate the information summary. No preamble."

COMMUNITY_TEMPLATE = """Based on the provided nodes and relationships that belong to the same graph community,
generate a natural language summary of the provided information:
{community_info}

Summary:"""
COMMUNITY_TEMPLATE = """
Based on the provided nodes and relationships that belong to the same graph community,
generate the following output in exactly this format:
title: A concise title, no more than 4 words,
summary: A natural language summary of the information
{community_info}
Example output:
title: Example Title,
summary: This is an example summary that describes the key information of this community.
"""

PARENT_COMMUNITY_SYSTEM_TEMPLATE = "Given an input list of community summaries, generate a summary of the information"

PARENT_COMMUNITY_TEMPLATE = """Based on the provided list of community summaries that belong to the same graph community,
generate a natural language summary of the information. Include all the necessary information as possible
generate the following output in exactly this format:
title: A concise title, no more than 4 words,
summary: A natural language summary of the information. Include all the necessary information as much as possible.

{community_info}

Summary:"""
Example output:
title: Example Title,
summary: This is an example summary that describes the key information of this community.
"""


GET_COMMUNITY_DETAILS = """
@@ -277,8 +291,17 @@ def process_community_info(community, chain, is_parent=False):
            combined_text = " ".join(f"Summary {i+1}: {summary}" for i, summary in enumerate(community.get("texts", [])))
        else:
            combined_text = prepare_string(community)
        summary = chain.invoke({'community_info': combined_text})
        return {"community": community['communityId'], "summary": summary}
        summary_response = chain.invoke({'community_info': combined_text})
        lines = summary_response.splitlines()
        title = "Untitled Community"
        summary = ""
        for line in lines:
            if line.lower().startswith("title"):
                title = line.split(":", 1)[-1].strip()
            elif line.lower().startswith("summary"):
                summary = line.split(":", 1)[-1].strip()
        logging.info(f"Community Title : {title}")
        return {"community": community['communityId'], "title": title, "summary": summary}
    except Exception as e:
        logging.error(f"Failed to process community {community.get('communityId', 'unknown')}: {e}")
        return None
@@ -291,7 +314,7 @@ def create_community_summaries(gds, model):
    summaries = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_community_info, community, community_chain) for community in community_info_list.to_dict(orient="records")]

        for future in as_completed(futures):
            result = future.result()
            if result:
@@ -482,9 +505,3 @@ def create_communities(uri, username, password, database,model=COMMUNITY_CREATIO
        logging.warning("Failed to write communities. Constraint was not applied.")
    except Exception as e:
        logging.error(f"Failed to create communities: {e}")






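For reference, the title/summary parsing added above can be checked in isolation. A small illustrative sketch with a hand-written sample response (not taken from the repo):

```python
# Illustrative: how a "title: ... / summary: ..." model response is parsed.
sample = """title: Neo4j Ecosystem
summary: Nodes and tools related to the Neo4j graph database."""

title, summary = "Untitled Community", ""
for line in sample.splitlines():
    if line.lower().startswith("title"):
        title = line.split(":", 1)[-1].strip()
    elif line.lower().startswith("summary"):
        summary = line.split(":", 1)[-1].strip()

print(title)    # Neo4j Ecosystem
print(summary)  # Nodes and tools related to the Neo4j graph database.
```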
2 changes: 1 addition & 1 deletion backend/src/graphDB_dataAccess.py
@@ -354,7 +354,7 @@ def get_duplicate_nodes_list(self):
        score_value = float(os.environ.get('DUPLICATE_SCORE_VALUE'))
        text_distance = int(os.environ.get('DUPLICATE_TEXT_DISTANCE'))
        query_duplicate_nodes = """
                MATCH (n:!Chunk&!Session&!Document&!`__Community__`) with n
                MATCH (n:!Chunk&!Session&!Document&!`__Community__`&!`__Entity__`) with n
                WHERE n.embedding is not null and n.id is not null // and size(toString(n.id)) > 3
                WITH n ORDER BY count {{ (n)--() }} DESC, size(toString(n.id)) DESC // updated
                WITH collect(n) as nodes
33 changes: 32 additions & 1 deletion backend/src/graph_query.py
@@ -3,7 +3,7 @@
from neo4j import GraphDatabase
import os
import json
from src.shared.constants import GRAPH_CHUNK_LIMIT,GRAPH_QUERY
from src.shared.constants import GRAPH_CHUNK_LIMIT, GRAPH_QUERY, CHUNK_TEXT_QUERY, COUNT_CHUNKS_QUERY
# from neo4j.debug import watch

# watch("neo4j")
@@ -226,3 +226,34 @@ def get_graph_results(uri, username, password,database,document_names):
driver.close()


def get_chunktext_results(uri, username, password, database, document_name, page_no):
    """Retrieves chunk text, position, and page number from graph data with pagination."""
    driver = None  # ensure the finally block never hits an unbound name if the driver fails to open
    try:
        logging.info("Starting chunk text query process")
        offset = 10  # fixed page size: 10 chunks per page
        skip = (page_no - 1) * offset
        limit = offset
        driver = GraphDatabase.driver(uri, auth=(username, password))
        with driver.session(database=database) as session:
            total_chunks_result = session.run(COUNT_CHUNKS_QUERY, file_name=document_name)
            total_chunks = total_chunks_result.single()["total_chunks"]
            total_pages = (total_chunks + offset - 1) // offset  # ceiling division, e.g. 23 chunks -> 3 pages
            records = session.run(CHUNK_TEXT_QUERY, file_name=document_name, skip=skip, limit=limit)
            pageitems = [
                {
                    "text": record["chunk_text"],
                    "position": record["chunk_position"],
                    "pagenumber": record["page_number"]
                }
                for record in records
            ]
            logging.info(f"Query process completed with {len(pageitems)} chunks retrieved")
            return {
                "pageitems": pageitems,
                "total_pages": total_pages
            }
    except Exception as e:
        logging.error(f"An error occurred in get_chunktext_results. Error: {str(e)}")
        raise Exception("An error occurred in get_chunktext_results. Please check the logs for more details.") from e
    finally:
        if driver:
            driver.close()
3 changes: 2 additions & 1 deletion backend/src/neighbours.py
@@ -20,7 +20,8 @@
labels: [coalesce(apoc.coll.removeAll(labels(node), ['__Entity__'])[0], "*")],
element_id: elementId(node),
properties: {
id: CASE WHEN node.id IS NOT NULL THEN node.id ELSE node.fileName END
id: CASE WHEN node.id IS NOT NULL THEN node.id ELSE node.fileName END,
title: CASE WHEN node.title IS NOT NULL THEN node.title ELSE " " END
}
}
] AS nodes,
15 changes: 14 additions & 1 deletion backend/src/shared/constants.py
@@ -161,6 +161,19 @@
] AS entities
"""

COUNT_CHUNKS_QUERY = """
MATCH (d:Document {fileName: $file_name})<-[:PART_OF]-(c:Chunk)
RETURN count(c) AS total_chunks
"""

CHUNK_TEXT_QUERY = """
MATCH (d:Document {fileName: $file_name})<-[:PART_OF]-(c:Chunk)
RETURN c.text AS chunk_text, c.position AS chunk_position, c.page_number AS page_number
ORDER BY c.position
SKIP $skip
LIMIT $limit
"""

## CHAT SETUP
CHAT_MAX_TOKENS = 1000
CHAT_SEARCH_KWARG_SCORE_THRESHOLD = 0.5
@@ -717,4 +730,4 @@
value "2023-03-15"."
"## 5. Strict Compliance\n"
"Adhere to the rules strictly. Non-compliance will result in termination."
"""
"""