STAGING to main #448
Merged 1 commit from STAGING into main on Jun 18, 2024
29 changes: 18 additions & 11 deletions backend/Dockerfile
@@ -1,15 +1,22 @@
FROM python:3.10
FROM python:3.10-slim
WORKDIR /code
ENV PORT 8000
EXPOSE 8000
# Install dependencies and clean up in one layer
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libgl1-mesa-glx \
cmake \
poppler-utils \
tesseract-ocr && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Set LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
# Copy requirements file and install Python dependencies
COPY requirements.txt /code/
RUN pip install --no-cache-dir --upgrade -r requirements.txt
# Copy application code
COPY . /code
RUN apt-get update \
&& apt-get install -y libgl1-mesa-glx cmake \
&& apt-get install -y poppler-utils \
&& apt install -y tesseract-ocr \
&& export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH \
&& pip install --no-cache-dir --upgrade -r /code/requirements.txt

# CMD ["uvicorn", "score:app", "--host", "0.0.0.0", "--port", "8000","--workers", "4"]
CMD ["gunicorn", "score:app","--workers","4","--worker-class","uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000", "--timeout", "300"]

# Set command
CMD ["gunicorn", "score:app", "--workers", "2", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:8000", "--timeout", "300"]
1 change: 1 addition & 0 deletions backend/example.env
@@ -20,3 +20,4 @@ LANGCHAIN_API_KEY = ""
LANGCHAIN_PROJECT = ""
LANGCHAIN_TRACING_V2 = ""
LANGCHAIN_ENDPOINT = ""
GCS_FILE_CACHE = "" # Save the file into GCS or local storage; should be True or False
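The new `GCS_FILE_CACHE` flag controls whether uploaded files are cached in GCS or on local disk, which is why the error paths below now call `delete_file_from_gcs` instead of `delete_uploaded_local_file`. A minimal sketch of how such a flag could gate the two cleanup paths (the helper itself is hypothetical; the imports follow the modules visible elsewhere in this diff):

```python
import os

from src.document_sources.gcs_bucket import delete_file_from_gcs
from src.shared.common_fn import delete_uploaded_local_file
from src.shared.constants import BUCKET_UPLOAD


def cleanup_uploaded_file(merged_file_path: str, file_name: str) -> None:
    """Hypothetical helper: choose the delete path based on GCS_FILE_CACHE."""
    # Environment values are strings, so normalise "True"/"False" explicitly.
    use_gcs = os.environ.get("GCS_FILE_CACHE", "False").strip().lower() == "true"
    if use_gcs:
        delete_file_from_gcs(BUCKET_UPLOAD, file_name)
    else:
        delete_uploaded_local_file(merged_file_path, file_name)
```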
2 changes: 1 addition & 1 deletion backend/requirements.txt
@@ -43,7 +43,7 @@ google-cloud-aiplatform
google-cloud-bigquery==3.19.0
google-cloud-core==2.4.1
google-cloud-resource-manager==1.12.3
google-cloud-storage==2.16.0
google-cloud-storage
google-crc32c==1.5.0
google-resumable-media==2.7.0
googleapis-common-protos==1.63.0
6 changes: 3 additions & 3 deletions backend/score.py
@@ -162,7 +162,7 @@ async def extract_knowledge_graph_from_file(
merged_file_path = os.path.join(MERGED_DIR,file_name)
logging.info(f'File path:{merged_file_path}')
result = await asyncio.to_thread(
extract_graph_from_file_local_file, graph, model, file_name, merged_file_path, allowedNodes, allowedRelationship)
extract_graph_from_file_local_file, graph, model, merged_file_path, file_name, allowedNodes, allowedRelationship)

elif source_type == 's3 bucket' and source_url:
result = await asyncio.to_thread(
@@ -191,7 +191,7 @@ async def extract_knowledge_graph_from_file(
error_message = str(e)
graphDb_data_Access.update_exception_db(file_name,error_message)
if source_type == 'local file':
delete_uploaded_local_file(merged_file_path, file_name)
delete_file_from_gcs(BUCKET_UPLOAD,file_name)
josn_obj = {'message':message,'error_message':error_message, 'file_name': file_name,'status':'Failed','db_url':uri,'failed_count':1, 'source_type': source_type}
logger.log_struct(josn_obj)
logging.exception(f'File Failed in extraction: {josn_obj}')
@@ -342,7 +342,7 @@ async def upload_large_file_into_chunks(file:UploadFile = File(...), chunkNumber
password=Form(None), database=Form(None)):
try:
graph = create_graph_database_connection(uri, userName, password, database)
result = await asyncio.to_thread(upload_file, graph, model, file, chunkNumber, totalChunks, originalname, CHUNK_DIR, MERGED_DIR)
result = await asyncio.to_thread(upload_file, graph, model, file, chunkNumber, totalChunks, originalname, uri, CHUNK_DIR, MERGED_DIR)
josn_obj = {'api_name':'upload','db_url':uri}
logger.log_struct(josn_obj)
if int(chunkNumber) == int(totalChunks):
27 changes: 18 additions & 9 deletions backend/src/QA_integration_new.py
@@ -33,7 +33,7 @@
MATCH (chunk)-[:PART_OF]->(d:Document)
CALL { WITH chunk
MATCH (chunk)-[:HAS_ENTITY]->(e)
MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,3}(:!Chunk&!Document)
MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document)
UNWIND rels as r
RETURN collect(distinct r) as rels
}
@@ -45,22 +45,26 @@
apoc.text.join(texts,"\n----\n") +
apoc.text.join(entities,"\n")
as text, entities, chunkIds, page_numbers ,start_times
RETURN text, score, {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkIds:chunkIds, page_numbers:page_numbers,start_times:start_times} as metadata
RETURN text, score, {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkIds:chunkIds, page_numbers:page_numbers,start_times:start_times,entities:entities} as metadata
"""

SYSTEM_TEMPLATE = """
You are an AI-powered question-answering agent. Your task is to provide accurate and concise responses to user queries based on the given context, chat history, and available resources.
You are an AI-powered question-answering agent. Your task is to provide accurate and comprehensive responses to user queries based on the given context, chat history, and available resources.

### Response Guidelines:
1. **Direct Answers**: Provide straightforward answers to the user's queries without headers unless requested. Avoid speculative responses.
1. **Direct Answers**: Provide clear and thorough answers to the user's queries without headers unless requested. Avoid speculative responses.
2. **Utilize History and Context**: Leverage relevant information from previous interactions, the current user input, and the context provided below.
3. **No Greetings in Follow-ups**: Start with a greeting in initial interactions. Avoid greetings in subsequent responses unless there's a significant break or the chat restarts.
4. **Admit Unknowns**: Clearly state if an answer is unknown. Avoid making unsupported statements.
5. **Avoid Hallucination**: Only provide information based on the context provided. Do not invent information.
6. **Response Length**: Keep responses concise and relevant. Aim for clarity and completeness within 2-3 sentences unless more detail is requested.
6. **Response Length**: Keep responses concise and relevant. Aim for clarity and completeness within 4-5 sentences unless more detail is requested.
7. **Tone and Style**: Maintain a professional and informative tone. Be friendly and approachable.
8. **Error Handling**: If a query is ambiguous or unclear, ask for clarification rather than providing a potentially incorrect answer.
9. **Fallback Options**: If the required information is not available in the provided context, provide a polite and helpful response. Example: "I don't have that information right now." or "I'm sorry, but I don't have that information. Is there something else I can help with?"
10. **Context Availability**: If the context is empty, do not provide answers based solely on internal knowledge. Instead, respond appropriately by indicating the lack of information.


**IMPORTANT** : DO NOT ANSWER FROM YOUR KNOWLEDGE BASE USE THE BELOW CONTEXT

### Context:
<context>
Expand All @@ -72,15 +76,18 @@
AI Response: 'Hello there! How can I assist you today?'

User: "What is Langchain?"
AI Response: "Langchain is a framework that enables the development of applications powered by large language models, such as chatbots."
AI Response: "Langchain is a framework that enables the development of applications powered by large language models, such as chatbots. It simplifies the integration of language models into various applications by providing useful tools and components."

User: "Can you explain how to use memory management in Langchain?"
AI Response: "Langchain's memory management involves utilizing built-in mechanisms to manage conversational context effectively, ensuring a coherent user experience."
AI Response: "Langchain's memory management involves utilizing built-in mechanisms to manage conversational context effectively. It ensures that the conversation remains coherent and relevant by maintaining the history of interactions and using it to inform responses."

User: "I need help with PyCaret's classification model."
AI Response: "PyCaret simplifies the process of building and deploying machine learning models. For classification tasks, you can use PyCaret's setup function to prepare your data, then compare and tune models."
AI Response: "PyCaret simplifies the process of building and deploying machine learning models. For classification tasks, you can use PyCaret's setup function to prepare your data. After setup, you can compare multiple models to find the best one, and then fine-tune it for better performance."

User: "What can you tell me about the latest realtime trends in AI?"
AI Response: "I don't have that information right now. Is there something else I can help with?"

Note: This system does not generate answers based solely on internal knowledge. It answers from the information provided in the user's current and previous inputs, and from explicitly referenced external sources.
Note: This system does not generate answers based solely on internal knowledge. It answers from the information provided in the user's current and previous inputs, and from the context.
"""

def get_neo4j_retriever(graph, index_name="vector", search_k=CHAT_SEARCH_KWARG_K, score_threshold=CHAT_SEARCH_KWARG_SCORE_THRESHOLD):
@@ -288,7 +295,9 @@ def QA_RAG(graph,model,question,session_id):
}
)
if docs:
# print(docs)
formatted_docs,sources = format_documents(docs)

doc_retrieval_time = time.time() - start_time
logging.info(f"Modified question and Documents retrieved in {doc_retrieval_time:.2f} seconds")

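The reworked `SYSTEM_TEMPLATE` now allows 4-5 sentence answers and explicitly forbids answering from the model's own knowledge when the retrieved context is empty. A sketch of how the template can be wired into a chat prompt (assuming LangChain's `ChatPromptTemplate`/`MessagesPlaceholder` API and that the template exposes a `{context}` placeholder inside the `<context>` tags; the variable names here are illustrative, not necessarily what `QA_RAG` uses):

```python
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# SYSTEM_TEMPLATE is the string defined above; {context} is filled with the
# formatted documents returned by format_documents().
rag_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_TEMPLATE),
        MessagesPlaceholder(variable_name="messages"),  # prior chat turns
        ("human", "{input}"),
    ]
)

# Example wiring (llm is any LangChain chat model):
# chain = rag_prompt | llm
# answer = chain.invoke({"messages": history.messages,
#                        "context": formatted_docs,
#                        "input": question})
```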
2 changes: 1 addition & 1 deletion backend/src/chunkid_entities.py
@@ -10,7 +10,7 @@
MATCH (chunk)-[:PART_OF]->(d:Document)
CALL {WITH chunk
MATCH (chunk)-[:HAS_ENTITY]->(e)
MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,3}(:!Chunk&!Document)
MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document)
UNWIND rels as r
RETURN collect(distinct r) as rels
}
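Both retrieval queries in this PR shrink the entity-neighbourhood expansion from `{0,3}` to `{0,2}` hops, which keeps the traversal and the amount of graph context returned per chunk smaller. A sketch of keeping that bound in one constant (a hypothetical helper, not the project's actual `CHUNK_QUERY`; only the quantified path pattern is taken from the diff, the surrounding MATCH/RETURN clauses are illustrative, and the hop bound is interpolated into the string because Cypher does not accept it as a query parameter):

```python
from langchain_community.graphs import Neo4jGraph

MAX_NEIGHBOUR_HOPS = 2  # the bound this PR settles on: {0,2} instead of {0,3}

CHUNK_NEIGHBOURHOOD_QUERY = f"""
MATCH (chunk:Chunk) WHERE chunk.id IN $chunk_ids
MATCH (chunk)-[:PART_OF]->(d:Document)
CALL {{ WITH chunk
    MATCH (chunk)-[:HAS_ENTITY]->(e)
    MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){{0,{MAX_NEIGHBOUR_HOPS}}}(:!Chunk&!Document)
    UNWIND rels AS r
    RETURN collect(DISTINCT r) AS rels
}}
RETURN chunk, d, rels
"""


def get_chunk_neighbourhood(graph: Neo4jGraph, chunk_ids: list[str]):
    # Neo4jGraph.query runs raw Cypher with bound parameters.
    return graph.query(CHUNK_NEIGHBOURHOOD_QUERY, params={"chunk_ids": chunk_ids})
```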
73 changes: 69 additions & 4 deletions backend/src/document_sources/gcs_bucket.py
@@ -7,6 +7,7 @@
from PyPDF2 import PdfReader
import io
from google.oauth2.credentials import Credentials
import time

def get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, creds):
storage_client = storage.Client(project=gcs_project_id, credentials=creds)
@@ -41,7 +42,7 @@ def get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder
def load_pdf(file_path):
return PyMuPDFLoader(file_path)

def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token):
def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None):

if gcs_bucket_folder is not None:
if gcs_bucket_folder.endswith('/'):
@@ -56,8 +57,12 @@ def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, g
# pages = loader.load()
# file_name = gcs_blob_filename
#creds= Credentials(access_token)
creds= Credentials(access_token)
storage_client = storage.Client(project=gcs_project_id, credentials=creds)
if access_token is None:
storage_client = storage.Client(project=gcs_project_id)
else:
creds= Credentials(access_token)
storage_client = storage.Client(project=gcs_project_id, credentials=creds)
print(f'BLOB Name: {blob_name}')
bucket = storage_client.bucket(gcs_bucket_name)
blob = bucket.blob(blob_name)
content = blob.download_as_bytes()
@@ -70,4 +75,64 @@
text += page.extract_text()
pages = [Document(page_content = text)]
return gcs_blob_filename, pages


def upload_file_to_gcs(file_chunk, chunk_number, original_file_name, bucket_name):
storage_client = storage.Client()

file_name = f'{original_file_name}_part_{chunk_number}'
bucket = storage_client.bucket(bucket_name)
file_data = file_chunk.file.read()
# print(f'data after read {file_data}')

blob = bucket.blob(file_name)
file_io = io.BytesIO(file_data)
blob.upload_from_file(file_io)
# Define the lifecycle rule to delete objects after 6 hours
# rule = {
# "action": {"type": "Delete"},
# "condition": {"age": 1} # Age in days (24 hours = 1 days)
# }

# # Get the current lifecycle policy
# lifecycle = list(bucket.lifecycle_rules)

# # Add the new rule
# lifecycle.append(rule)

# # Set the lifecycle policy on the bucket
# bucket.lifecycle_rules = lifecycle
# bucket.patch()
time.sleep(1)
logging.info('Chunk uploaded successfully in gcs')

def merge_file_gcs(bucket_name, original_file_name: str):
storage_client = storage.Client()
# Retrieve chunks from GCS
blobs = storage_client.list_blobs(bucket_name, prefix=f"{original_file_name}_part_")
chunks = []
for blob in blobs:
chunks.append(blob.download_as_bytes())
blob.delete()

# Merge chunks into a single file
merged_file = b"".join(chunks)
blob = storage_client.bucket(bucket_name).blob(original_file_name)
logging.info('save the merged file from chunks in gcs')
file_io = io.BytesIO(merged_file)
blob.upload_from_file(file_io)
pdf_reader = PdfReader(file_io)
file_size = len(merged_file)
total_pages = len(pdf_reader.pages)

return total_pages, file_size

def delete_file_from_gcs(bucket_name, file_name):
try:
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(file_name)
if blob.exists():
blob.delete()
logging.info('File deleted from GCS successfully')
except:
raise Exception('BLOB not exists in GCS')
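The new GCS helpers mirror the chunked-upload flow that previously ran against local disk: each incoming chunk is stored as `<original_name>_part_<n>`, the parts are merged back into a single blob once the last chunk arrives, and the blob is deleted when the document is removed. A minimal usage sketch (the bucket name, file name and driver function here are illustrative; in the backend, `upload_file` orchestrates this):

```python
from src.document_sources.gcs_bucket import (
    delete_file_from_gcs,
    merge_file_gcs,
    upload_file_to_gcs,
)

BUCKET = "my-upload-bucket"   # assumed bucket name, for illustration only
ORIGINAL_NAME = "report.pdf"


def handle_chunk(file_chunk, chunk_number: int, total_chunks: int):
    """Hypothetical driver: store one chunk, merge when the last one arrives."""
    upload_file_to_gcs(file_chunk, chunk_number, ORIGINAL_NAME, BUCKET)
    if int(chunk_number) == int(total_chunks):
        # Stitch report.pdf_part_1..n back into report.pdf and report its
        # page count and size, as merge_file_gcs does above.
        total_pages, file_size = merge_file_gcs(BUCKET, ORIGINAL_NAME)
        return total_pages, file_size
    return None

# Later, when the document is deleted from the graph:
# delete_file_from_gcs(BUCKET, ORIGINAL_NAME)
```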
7 changes: 5 additions & 2 deletions backend/src/document_sources/local_file.py
@@ -43,13 +43,17 @@ def get_documents_from_file_by_path(file_path,file_name):

if page.metadata['page_number']>page_number:
page_number+=1
if not metadata:
metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
pages.append(Document(page_content = page_content, metadata=metadata))
page_content=''

if page == unstructured_pages[-1]:
if not metadata:
metadata = {'total_pages':unstructured_pages[-1].metadata['page_number']}
pages.append(Document(page_content = page_content, metadata=metadata))

elif page.metadata['category']=='PageBreak':
elif page.metadata['category']=='PageBreak' and page!=unstructured_pages[0]:
page_number+=1
pages.append(Document(page_content = page_content, metadata=metadata))
page_content=''
@@ -65,5 +69,4 @@ def get_documents_from_file_by_path(file_path,file_name):
else:
logging.info(f'File {file_name} does not exist')
raise Exception(f'File {file_name} does not exist')

return file_name, pages , file_extension
6 changes: 3 additions & 3 deletions backend/src/graphDB_dataAccess.py
@@ -2,8 +2,8 @@
import os
from datetime import datetime
from langchain_community.graphs import Neo4jGraph
from src.shared.common_fn import delete_uploaded_local_file
from src.api_response import create_api_response
from src.document_sources.gcs_bucket import delete_file_from_gcs
from src.shared.constants import BUCKET_UPLOAD
from src.entities.source_node import sourceNode
import json

@@ -175,7 +175,7 @@ def delete_file_from_graph(self, filenames, source_types, deleteEntities:str, me
merged_file_path = os.path.join(merged_dir, file_name)
if source_type == 'local file':
logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}')
delete_uploaded_local_file(merged_file_path, file_name)
delete_file_from_gcs(BUCKET_UPLOAD,file_name)

query_to_delete_document="""
MATCH (d:Document) where d.fileName in $filename_list and d.fileSource in $source_types_list