gcs file content read via storage client #417

Merged
merged 2 commits on Jun 11, 2024

backend/score.py (5 changes: 3 additions & 2 deletions)

@@ -138,7 +138,8 @@ async def extract_knowledge_graph_from_file(
     file_name=Form(None),
     allowedNodes=Form(None),
     allowedRelationship=Form(None),
-    language=Form(None)
+    language=Form(None),
+    access_token=Form(None)
     ):
     """
     Calls 'extract_graph_from_file' in a new thread to create Neo4jGraph from a
@@ -177,7 +178,7 @@ async def extract_knowledge_graph_from_file(
 
     elif source_type == 'gcs bucket' and gcs_bucket_name:
         result = await asyncio.to_thread(
-            extract_graph_from_file_gcs, graph, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, allowedNodes, allowedRelationship)
+            extract_graph_from_file_gcs, graph, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, allowedNodes, allowedRelationship)
     else:
         return create_api_response('Failed',message='source_type is other than accepted source')
     if result is not None:
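
The net effect on the API is that the extract endpoint (the /extract path the frontend's FileAPI.ts builds) now takes an optional access_token form field and forwards it to the GCS extraction branch. Below is a minimal sketch of a request exercising the new field; the host, port, and placeholder values are assumptions, and the fields other than access_token are taken only from the parameters visible in this diff, so they may not be the endpoint's complete contract.

# Hypothetical request to the /extract endpoint; all values are placeholders.
# Only access_token is the field added by this PR.
import requests

form_data = {
    "source_type": "gcs bucket",            # branch that calls extract_graph_from_file_gcs
    "gcs_project_id": "my-gcp-project",
    "gcs_bucket_name": "my-bucket",
    "gcs_bucket_folder": "reports",
    "file_name": "report.pdf",
    "access_token": "ya29.placeholder-oauth-token",  # new: end user's OAuth2 access token
}

response = requests.post("http://localhost:8000/extract", data=form_data)
print(response.json())

Because the field is declared as Form(None), omitting it leaves the existing non-GCS sources unchanged.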

backend/src/document_sources/gcs_bucket.py (30 changes: 24 additions & 6 deletions)

@@ -1,8 +1,12 @@
 import os
 import logging
 from google.cloud import storage
-from langchain_community.document_loaders import GCSFileLoader
+from langchain_community.document_loaders import GCSFileLoader, GCSDirectoryLoader
 from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_core.documents import Document
+from PyPDF2 import PdfReader
+import io
+from google.oauth2.credentials import Credentials
 
 def get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, creds):
     storage_client = storage.Client(project=gcs_project_id, credentials=creds)
@@ -37,7 +41,7 @@ def get_gcs_bucket_files_info(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, creds):
 def load_pdf(file_path):
     return PyMuPDFLoader(file_path)
 
-def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename):
+def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token):
 
     if gcs_bucket_folder is not None:
         if gcs_bucket_folder.endswith('/'):
@@ -48,8 +52,22 @@ def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename):
         blob_name = gcs_blob_filename
     #credentials, project_id = google.auth.default()
     logging.info(f"GCS project_id : {gcs_project_id}")
-    loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=load_pdf)
-    pages = loader.load()
-    file_name = gcs_blob_filename
-    return file_name, pages
+    #loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=load_pdf)
+    # pages = loader.load()
+    # file_name = gcs_blob_filename
+    #creds= Credentials(access_token)
+    creds= Credentials(access_token)
+    storage_client = storage.Client(project=gcs_project_id, credentials=creds)
+    bucket = storage_client.bucket(gcs_bucket_name)
+    blob = bucket.blob(blob_name)
+    content = blob.download_as_bytes()
+    pdf_file = io.BytesIO(content)
+    pdf_reader = PdfReader(pdf_file)
+
+    # Extract text from all pages
+    text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+    pages = [Document(page_content = text)]
+    return gcs_blob_filename, pages

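In short, get_documents_from_gcs no longer goes through GCSFileLoader: it builds a storage.Client from the caller's OAuth2 access token, downloads the blob as bytes, extracts the text with PyPDF2, and returns the whole PDF as a single langchain Document. A minimal usage sketch follows, assuming the backend package layout (src.document_sources.gcs_bucket) and placeholder project, bucket, and token values.

# Sketch only: the import path and all argument values are placeholders.
from src.document_sources.gcs_bucket import get_documents_from_gcs

file_name, pages = get_documents_from_gcs(
    gcs_project_id="my-gcp-project",
    gcs_bucket_name="my-bucket",
    gcs_bucket_folder=None,                       # optional; None reads the blob from the bucket root
    gcs_blob_filename="report.pdf",
    access_token="ya29.placeholder-oauth-token",  # short-lived token from the browser-side Google sign-in
)

print(file_name)                    # "report.pdf"
print(len(pages))                   # 1: the full PDF text is wrapped in a single Document
print(pages[0].page_content[:200])

The design change is in how the read authenticates: google.oauth2.credentials.Credentials is built from the token the frontend captures at Google sign-in (see GCSModal.tsx below), instead of whatever default credentials GCSFileLoader would have picked up, so the read is scoped to buckets the signed-in user can already access.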

backend/src/entities/source_node.py (1 change: 1 addition & 0 deletions)

@@ -23,3 +23,4 @@ class sourceNode:
     language:str=None
     is_cancelled:bool=None
     processed_chunk:int=None
+    access_token:str=None

backend/src/graphDB_dataAccess.py (6 changes: 4 additions & 2 deletions)

@@ -36,13 +36,15 @@ def create_source_node(self, obj_source_node:sourceNode):
                 d.processingTime = $pt, d.errorMessage = $e_message, d.nodeCount= $n_count,
                 d.relationshipCount = $r_count, d.model= $model, d.gcsBucket=$gcs_bucket,
                 d.gcsBucketFolder= $gcs_bucket_folder, d.language= $language,d.gcsProjectId= $gcs_project_id,
-                d.is_cancelled=False, d.total_pages=0, d.total_chunks=0, d.processed_chunk=0, d.total_pages=$total_pages""",
+                d.is_cancelled=False, d.total_pages=0, d.total_chunks=0, d.processed_chunk=0, d.total_pages=$total_pages,
+                d.access_token=$access_token""",
                 {"fn":obj_source_node.file_name, "fs":obj_source_node.file_size, "ft":obj_source_node.file_type, "st":job_status,
                  "url":obj_source_node.url,
                  "awsacc_key_id":obj_source_node.awsAccessKeyId, "f_source":obj_source_node.file_source, "c_at":obj_source_node.created_at,
                  "u_at":obj_source_node.created_at, "pt":0, "e_message":'', "n_count":0, "r_count":0, "model":obj_source_node.model,
                  "gcs_bucket": obj_source_node.gcsBucket, "gcs_bucket_folder": obj_source_node.gcsBucketFolder,
-                 "language":obj_source_node.language, "gcs_project_id":obj_source_node.gcsProjectId, "total_pages": obj_source_node.total_pages})
+                 "language":obj_source_node.language, "gcs_project_id":obj_source_node.gcsProjectId, "total_pages": obj_source_node.total_pages,
+                 "access_token":obj_source_node.access_token})
         except Exception as e:
             error_message = str(e)
             logging.info(f"error_message = {error_message}")

backend/src/main.py (5 changes: 3 additions & 2 deletions)

@@ -79,6 +79,7 @@ def create_source_node_graph_url_gcs(graph, model, gcs_project_id, gcs_bucket_name
     obj_source_node.gcsBucketFolder = file_metadata['gcsBucketFolder']
     obj_source_node.gcsProjectId = file_metadata['gcsProjectId']
     obj_source_node.created_at = datetime.now()
+    obj_source_node.access_token = credentials.token
 
     try:
         graphDb_data_Access = graphDBdataAccess(graph)
@@ -196,9 +197,9 @@ def extract_graph_from_file_Wikipedia(graph, model, wiki_query, max_sources, language
 
     return processing_source(graph, model, file_name, pages, allowedNodes, allowedRelationship)
 
-def extract_graph_from_file_gcs(graph, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, allowedNodes, allowedRelationship):
+def extract_graph_from_file_gcs(graph, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, allowedNodes, allowedRelationship):
 
-    file_name, pages = get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename)
+    file_name, pages = get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token)
     if pages==None or len(pages)==0:
         raise Exception(f'Pdf content is not available for file : {file_name}')

frontend/src/components/Content.tsx (3 changes: 2 additions & 1 deletion)

@@ -181,7 +181,8 @@ const Content: React.FC<ContentProps> = ({ isLeftExpanded, isRightExpanded }) =>
         selectedNodes.map((l) => l.value),
         selectedRels.map((t) => t.value),
         fileItem.google_project_id,
-        fileItem.language
+        fileItem.language,
+        fileItem.access_token
       );
 
       if (apiResponse?.status === 'Failed') {

frontend/src/components/DropZone.tsx (2 changes: 1 addition & 1 deletion)

@@ -64,7 +64,7 @@ const DropZone: FunctionComponent = () => {
          processing: defaultValues.processing,
          model: defaultValues.model,
          fileSource: defaultValues.fileSource,
-         processingProgress:defaultValues.processingProgress
+         processingProgress: defaultValues.processingProgress,
        });
      }
    });

frontend/src/components/FileTable.tsx (1 change: 1 addition & 0 deletions)

@@ -366,6 +366,7 @@ const FileTable: React.FC<FileTableProps> = ({ isExpanded, connectionStatus, set
            ? Math.floor((item.processed_chunk / item.total_chunks) * 100)
            : undefined,
          total_pages: item.total_pages ?? 0,
+         access_token: item.access_token ?? '',
        });
      }
    });

frontend/src/components/GCSModal.tsx (4 changes: 3 additions & 1 deletion)

@@ -86,6 +86,7 @@ const GCSModal: React.FC<S3ModalProps> = ({ hideModal, open }) => {
            google_project_id: item.gcsProjectId,
            total_pages: 'NA',
            id: uuidv4(),
+           access_token: codeResponse.access_token,
            ...defaultValues,
          });
        } else {
@@ -99,7 +100,8 @@ const GCSModal: React.FC<S3ModalProps> = ({ hideModal, open }) => {
            processing: defaultValues.processing,
            model: defaultValues.model,
            fileSource: defaultValues.fileSource,
-           processingProgress:defaultValues.processingProgress
+           processingProgress: defaultValues.processingProgress,
+           access_token: codeResponse.access_token,
          });
        }
      });

frontend/src/components/S3Modal.tsx (2 changes: 1 addition & 1 deletion)

@@ -98,7 +98,7 @@ const S3Modal: React.FC<S3ModalProps> = ({ hideModal, open }) => {
          processing: defaultValues.processing,
          model: defaultValues.model,
          fileSource: defaultValues.fileSource,
-         processingProgress:defaultValues.processingProgress
+         processingProgress: defaultValues.processingProgress,
        });
      }
    });

frontend/src/components/SchemaFromText.tsx (2 changes: 1 addition & 1 deletion)

@@ -121,7 +121,7 @@ const SchemaFromTextDialog = ({
          fluid
          value={userText}
          onChange={(e) => setUserText(e.target.value)}
-         size="large"
+         size='large'
        />
        <Dialog.Actions className='!mt-4'>
          <Button loading={loading} disabled={userText.trim() === '' || loading} onClick={clickHandler}>

frontend/src/components/WikipediaModal.tsx (2 changes: 1 addition & 1 deletion)

@@ -90,7 +90,7 @@ const WikipediaModal: React.FC<WikipediaModalTypes> = ({ hideModal, open }) => {
          processing: defaultValues.processing,
          model: defaultValues.model,
          fileSource: defaultValues.fileSource,
-         processingProgress:defaultValues.processingProgress
+         processingProgress: defaultValues.processingProgress,
        });
      }
    });

frontend/src/components/YoutubeModal.tsx (2 changes: 1 addition & 1 deletion)

@@ -77,7 +77,7 @@ const YoutubeModal: React.FC<S3ModalProps> = ({ hideModal, open }) => {
          processing: defaultValues.processing,
          model: defaultValues.model,
          fileSource: defaultValues.fileSource,
-         processingProgress:defaultValues.processingProgress
+         processingProgress: defaultValues.processingProgress,
        });
      }
    });

frontend/src/types.ts (3 changes: 3 additions & 0 deletions)

@@ -22,6 +22,7 @@ export interface CustomFileBase extends Partial<globalThis.File> {
   google_project_id?: string;
   language?: string;
   processingProgress?: number;
+  access_token?: string;
 }
 export interface CustomFile extends CustomFileBase {
   id: string;
@@ -62,6 +63,7 @@ export type ExtractParams = {
   allowedRelationship?: string[];
   gcs_project_id?: string;
   language?: string;
+  access_token?: string;
 } & { [key: string]: any };
 
 export type UploadParams = {
@@ -118,6 +120,7 @@ export interface SourceNode {
   processed_chunk?: number;
   total_chunks?: number;
   total_pages?: number;
+  access_token?: string;
 }
 
 export interface SideNavProps {

frontend/src/utils/FileAPI.ts (4 changes: 3 additions & 1 deletion)

@@ -27,7 +27,8 @@ export const extractAPI = async (
   allowedNodes?: string[],
   allowedRelationship?: string[],
   gcs_project_id?: string,
-  language?: string
+  language?: string,
+  access_token?: string
 ): Promise<any> => {
   const urlExtract = `${url()}/extract`;
   const method: Method = 'post';
@@ -65,6 +66,7 @@ export const extractAPI = async (
       allowedNodes,
       allowedRelationship,
       gcs_project_id,
+      access_token,
     };
   } else if (source_type === 'youtube') {
     additionalParams = {