
Commit 642cac4

kaustubh-darekar authored and kartikpersistent committed
Updating langchain-neo4j package (#891)
* Added the langchain-neo4j package to the application.
* Updated the langchain-neo4j package; requirements.txt now lists only the necessary packages at their latest versions, and unnecessary imports were removed.
* Refactored code: replaced print statements with logging calls.
* Updated sentence-transformers usage to HuggingFace embeddings.
1 parent 269c451 commit 642cac4

15 files changed: +72 / -299 lines changed
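
The heart of the change, visible across the file diffs below, is a one-line import swap to the dedicated partner package, plus routing diagnostics through logging instead of print. A minimal before/after sketch (connection details come from environment variables, as in score.py's backend_connection_configuation):

import logging

# Before: from langchain_community.graphs import Neo4jGraph
# After: the dedicated partner package, pinned as langchain-neo4j==0.1.1
from langchain_neo4j import Neo4jGraph

# With no arguments, Neo4jGraph reads NEO4J_URI / NEO4J_USERNAME /
# NEO4J_PASSWORD from the environment.
graph = Neo4jGraph()
logging.info(f'login connection status of object: {graph}')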

backend/requirements.txt

Lines changed: 40 additions & 166 deletions
@@ -1,183 +1,57 @@
-aiohttp==3.9.3
-aiosignal==1.3.1
-annotated-types==0.6.0
-antlr4-python3-runtime==4.9.3
-anyio==4.3.0
-async-timeout==4.0.3
 asyncio==3.4.3
-attrs==23.2.0
-backoff==2.2.1
-beautifulsoup4==4.12.3
-boto3==1.34.140
-botocore==1.34.140
-cachetools==5.3.3
-certifi==2024.2.2
-cffi==1.16.0
-chardet==5.2.0
-charset-normalizer==3.3.2
-click==8.1.7
-coloredlogs==15.0.1
-contourpy==1.2.0
-cryptography==42.0.2
-cycler==0.12.1
-dataclasses-json==0.6.4
-dataclasses-json-speakeasy==0.5.11
-Deprecated==1.2.14
-distro==1.9.0
-docstring_parser==0.16
-effdet==0.4.1
-emoji==2.10.1
-exceptiongroup==1.2.0
-fastapi==0.111.0
+boto3==1.35.69
+botocore==1.35.69
+certifi==2024.8.30
+fastapi==0.115.5
 fastapi-health==0.4.0
-filelock==3.13.1
-filetype==1.2.0
-flatbuffers==23.5.26
-fonttools==4.49.0
-frozenlist==1.4.1
-fsspec==2024.2.0
-google-api-core==2.18.0
-google-auth==2.29.0
-google_auth_oauthlib==1.2.0
-google-cloud-aiplatform==1.58.0
-google-cloud-bigquery==3.19.0
+google-api-core==2.23.0
+google-auth==2.36.0
+google_auth_oauthlib==1.2.1
 google-cloud-core==2.4.1
-google-cloud-resource-manager==1.12.3
-google-cloud-storage==2.17.0
-google-crc32c==1.5.0
-google-resumable-media==2.7.0
-googleapis-common-protos==1.63.0
-greenlet==3.0.3
-grpc-google-iam-v1==0.13.0
-grpcio==1.62.1
-google-ai-generativelanguage==0.6.6
-grpcio-status==1.62.1
-h11==0.14.0
-httpcore==1.0.4
-httpx==0.27.0
-huggingface-hub
-humanfriendly==10.0
-idna==3.6
-importlib-resources==6.1.1
+json-repair==0.30.2
 pip-install==1.3.5
-iopath==0.1.10
-Jinja2==3.1.3
-jmespath==1.0.1
-joblib==1.3.2
-jsonpatch==1.33
-jsonpath-python==1.0.6
-jsonpointer==2.4
-json-repair==0.25.2
-kiwisolver==1.4.5
-langchain==0.3.0
-langchain-aws==0.2.1
-langchain-anthropic==0.2.1
-langchain-fireworks==0.2.0
-langchain-google-genai==2.0.0
-langchain-community==0.3.0
-langchain-core==0.3.5
-langchain-experimental==0.3.1
-langchain-google-vertexai==2.0.1
-langchain-groq==0.2.0
-langchain-openai==0.2.0
-langchain-text-splitters==0.3.0
+langchain==0.3.8
+langchain-aws==0.2.7
+langchain-anthropic==0.3.0
+langchain-fireworks==0.2.5
+langchain-community==0.3.8
+langchain-core==0.3.21
+langchain-experimental==0.3.3
+langchain-google-vertexai==2.0.7
+langchain-groq==0.2.1
+langchain-openai==0.2.9
+langchain-text-splitters==0.3.2
+langchain-huggingface==0.1.2
 langdetect==1.0.9
-langsmith==0.1.128
-layoutparser==0.3.4
+langsmith==0.1.146
 langserve==0.3.0
-#langchain-cli==0.0.25
-lxml==5.1.0
-MarkupSafe==2.1.5
-marshmallow==3.20.2
-matplotlib==3.7.2
-mpmath==1.3.0
-multidict==6.0.5
-mypy-extensions==1.0.0
 neo4j-rust-ext
-networkx==3.2.1
-nltk==3.8.1
-numpy==1.26.4
-omegaconf==2.3.0
-onnx==1.16.1
-onnxruntime==1.18.1
-openai==1.47.1
-opencv-python==4.8.0.76
-orjson==3.9.15
-packaging==23.2
-pandas==2.2.0
-pdf2image==1.17.0
-pdfminer.six==20221105
-pdfplumber==0.10.4
-pikepdf==8.11.0
-pillow==10.2.0
-pillow_heif==0.15.0
-portalocker==2.8.2
-proto-plus==1.23.0
-protobuf==4.23.4
-psutil==6.0.0
-pyasn1==0.6.0
-pyasn1_modules==0.4.0
-pycocotools==2.0.7
-pycparser==2.21
-pydantic==2.8.2
-pydantic_core==2.20.1
-pyparsing==3.0.9
-pypdf==4.0.1
-PyPDF2==3.0.1
-pypdfium2==4.27.0
-pytesseract==0.3.10
-python-dateutil==2.8.2
+nltk==3.9.1
+openai==1.55.1
+opencv-python==4.10.0.84
+psutil==6.1.0
+pydantic==2.9.0
 python-dotenv==1.0.1
-python-iso639==2024.2.7
-python-magic==0.4.27
-python-multipart==0.0.9
-pytube==15.0.0
-pytz==2024.1
-PyYAML==6.0.1
-rapidfuzz==3.6.1
-regex==2023.12.25
-requests==2.32.3
-rsa==4.9
-s3transfer==0.10.1
-safetensors==0.4.1
-shapely==2.0.3
-six==1.16.0
-sniffio==1.3.1
-soupsieve==2.5
-starlette==0.37.2
-sse-starlette==2.1.2
+PyPDF2==3.0.1
+PyMuPDF==1.24.14
+starlette==0.41.3
+sse-starlette==2.1.3
 starlette-session==0.4.3
-sympy==1.12
-tabulate==0.9.0
-tenacity==8.2.3
-tiktoken==0.7.0
-timm==0.9.12
-tokenizers==0.19
-tqdm==4.66.2
-transformers==4.42.3
-types-protobuf
-types-requests
-typing-inspect==0.9.0
-typing_extensions==4.12.2
-tzdata==2024.1
-unstructured==0.14.9
-unstructured-client==0.23.8
-unstructured-inference==0.7.36
-unstructured.pytesseract==0.3.12
-unstructured[all-docs]==0.14.9
+tqdm==4.67.1
+unstructured[all-docs]==0.16.6
 urllib3==2.2.2
-uvicorn==0.30.1
-gunicorn==22.0.0
+uvicorn==0.32.1
+gunicorn==23.0.0
 wikipedia==1.4.0
 wrapt==1.16.0
 yarl==1.9.4
-youtube-transcript-api==0.6.2
+youtube-transcript-api==0.6.3
 zipp==3.17.0
-sentence-transformers==3.0.1
-google-cloud-logging==3.10.0
-PyMuPDF==1.24.5
+sentence-transformers==3.3.1
+google-cloud-logging==3.11.3
 pypandoc==1.13
-graphdatascience==1.10
+graphdatascience==1.12
 Secweb==1.11.0
-ragas==0.2.2
+ragas==0.2.6
 rouge_score==0.1.2
+langchain-neo4j==0.1.1
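
The commit message's switch from sentence-transformers to HuggingFace embeddings pairs with the new langchain-huggingface==0.1.2 pin above. A minimal sketch of the replacement class; the model name here is an illustrative assumption, not taken from this diff:

from langchain_huggingface import HuggingFaceEmbeddings

# HuggingFaceEmbeddings wraps sentence-transformers under the hood.
# "all-MiniLM-L6-v2" is a placeholder model, not read from this repo.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector = embeddings.embed_query("knowledge graphs from unstructured text")
print(len(vector))  # embedding dimensionality; 384 for this model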

backend/score.py

Lines changed: 3 additions & 3 deletions
@@ -36,6 +36,7 @@
 from src.ragas_eval import *
 from starlette.types import ASGIApp, Message, Receive, Scope, Send
 import gzip
+from langchain_neo4j import Neo4jGraph
 
 logger = CustomLogger()
 CHUNK_DIR = os.path.join(os.path.dirname(__file__), "chunks")
@@ -587,7 +588,6 @@ async def generate():
         graph = create_graph_database_connection(uri, userName, decoded_password, database)
         graphDb_data_Access = graphDBdataAccess(graph)
         result = graphDb_data_Access.get_current_status_document_node(file_name)
-        # print(f'Result of document status in SSE : {result}')
         if len(result) > 0:
             status = json.dumps({'fileName':file_name,
                 'status':result[0]['Status'],
@@ -674,7 +674,7 @@ async def get_document_status(file_name, url, userName, password, database):
             }
         else:
             status = {'fileName':file_name, 'status':'Failed'}
-        print(f'Result of document status in refresh : {result}')
+        logging.info(f'Result of document status in refresh : {result}')
         return create_api_response('Success',message="",file_name=status)
     except Exception as e:
         message=f"Unable to get the document status"
@@ -967,7 +967,7 @@ async def fetch_chunktext(
 async def backend_connection_configuation():
     try:
         graph = Neo4jGraph()
-        print(f'login connection status of object: {graph}')
+        logging.info(f'login connection status of object: {graph}')
         if graph is not None:
             graph_connection = True
         isURI = os.getenv('NEO4J_URI')
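
One caveat with the print-to-logging migration shown above: the standard library discards INFO records under its default WARNING root level, so these messages only appear once logging is configured. A minimal sketch of such a setup (assumed for illustration; the repo uses its own CustomLogger):

import logging

# Without this (or an equivalent handler installed by CustomLogger),
# every logging.info(...) call added in this commit is silently dropped.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)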

backend/src/create_chunks.py

Lines changed: 1 addition & 5 deletions
@@ -1,8 +1,7 @@
 from langchain_text_splitters import TokenTextSplitter
 from langchain.docstore.document import Document
-from langchain_community.graphs import Neo4jGraph
+from langchain_neo4j import Neo4jGraph
 import logging
-import os
 from src.document_sources.youtube import get_chunks_with_timestamps, get_calculated_timestamps
 import re
 
@@ -25,9 +24,6 @@ def split_file_into_chunks(self):
         A list of chunks each of which is a langchain Document.
         """
         logging.info("Split file into smaller chunks")
-        full_document = Document(
-            page_content = self.pages_content
-        )
         text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
         if 'page' in self.pages[0].metadata:
             chunks = []
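
For reference, TokenTextSplitter splits on token counts rather than characters, so chunk boundaries track model tokenization. A standalone sketch with the same parameters as split_file_into_chunks (the input text is invented):

from langchain_text_splitters import TokenTextSplitter
from langchain.docstore.document import Document

# ~200-token chunks with a 20-token overlap, matching the diff above;
# the overlap keeps sentence fragments from being lost at boundaries.
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=20)
pages = [Document(page_content="some long extracted page text ... " * 100)]
chunks = text_splitter.split_documents(pages)
print(len(chunks), chunks[0].page_content[:50])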

backend/src/diffbot_transformer.py

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
 from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
-from langchain_community.graphs import Neo4jGraph
+#from langchain_community.graphs import Neo4jGraph
+from langchain_neo4j import Neo4jGraph
 from langchain.docstore.document import Document
 from typing import List
 import os
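
langchain_neo4j.Neo4jGraph is a drop-in replacement with the same constructor surface as the community class it replaces. A connection sketch with placeholder credentials (the default values are assumptions, not from this repo):

import os
from langchain_neo4j import Neo4jGraph

graph = Neo4jGraph(
    url=os.getenv("NEO4J_URI", "bolt://localhost:7687"),  # placeholder
    username=os.getenv("NEO4J_USERNAME", "neo4j"),        # placeholder
    password=os.getenv("NEO4J_PASSWORD", "password"),     # placeholder
)
print(graph.query("RETURN 1 AS ok"))  # simple round-trip check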

backend/src/document_sources/gcs_bucket.py

Lines changed: 1 addition & 4 deletions
@@ -101,15 +101,12 @@ def merge_file_gcs(bucket_name, original_file_name: str, folder_name_sha1_hashed
     try:
         storage_client = storage.Client()
         bucket = storage_client.bucket(bucket_name)
-        # Retrieve chunks from GCS
-        # blobs = storage_client.list_blobs(bucket_name, prefix=folder_name_sha1_hashed)
-        # print(f'before sorted blobs: {blobs}')
         chunks = []
         for i in range(1,total_chunks+1):
             blob_name = folder_name_sha1_hashed + '/' + f"{original_file_name}_part_{i}"
             blob = bucket.blob(blob_name)
             if blob.exists():
-                print(f'Blob Name: {blob.name}')
+                logging.info(f'Blob Name: {blob.name}')
                 chunks.append(blob.download_as_bytes())
                 blob.delete()
 

backend/src/document_sources/local_file.py

Lines changed: 0 additions & 2 deletions
@@ -20,10 +20,8 @@
 
 def load_document_content(file_path):
     if Path(file_path).suffix.lower() == '.pdf':
-        print("in if")
         return PyMuPDFLoader(file_path)
     else:
-        print("in else")
         return UnstructuredFileLoader(file_path, mode="elements",autodetect_encoding=True)
 
 def get_documents_from_file_by_path(file_path,file_name):
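
A quick usage sketch of the two loader branches in load_document_content (file paths are hypothetical):

from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredFileLoader

# PDFs go through PyMuPDF; everything else through unstructured's
# element-mode loader, matching the branch above.
pdf_docs = PyMuPDFLoader("report.pdf").load()  # hypothetical path
other_docs = UnstructuredFileLoader(
    "notes.txt",                               # hypothetical path
    mode="elements",
    autodetect_encoding=True,
).load()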

backend/src/document_sources/youtube.py

Lines changed: 0 additions & 37 deletions
@@ -1,25 +1,16 @@
-from pathlib import Path
 from langchain.docstore.document import Document
-from langchain_community.document_loaders import YoutubeLoader
-from pytube import YouTube
 from youtube_transcript_api import YouTubeTranscriptApi
 import logging
 from urllib.parse import urlparse,parse_qs
 from difflib import SequenceMatcher
 from datetime import timedelta
-from langchain_community.document_loaders.youtube import TranscriptFormat
 from src.shared.constants import YOUTUBE_CHUNK_SIZE_SECONDS
 from typing import List, Dict, Any
 import os
 import re
-from langchain_community.document_loaders import GoogleApiClient, GoogleApiYoutubeLoader
 
 def get_youtube_transcript(youtube_id):
     try:
-        #transcript = YouTubeTranscriptApi.get_transcript(youtube_id)
-        # transcript_list = YouTubeTranscriptApi.list_transcripts(youtube_id)
-        # transcript = transcript_list.find_transcript(["en"])
-        # transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
         proxy = os.environ.get("YOUTUBE_TRANSCRIPT_PROXY")
         proxies = { 'https': proxy }
         transcript_pieces = YouTubeTranscriptApi.get_transcript(youtube_id, proxies = proxies)
@@ -28,15 +19,6 @@ def get_youtube_transcript(youtube_id):
         message = f"Youtube transcript is not available for youtube Id: {youtube_id}"
         raise Exception(message)
 
-# def get_youtube_combined_transcript(youtube_id):
-#     try:
-#         transcript_dict = get_youtube_transcript(youtube_id)
-#         transcript = YouTubeTranscriptApi.get_transcript(youtube_id)
-#         return transcript
-#     except Exception as e:
-#         message = f"Youtube transcript is not available for youtube Id: {youtube_id}"
-#         raise Exception(message)
-
 def get_youtube_combined_transcript(youtube_id):
     try:
         transcript_dict = get_youtube_transcript(youtube_id)
@@ -64,25 +46,6 @@ def create_youtube_url(url):
 def get_documents_from_youtube(url):
     try:
         match = re.search(r'(?:v=)([0-9A-Za-z_-]{11})\s*',url)
-        # youtube_loader = YoutubeLoader.from_youtube_url(url,
-        #     language=["en-US", "en-gb", "en-ca", "en-au","zh-CN", "zh-Hans", "zh-TW", "fr-FR","de-DE","it-IT","ja-JP","pt-BR","ru-RU","es-ES"],
-        #     translation = "en",
-        #     add_video_info=True,
-        #     transcript_format=TranscriptFormat.CHUNKS,
-        #     chunk_size_seconds=YOUTUBE_CHUNK_SIZE_SECONDS)
-        # video_id = parse_qs(urlparse(url).query).get('v')
-        # cred_path = os.path.join(os.getcwd(),"llm-experiments_credentials.json")
-        # print(f'Credential file path on youtube.py {cred_path}')
-        # google_api_client = GoogleApiClient(service_account_path=Path(cred_path))
-        # youtube_loader_channel = GoogleApiYoutubeLoader(
-        #     google_api_client=google_api_client,
-        #     video_ids=[video_id[0].strip()], add_video_info=True
-        # )
-        # youtube_transcript = youtube_loader_channel.load()
-        # pages = youtube_loader.load()
-        # print(f'youtube page_content: {youtube_transcript[0].page_content}')
-        # print(f'youtube id: {youtube_transcript[0].metadata["id"]}')
-        # print(f'youtube title: {youtube_transcript[0].metadata["snippet"]["title"]}')
         transcript= get_youtube_transcript(match.group(1))
         transcript_content=''
         counter = YOUTUBE_CHUNK_SIZE_SECONDS
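
The surviving transcript path calls youtube-transcript-api directly. A minimal sketch of that fetch, mirroring get_youtube_transcript above (the video ID is a placeholder, and the proxy is optional):

import os
from youtube_transcript_api import YouTubeTranscriptApi

# Optional HTTPS proxy, as read from the environment in the diff above.
proxy = os.environ.get("YOUTUBE_TRANSCRIPT_PROXY")
proxies = {"https": proxy} if proxy else None

# "dQw4w9WgXcQ" is an illustrative video ID, not taken from this repo.
pieces = YouTubeTranscriptApi.get_transcript("dQw4w9WgXcQ", proxies=proxies)
for piece in pieces[:3]:
    print(piece["start"], piece["duration"], piece["text"])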
