Skip to content

Commit 5bf9dc2

Browse files
committed
File Upload
0 parents  commit 5bf9dc2

File tree

11 files changed

+1328
-0
lines changed

11 files changed

+1328
-0
lines changed

.gitignore

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
# Project specific
2+
saved_data/
3+
4+
# OS Specific
5+
.DS_Store
6+
7+
# Byte-compiled / optimized / DLL files
8+
__pycache__/
9+
*.py[cod]
10+
*$py.class
11+
12+
# C extensions
13+
*.so
14+
15+
# Distribution / packaging
16+
.Python
17+
build/
18+
develop-eggs/
19+
dist/
20+
downloads/
21+
eggs/
22+
.eggs/
23+
lib/
24+
lib64/
25+
parts/
26+
sdist/
27+
var/
28+
wheels/
29+
share/python-wheels/
30+
*.egg-info/
31+
.installed.cfg
32+
*.egg
33+
MANIFEST
34+
35+
# PyInstaller
36+
# Usually these files are written by a python script from a template
37+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
38+
*.manifest
39+
*.spec
40+
41+
# Installer logs
42+
pip-log.txt
43+
pip-delete-this-directory.txt
44+
45+
# Unit test / coverage reports
46+
htmlcov/
47+
.tox/
48+
.nox/
49+
.coverage
50+
.coverage.*
51+
.cache
52+
nosetests.xml
53+
coverage.xml
54+
*.cover
55+
*.py,cover
56+
.hypothesis/
57+
.pytest_cache/
58+
cover/
59+
60+
# Translations
61+
*.mo
62+
*.pot
63+
64+
# Django stuff:
65+
*.log
66+
local_settings.py
67+
db.sqlite3
68+
db.sqlite3-journal
69+
70+
# Flask stuff:
71+
instance/
72+
.webassets-cache
73+
74+
# Scrapy stuff:
75+
.scrapy
76+
77+
# Sphinx documentation
78+
docs/_build/
79+
80+
# PyBuilder
81+
.pybuilder/
82+
target/
83+
84+
# Jupyter Notebook
85+
.ipynb_checkpoints
86+
87+
# IPython
88+
profile_default/
89+
ipython_config.py
90+
91+
# pyenv
92+
# For a library or package, you might want to ignore these files since the code is
93+
# intended to run in multiple environments; otherwise, check them in:
94+
# .python-version
95+
96+
# pipenv
97+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
99+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
100+
# install all needed dependencies.
101+
#Pipfile.lock
102+
103+
# poetry
104+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105+
# This is especially recommended for binary packages to ensure reproducibility, and is more
106+
# commonly ignored for libraries.
107+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108+
#poetry.lock
109+
110+
# pdm
111+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112+
#pdm.lock
113+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114+
# in version control.
115+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116+
.pdm.toml
117+
.pdm-python
118+
.pdm-build/
119+
120+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121+
__pypackages__/
122+
123+
# Celery stuff
124+
celerybeat-schedule
125+
celerybeat.pid
126+
127+
# SageMath parsed files
128+
*.sage.py
129+
130+
# Environments
131+
.env
132+
.venv
133+
env/
134+
venv/
135+
ENV/
136+
env.bak/
137+
venv.bak/
138+
139+
# Spyder project settings
140+
.spyderproject
141+
.spyproject
142+
143+
# Rope project settings
144+
.ropeproject
145+
146+
# mkdocs documentation
147+
/site
148+
149+
# mypy
150+
.mypy_cache/
151+
.dmypy.json
152+
dmypy.json
153+
154+
# Pyre type checker
155+
.pyre/
156+
157+
# pytype static type analyzer
158+
.pytype/
159+
160+
# Cython debug symbols
161+
cython_debug/
162+
163+
# PyCharm
164+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166+
# and can be added to the global gitignore or merged into this file. For a more nuclear
167+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
168+
#.idea/

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# EPUB-RAG
2+
3+
A simple Question Answering (QA) application built using LangChain. Answers questions about plotlines, characters, events, and more for any ebook provided in .epub format. Useful for catching up on books left unread for a long time :)

configs/qa_config.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
epub:
2+
epub_path: 'data/mol.epub'
3+
4+
vectorstore:
5+
load: false
6+
persist_dir: 'saved_data/db'
7+
k: 6
8+
9+
api_keys:
10+
LANGCHAIN_API_KEY: {{ LANGCHAIN_API_KEY }}
11+
OPENAI_API_KEY: {{ OPENAI_API_KEY }}

data/mol.epub

1.02 MB
Binary file not shown.

data/pride_and_prejudice.epub

1.68 MB
Binary file not shown.

engine/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .text_qa import run_qa_pipeline

engine/_utils.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import random
2+
import string
3+
4+
def generate_session_id(length=6):
    """Return a random session identifier of *length* characters.

    The identifier is drawn uniformly from lowercase ASCII letters and
    digits ('a'-'z', '0'-'9').
    """
    alphabet = string.ascii_lowercase + string.digits
    return ''.join(random.choice(alphabet) for _ in range(length))

engine/text_qa.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
from typing import Tuple, List
2+
import re
3+
import os
4+
import sys
5+
6+
import ebooklib
7+
from ebooklib import epub
8+
from bs4 import BeautifulSoup
9+
10+
import engine._utils as utils
11+
12+
import langchain
13+
import langchain.schema
14+
from langchain_text_splitters import RecursiveCharacterTextSplitter
15+
from langchain_chroma import Chroma
16+
from langchain_community.chat_message_histories import ChatMessageHistory
17+
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
18+
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
19+
from langchain.chains.combine_documents import create_stuff_documents_chain
20+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
21+
from langchain_core.chat_history import BaseChatMessageHistory
22+
from langchain_core.runnables.history import RunnableWithMessageHistory
23+
24+
25+
26+
def extract_text_from_epub(epub_path):
    """Return the full plain text of the EPUB at *epub_path* as one string.

    Every document (XHTML) item in the book is decoded as UTF-8, parsed
    with BeautifulSoup, reduced to its visible text, and the per-item
    texts are joined with single spaces.
    """
    book = epub.read_epub(epub_path)
    item_texts = []

    for item in book.get_items():
        # Only document items carry the book's prose; skip images, CSS, etc.
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        markup = item.get_content().decode('utf-8')
        parsed = BeautifulSoup(markup, 'html.parser')
        item_texts.append(parsed.get_text(separator=' ', strip=True))

    # Single flat string; downstream chunking re-splits it as needed.
    return ' '.join(item_texts)
43+
44+
45+
def create_document_from_text(extracted_text: str) -> Tuple[langchain.schema.Document, int, int]:
    """Wrap *extracted_text* in a Document and derive chunking parameters.

    The chunk size is chosen heuristically so that each chunk spans roughly
    one chapter, based on counting 'Chapter'/'Chap.' headings in the text.

    Returns:
        (doc, chunk_size, chunk_overlap) where chunk_overlap is 10% of
        chunk_size.
    """
    doc = langchain.schema.Document(page_content=extracted_text, metadata={'source': 'from_epub_string'})
    chapters = re.findall(r'(?:Chapter|Chap\.|Chap)\s+(?:\w+\s+)*(\w+)', extracted_text, re.IGNORECASE)
    # Guard against books where no chapter heading matches; the original
    # code divided by len(chapters) and raised ZeroDivisionError here.
    num_chapters = max(len(chapters), 1)
    # Floor at 1 so the splitter never receives a zero chunk size
    # (possible for empty or near-empty input text).
    chunk_size = max(len(doc.page_content) // num_chapters, 1)
    chunk_overlap = chunk_size // 10
    print(f'chunk_size: {chunk_size}, chunk_overlap: {chunk_overlap}')
    return doc, chunk_size, chunk_overlap
53+
54+
55+
def get_all_splits(doc: langchain.schema.Document, chunk_size: int, chunk_overlap: int):
    """Split *doc* into overlapping chunks suitable for embedding/retrieval."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Record each chunk's character offset into the source document.
        add_start_index=True,
    )
    return splitter.split_documents([doc])
59+
60+
61+
def create_vectorstore(persist_dir: str, splits: List[str]):
    """Embed *splits* with OpenAI embeddings and persist a Chroma store.

    Args:
        persist_dir: Directory where the Chroma database is written
            (created if it does not exist).
        splits: Document chunks produced by the text splitter.

    Returns:
        The newly built Chroma vectorstore.
    """
    # Bug fix: the original body unconditionally reassigned
    # persist_dir = 'saved_data/db', silently ignoring the caller's
    # (config-supplied) directory. Honor the parameter instead.
    os.makedirs(persist_dir, exist_ok=True)
    vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(), persist_directory=persist_dir)
    return vectorstore
66+
67+
68+
def load_vectorstore(persist_dir: str):
    """Re-open a previously persisted Chroma vectorstore from *persist_dir*."""
    embeddings = OpenAIEmbeddings()
    return Chroma(persist_directory=persist_dir, embedding_function=embeddings)
70+
71+
72+
def get_response(retriever, llm, question, store, session_id):
    """Answer *question* via a history-aware RAG chain and return the response.

    Builds a two-stage chain: (1) a history-aware retriever that first
    rewrites the question into a standalone form using the chat history,
    then retrieves context; (2) a stuff-documents QA chain that answers
    from the retrieved context. Chat history is kept per *session_id*
    in the caller-owned *store* dict, so history persists across calls.

    Args:
        retriever: Vectorstore retriever for context lookup.
        llm: Chat model used both for question rewriting and answering.
        question: The user's raw question text.
        store: Mutable dict mapping session_id -> ChatMessageHistory;
            updated in place as a side effect.
        session_id: Key identifying this conversation in *store*.

    Returns:
        The chain's response dict (the answer is under the 'answer' key).
    """
    # Contextualization: rewrite the latest question so it stands alone.
    contextualize_q_system_prompt = (
        "Given a chat history and the latest user question "
        "which might reference context in the chat history, "
        "formulate a standalone question which can be understood "
        "without the chat history. Do NOT answer the question, "
        "just reformulate it if needed and otherwise return it as is."
    )
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, contextualize_q_prompt
    )

    system_prompt = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three to fifteen sentences maximum and keep the "
        "answer concise whenever possible."
        "\n\n"
        "{context}"
    )
    prompt = ChatPromptTemplate.from_messages(
        [
            ('system', system_prompt),
            MessagesPlaceholder('chat_history'),
            ('human', '{input}')
        ]
    )

    question_answer_chain = create_stuff_documents_chain(llm, prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

    def get_session_history(session_id: str) -> BaseChatMessageHistory:
        # Lazily create one message history per session in the shared store.
        if session_id not in store:
            store[session_id] = ChatMessageHistory()
        return store[session_id]

    conversational_rag_chain = RunnableWithMessageHistory(
        rag_chain,
        get_session_history,
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer",
    )

    response = conversational_rag_chain.invoke(
        {'input': question},
        config={
            'configurable': {'session_id': session_id}
        }
    )
    return response
143+
144+
145+
def run_qa_pipeline(config):
    """Run the interactive EPUB question-answering loop.

    Obtains a vectorstore — either reloaded from disk or freshly built by
    parsing, chunking, and embedding the configured EPUB — then reads
    questions from stdin until the user types "exit" (terminates via
    sys.exit(0)).

    Args:
        config: Dict with 'epub' (epub_path) and 'vectorstore'
            (load, persist_dir, k) sections.
    """
    epub_path = config['epub']['epub_path']

    if config['vectorstore']['load']:
        # Reuse the persisted store; the original code still parsed and
        # chunked the whole EPUB here even though the splits were unused.
        vectorstore = load_vectorstore(config['vectorstore']['persist_dir'])
    else:
        extracted_text = extract_text_from_epub(epub_path)
        doc, chunk_size, chunk_overlap = create_document_from_text(extracted_text)
        all_splits = get_all_splits(doc, chunk_size, chunk_overlap)
        vectorstore = create_vectorstore(config['vectorstore']['persist_dir'], all_splits)

    retriever = vectorstore.as_retriever(search_type='similarity', search_kwargs={'k': config['vectorstore']['k']})
    llm = ChatOpenAI(model='gpt-4o-mini')

    print(f'Enter your questions about your novel loaded from {epub_path}. Type "exit" to quit the program.')
    session_id = utils.generate_session_id()
    store = {}  # per-session chat histories, mutated by get_response
    while True:
        question = input('Ask me a question: ').strip()
        if question.lower() == 'exit':
            print('terminating...')
            sys.exit(0)

        response = get_response(retriever, llm, question, store, session_id)
        answer = response['answer']
        print(f'Response:\n{answer}', end='\n\n')

0 commit comments

Comments
 (0)