1+ from typing import Tuple , List
2+ import re
3+ import os
4+ import sys
5+
6+ import ebooklib
7+ from ebooklib import epub
8+ from bs4 import BeautifulSoup
9+
10+ import engine ._utils as utils
11+
12+ import langchain
13+ import langchain .schema
14+ from langchain_text_splitters import RecursiveCharacterTextSplitter
15+ from langchain_chroma import Chroma
16+ from langchain_community .chat_message_histories import ChatMessageHistory
17+ from langchain_openai import OpenAIEmbeddings , ChatOpenAI
18+ from langchain .chains import create_retrieval_chain , create_history_aware_retriever
19+ from langchain .chains .combine_documents import create_stuff_documents_chain
20+ from langchain_core .prompts import ChatPromptTemplate , MessagesPlaceholder
21+ from langchain_core .chat_history import BaseChatMessageHistory
22+ from langchain_core .runnables .history import RunnableWithMessageHistory
23+
24+
25+
def extract_text_from_epub(epub_path):
    """Read an EPUB file and return its visible text as one string.

    Args:
        epub_path: Filesystem path to the .epub file.

    Returns:
        The text of every document item in the book, joined by single spaces.
    """
    book = epub.read_epub(epub_path)
    pieces = []

    for item in book.get_items():
        # Only HTML document items carry readable book text; skip images, CSS, etc.
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        markup = item.get_content().decode('utf-8')
        soup = BeautifulSoup(markup, 'html.parser')
        pieces.append(soup.get_text(separator=' ', strip=True))

    return ' '.join(pieces)
43+
44+
def create_document_from_text(extracted_text: str) -> Tuple[langchain.schema.Document, int, int]:
    """Wrap extracted book text in a Document and derive chunking parameters.

    The chunk size is chosen so that one chunk covers roughly one chapter
    (based on counting "Chapter"/"Chap."/"Chap" headings); the overlap is
    10% of the chunk size.

    Args:
        extracted_text: Full plain text of the book.

    Returns:
        A tuple ``(doc, chunk_size, chunk_overlap)``.
    """
    doc = langchain.schema.Document(page_content=extracted_text, metadata={'source': 'from_epub_string'})
    # Count chapter headings, case-insensitively.
    chapters = re.findall(r'(?:Chapter|Chap\.|Chap)\s+(?:\w+\s+)*(\w+)', extracted_text, re.IGNORECASE)
    # Guard against books with no recognizable chapter headings (previously a
    # ZeroDivisionError) and against empty text (previously chunk_size == 0,
    # which the text splitter rejects): fall back to one chunk of at least 1 char.
    num_chapters = max(1, len(chapters))
    chunk_size = max(1, len(doc.page_content) // num_chapters)
    chunk_overlap = chunk_size // 10
    print(f'chunk_size: {chunk_size}, chunk_overlap: {chunk_overlap}')
    return doc, chunk_size, chunk_overlap
53+
54+
def get_all_splits(doc: langchain.schema.Document, chunk_size: int, chunk_overlap: int):
    """Split a single Document into overlapping chunks.

    Args:
        doc: The source document to split.
        chunk_size: Target number of characters per chunk.
        chunk_overlap: Number of characters shared by adjacent chunks.

    Returns:
        A list of chunk Documents, each annotated with its start index.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )
    return splitter.split_documents([doc])
59+
60+
def create_vectorstore(persist_dir: str, splits: List[langchain.schema.Document]):
    """Embed the document chunks and persist a new Chroma vector store.

    Fixes a bug where the caller-supplied ``persist_dir`` was immediately
    overwritten with a hard-coded 'saved_data/db', silently ignoring the
    configured persistence directory.

    Args:
        persist_dir: Directory where Chroma should persist its index
            (created if it does not exist).
        splits: Document chunks to embed with OpenAI embeddings.

    Returns:
        The populated Chroma vector store.
    """
    os.makedirs(persist_dir, exist_ok=True)
    return Chroma.from_documents(
        documents=splits,
        embedding=OpenAIEmbeddings(),
        persist_directory=persist_dir,
    )
66+
67+
def load_vectorstore(persist_dir: str):
    """Open an already-persisted Chroma store using OpenAI embeddings."""
    embeddings = OpenAIEmbeddings()
    return Chroma(persist_directory=persist_dir, embedding_function=embeddings)
70+
71+
def get_response(retriever, llm, question, store, session_id):
    """Answer one user question through a history-aware RAG chain.

    Builds, per call: (1) a retriever that first rewrites the latest
    question into a standalone form using the chat history, (2) a
    stuff-documents QA chain over the retrieved context, and (3) a
    message-history wrapper keyed by session id.

    Args:
        retriever: Vector-store retriever supplying context documents.
        llm: Chat model used both for query rewriting and for answering.
        question: The user's latest question.
        store: Mutable mapping of session_id -> ChatMessageHistory; grown
            in place as the conversation proceeds.
        session_id: Key identifying this conversation within ``store``.

    Returns:
        The chain's response dict; the generated text is under 'answer'.
    """
    # Step 1: rewrite the question so it stands alone without the history.
    contextualize_q_system_prompt = (
        "Given a chat history and the latest user question "
        "which might reference context in the chat history, "
        "formulate a standalone question which can be understood "
        "without the chat history. Do NOT answer the question, "
        "just reformulate it if needed and otherwise return it as is."
    )
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    history_aware_retriever = create_history_aware_retriever(
        llm, retriever, contextualize_q_prompt
    )

    # Step 2: answer from the retrieved context, keeping the history visible.
    system_prompt = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three to fifteen sentences maximum and keep the "
        "answer concise whenever possible."
        "\n\n"
        "{context}"
    )
    prompt = ChatPromptTemplate.from_messages(
        [
            ('system', system_prompt),
            MessagesPlaceholder('chat_history'),
            ('human', '{input}'),
        ]
    )

    question_answer_chain = create_stuff_documents_chain(llm, prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

    def get_session_history(session_id: str) -> BaseChatMessageHistory:
        # Lazily create an empty history the first time a session is seen.
        if session_id not in store:
            store[session_id] = ChatMessageHistory()
        return store[session_id]

    # Step 3: thread the per-session history through the chain.
    conversational_rag_chain = RunnableWithMessageHistory(
        rag_chain,
        get_session_history,
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer",
    )

    return conversational_rag_chain.invoke(
        {'input': question},
        config={'configurable': {'session_id': session_id}},
    )
143+
144+
def run_qa_pipeline(config):
    """Run an interactive question-answering loop over an EPUB book.

    Loads an existing persisted vector store, or builds and persists a new
    one from the EPUB, then repeatedly prompts the user for questions until
    'exit' is entered (which terminates the process).

    Args:
        config: Nested dict with keys ``config['epub']['epub_path']`` and
            ``config['vectorstore']['load' | 'persist_dir' | 'k']``.
    """
    epub_path = config['epub']['epub_path']
    if config['vectorstore']['load']:
        # Reusing the persisted index: skip the (previously unconditional and
        # wasted) text extraction and splitting of the whole book.
        vectorstore = load_vectorstore(config['vectorstore']['persist_dir'])
    else:
        extracted_text = extract_text_from_epub(epub_path)
        doc, chunk_size, chunk_overlap = create_document_from_text(extracted_text)
        all_splits = get_all_splits(doc, chunk_size, chunk_overlap)
        vectorstore = create_vectorstore(config['vectorstore']['persist_dir'], all_splits)

    retriever = vectorstore.as_retriever(
        search_type='similarity',
        search_kwargs={'k': config['vectorstore']['k']},
    )
    llm = ChatOpenAI(model='gpt-4o-mini')

    print(f'Enter your questions about your novel loaded from {epub_path}. Type "exit" to quit the program.')
    session_id = utils.generate_session_id()
    store = {}
    while True:
        question = input('Ask me a question: ').strip()
        if question.lower() == 'exit':
            print('terminating...')
            sys.exit(0)

        response = get_response(retriever, llm, question, store, session_id)
        answer = response['answer']
        print(f'Response:\n{answer}', end='\n\n')