Skip to content

Commit 07f2d33

Browse files
authored
Merge pull request #109 from chojuninengu/improve-error-handling
Refactor GitHub RAG application: added URL validation, improved error…
2 parents b6eead9 + f21cdcd commit 07f2d33

File tree

1 file changed

+184
-132
lines changed

1 file changed

+184
-132
lines changed

github-rag/app.py

Lines changed: 184 additions & 132 deletions
Original file line numberDiff line numberDiff line change
@@ -1,174 +1,226 @@
11
import os
2-
32
import gc
43
import tempfile
54
import uuid
65
import pandas as pd
6+
from typing import Optional, Dict, Any
7+
import logging
78

89
from gitingest import ingest
9-
10-
from llama_index.core import Settings
11-
from llama_index.core import PromptTemplate
12-
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
10+
from llama_index.core import Settings, PromptTemplate, VectorStoreIndex, SimpleDirectoryReader
1311
from llama_index.core.node_parser import MarkdownNodeParser
14-
1512
import streamlit as st
16-
1713
from dotenv import load_dotenv
1814

15+
# Configure logging
16+
logging.basicConfig(level=logging.INFO)
17+
logger = logging.getLogger(__name__)
18+
1919
load_dotenv()
2020

21-
if "id" not in st.session_state:
22-
st.session_state.id = uuid.uuid4()
23-
st.session_state.file_cache = {}
21+
# Constants
22+
MAX_REPO_SIZE = 100 * 1024 * 1024 # 100MB
23+
SUPPORTED_REPO_TYPES = ['.py', '.md', '.ipynb', '.js', '.ts', '.json']
2424

25-
session_id = st.session_state.id
26-
client = None
25+
class GitHubRAGError(Exception):
    """Application-level error for the GitHub RAG app.

    Raised by the helper functions in this module so that callers can
    tell expected, user-reportable failures apart from unexpected ones.
    """
28+
29+
def validate_github_url(url: str) -> bool:
    """Return True when *url* looks like a GitHub repository URL.

    The URL must start with ``https://github.com/`` or
    ``http://github.com/`` and its path must contain at least an owner
    and a repository segment. A bare ``https://github.com/`` or
    ``https://github.com/<owner>`` is rejected because it does not name
    a repository.

    Args:
        url: The URL text entered by the user.

    Returns:
        True if the URL has an accepted prefix and an owner/repo path,
        False otherwise (including when *url* is not a string).
    """
    if not isinstance(url, str):
        return False

    for prefix in ('https://github.com/', 'http://github.com/'):
        if url.startswith(prefix):
            remainder = url[len(prefix):]
            break
    else:
        return False

    # Ignore any query string or fragment when counting path segments.
    path = remainder.split('?', 1)[0].split('#', 1)[0]
    segments = [segment for segment in path.split('/') if segment]
    return len(segments) >= 2
32+
33+
def get_repo_name(url: str) -> str:
    """Extract the repository name from a repository URL.

    The name is the last non-empty path segment of *url*. A trailing
    ``/`` is ignored, and a ``.git`` suffix is removed only when it is
    at the very end of the name, so names that merely contain ``.git``
    (for example ``user.github.io``) are preserved.

    Args:
        url: Repository URL, e.g. ``https://github.com/owner/repo.git``.

    Returns:
        The repository name, or an empty string when *url* has no
        path segment to extract (e.g. an empty string).

    Raises:
        GitHubRAGError: If *url* is not a string-like object.
    """
    try:
        name = url.rstrip('/').split('/')[-1]
    except AttributeError as e:
        raise GitHubRAGError(f"Invalid repository URL: {str(e)}") from e

    suffix = '.git'
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    return name
2739

2840
def reset_chat():
    """Reset the chat session and clean up resources.

    Clears ``st.session_state.messages``, sets
    ``st.session_state.context`` to ``None`` and triggers a garbage
    collection pass.

    Raises:
        GitHubRAGError: If any of the reset steps fails; the original
            exception is attached as the cause.
    """
    try:
        st.session_state.messages = []
        st.session_state.context = None
        gc.collect()
        logger.info("Chat session reset successfully")
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception(f"Error resetting chat: {str(e)}")
        raise GitHubRAGError("Failed to reset chat session") from e
50+
51+
def process_with_gitingets(github_url: str) -> tuple:
    """Process a GitHub repository using gitingest.

    Calls ``ingest(github_url)`` and checks that all three returned
    parts are non-empty.

    Args:
        github_url: URL of the repository to ingest.

    Returns:
        A ``(summary, tree, content)`` tuple as returned by ``ingest``.

    Raises:
        GitHubRAGError: If ``ingest`` raises (the original exception is
            attached as the cause), or if any returned part is empty.
    """
    try:
        summary, tree, content = ingest(github_url)
    except Exception as e:
        logger.error(f"Error processing repository: {str(e)}")
        raise GitHubRAGError(f"Failed to process repository: {str(e)}") from e

    # Checked outside the try block so this error is not caught and
    # re-wrapped with a duplicated "Failed to process repository:" prefix.
    if not all([summary, tree, content]):
        logger.error("Error processing repository: Missing data")
        raise GitHubRAGError("Failed to process repository: Missing data")

    return summary, tree, content
61+
62+
def create_query_engine(content_path: str, repo_name: str, tree: str = "") -> Any:
    """Create and configure the streaming query engine for a repository.

    Loads every document found under *content_path*, parses it with
    ``MarkdownNodeParser``, builds a ``VectorStoreIndex`` and returns a
    streaming query engine whose text-QA prompt is replaced with the
    repository-specific template below.

    Args:
        content_path: Directory passed to ``SimpleDirectoryReader`` as
            ``input_dir``.
        repo_name: Name of the repository. Currently unused; kept so
            existing callers keep working.
        tree: Repository structure text substituted for the ``{tree}``
            placeholder of the prompt. Defaults to an empty string so
            the placeholder is always bound, even for callers that do
            not pass it.

    Returns:
        The configured query engine.

    Raises:
        GitHubRAGError: If loading, indexing or prompt configuration
            fails; the original exception is attached as the cause.
    """
    try:
        loader = SimpleDirectoryReader(input_dir=content_path)
        docs = loader.load_data()
        node_parser = MarkdownNodeParser()
        index = VectorStoreIndex.from_documents(
            documents=docs,
            transformations=[node_parser],
            show_progress=True
        )

        # Spelled out line by line so the template text does not depend
        # on the indentation of this function body.
        qa_prompt_tmpl_str = (
            "\n"
            "You are an AI assistant specialized in analyzing GitHub repositories.\n"
            "\n"
            "Repository structure:\n"
            "{tree}\n"
            "---------------------\n"
            "\n"
            "Context information from the repository:\n"
            "{context_str}\n"
            "---------------------\n"
            "\n"
            "Given the repository structure and context above, provide a clear and precise answer to the query.\n"
            "Focus on the repository's content, code structure, and implementation details.\n"
            "If the information is not available in the context, respond with 'I don't have enough information about that aspect of the repository.'\n"
            "\n"
            "Query: {query_str}\n"
            "Answer: "
        )

        # Bind {tree} now; {context_str} and {query_str} are left for the
        # response synthesizer to fill in at query time.
        qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str).partial_format(tree=tree)
        query_engine = index.as_query_engine(streaming=True)
        query_engine.update_prompts(
            {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
        )
        return query_engine
    except Exception as e:
        logger.error(f"Error creating query engine: {str(e)}")
        raise GitHubRAGError(f"Failed to create query engine: {str(e)}") from e
101+
102+
# Initialize session state
103+
if "id" not in st.session_state:
104+
st.session_state.id = uuid.uuid4()
105+
st.session_state.file_cache = {}
29106
st.session_state.messages = []
30-
st.session_state.context = None
31-
gc.collect()
32-
33-
def process_with_gitingets(github_url):
34-
# or from URL
35-
summary, tree, content = ingest(github_url)
36-
return summary, tree, content
37107

108+
session_id = st.session_state.id
38109

110+
# Sidebar
39111
with st.sidebar:
40-
st.header(f"Add your GitHub repository!")
112+
st.header("Add your GitHub repository!")
113+
114+
github_url = st.text_input(
115+
"Enter GitHub repository URL",
116+
placeholder="https://github.com/username/repo",
117+
help="Enter a valid GitHub repository URL"
118+
)
41119

42-
github_url = st.text_input("Enter GitHub repository URL", placeholder="GitHub URL")
43-
load_repo = st.button("Load Repository")
120+
load_repo = st.button("Load Repository", type="primary")
44121

45122
if github_url and load_repo:
46123
try:
47-
with tempfile.TemporaryDirectory() as temp_dir:
48-
st.write("Processing your repository...")
49-
repo_name = github_url.split('/')[-1]
50-
file_key = f"{session_id}-{repo_name}"
51-
52-
if file_key not in st.session_state.get('file_cache', {}):
53-
54-
if os.path.exists(temp_dir):
55-
summary, tree, content = process_with_gitingets(github_url)
56-
57-
# Write summary to a markdown file in temp directory
58-
content_path = os.path.join(temp_dir, f"{repo_name}_content.md")
59-
with open(content_path, "w", encoding="utf-8") as f:
60-
f.write(content)
61-
loader = SimpleDirectoryReader(
62-
input_dir=temp_dir,
63-
)
64-
else:
65-
st.error('Could not find the file you uploaded, please check again...')
66-
st.stop()
67-
68-
docs = loader.load_data()
69-
node_parser = MarkdownNodeParser()
70-
index = VectorStoreIndex.from_documents(documents=docs, transformations=[node_parser], show_progress=True)
71-
72-
# Create the query engine, where we use a cohere reranker on the fetched node
73-
query_engine = index.as_query_engine(streaming=True)
74-
75-
# ====== Customise prompt template ======
76-
qa_prompt_tmpl_str = """
77-
You are an AI assistant specialized in analyzing GitHub repositories.
78-
79-
Repository structure:
80-
{tree}
81-
---------------------
82-
83-
Context information from the repository:
84-
{context_str}
85-
---------------------
86-
87-
Given the repository structure and context above, provide a clear and precise answer to the query.
88-
Focus on the repository's content, code structure, and implementation details.
89-
If the information is not available in the context, respond with 'I don't have enough information about that aspect of the repository.'
90-
91-
Query: {query_str}
92-
Answer: """
93-
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)
94-
95-
query_engine.update_prompts(
96-
{"response_synthesizer:text_qa_template": qa_prompt_tmpl}
97-
)
98-
99-
st.session_state.file_cache[file_key] = query_engine
100-
else:
101-
query_engine = st.session_state.file_cache[file_key]
124+
# Validate URL
125+
if not validate_github_url(github_url):
126+
st.error("Please enter a valid GitHub repository URL")
127+
st.stop()
102128

103-
# Inform the user that the file is processed and Display the PDF uploaded
104-
st.success("Ready to Chat!")
129+
repo_name = get_repo_name(github_url)
130+
file_key = f"{session_id}-{repo_name}"
131+
132+
if file_key not in st.session_state.file_cache:
133+
with st.spinner("Processing your repository..."):
134+
with tempfile.TemporaryDirectory() as temp_dir:
135+
try:
136+
summary, tree, content = process_with_gitingets(github_url)
137+
138+
# Write content to temporary file
139+
content_path = os.path.join(temp_dir, f"{repo_name}_content.md")
140+
with open(content_path, "w", encoding="utf-8") as f:
141+
f.write(content)
142+
143+
# Create and cache query engine
144+
query_engine = create_query_engine(temp_dir, repo_name)
145+
st.session_state.file_cache[file_key] = query_engine
146+
147+
st.success("Repository loaded successfully! Ready to chat.")
148+
logger.info(f"Successfully processed repository: {repo_name}")
149+
150+
except GitHubRAGError as e:
151+
st.error(str(e))
152+
logger.error(f"Error processing repository {repo_name}: {str(e)}")
153+
st.stop()
154+
except Exception as e:
155+
st.error("An unexpected error occurred while processing the repository")
156+
logger.error(f"Unexpected error: {str(e)}")
157+
st.stop()
158+
else:
159+
st.info("Repository already loaded. Ready to chat!")
160+
105161
except Exception as e:
106-
st.error(f"An error occurred: {e}")
107-
st.stop()
162+
st.error(f"An error occurred: {str(e)}")
163+
logger.error(f"Error in repository loading process: {str(e)}")
164+
st.stop()
108165

166+
# Main content
109167
col1, col2 = st.columns([6, 1])
110168

111169
with col1:
112-
st.header(f"Chat with GitHub using RAG </>")
170+
st.header("Chat with GitHub using RAG </>")
113171

114172
with col2:
115-
st.button("Clear ↺", on_click=reset_chat)
116-
117-
# Initialize chat history
118-
if "messages" not in st.session_state:
119-
reset_chat()
173+
st.button("Clear Chat ↺", on_click=reset_chat, help="Clear chat history and reset session")
120174

121-
122-
# Display chat messages from history on app rerun
175+
# Display chat history
123176
for message in st.session_state.messages:
124177
with st.chat_message(message["role"]):
125178
st.markdown(message["content"])
126179

127-
128-
# Accept user input
180+
# Chat input
129181
if prompt := st.chat_input("What's up?"):
130-
# Add user message to chat history
131-
st.session_state.messages.append({"role": "user", "content": prompt})
132-
# Display user message in chat message container
133-
with st.chat_message("user"):
134-
st.markdown(prompt)
135-
136-
# Display assistant response in chat message container
137-
with st.chat_message("assistant"):
138-
message_placeholder = st.empty()
139-
full_response = ""
182+
try:
183+
# Add user message to chat history
184+
st.session_state.messages.append({"role": "user", "content": prompt})
140185

141-
try:
142-
# Get the repo name from the GitHub URL
143-
repo_name = github_url.split('/')[-1]
144-
file_key = f"{session_id}-{repo_name}"
145-
146-
# Get query engine from session state
147-
query_engine = st.session_state.file_cache.get(file_key)
186+
# Display user message
187+
with st.chat_message("user"):
188+
st.markdown(prompt)
189+
190+
# Process and display assistant response
191+
with st.chat_message("assistant"):
192+
message_placeholder = st.empty()
193+
full_response = ""
148194

149-
if query_engine is None:
150-
st.error("Please load a repository first!")
151-
st.stop()
195+
try:
196+
repo_name = get_repo_name(github_url)
197+
file_key = f"{session_id}-{repo_name}"
198+
query_engine = st.session_state.file_cache.get(file_key)
152199

153-
# Use the query engine
154-
response = query_engine.query(prompt)
155-
156-
# Handle streaming response
157-
if hasattr(response, 'response_gen'):
158-
for chunk in response.response_gen:
159-
if isinstance(chunk, str): # Only process string chunks
160-
full_response += chunk
161-
message_placeholder.markdown(full_response + "▌")
162-
else:
163-
# Handle non-streaming response
164-
full_response = str(response)
200+
if query_engine is None:
201+
raise GitHubRAGError("Please load a repository first!")
202+
203+
response = query_engine.query(prompt)
204+
205+
if hasattr(response, 'response_gen'):
206+
for chunk in response.response_gen:
207+
if isinstance(chunk, str):
208+
full_response += chunk
209+
message_placeholder.markdown(full_response + "▌")
210+
else:
211+
full_response = str(response)
212+
message_placeholder.markdown(full_response)
213+
165214
message_placeholder.markdown(full_response)
166-
167-
message_placeholder.markdown(full_response)
168-
except Exception as e:
169-
st.error(f"An error occurred while processing your query: {str(e)}")
170-
full_response = "Sorry, I encountered an error while processing your request."
171-
message_placeholder.markdown(full_response)
172-
173-
# Add assistant response to chat history
174-
st.session_state.messages.append({"role": "assistant", "content": full_response})
215+
st.session_state.messages.append({"role": "assistant", "content": full_response})
216+
217+
except GitHubRAGError as e:
218+
st.error(str(e))
219+
logger.error(f"Error in chat processing: {str(e)}")
220+
except Exception as e:
221+
st.error("An unexpected error occurred while processing your query")
222+
logger.error(f"Unexpected error in chat: {str(e)}")
223+
224+
except Exception as e:
225+
st.error("An error occurred in the chat system")
226+
logger.error(f"Chat system error: {str(e)}")

0 commit comments

Comments
 (0)