11import os
2-
32import gc
43import tempfile
54import uuid
65import pandas as pd
6+ from typing import Optional , Dict , Any
7+ import logging
78
89from gitingest import ingest
9-
10- from llama_index .core import Settings
11- from llama_index .core import PromptTemplate
12- from llama_index .core import VectorStoreIndex , SimpleDirectoryReader
10+ from llama_index .core import Settings , PromptTemplate , VectorStoreIndex , SimpleDirectoryReader
1311from llama_index .core .node_parser import MarkdownNodeParser
14-
1512import streamlit as st
16-
1713from dotenv import load_dotenv
1814
# Configure logging once at import time; the module-level logger is shared by
# every helper in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables from a local .env file, if present.
load_dotenv()

# Constants
MAX_REPO_SIZE = 100 * 1024 * 1024  # 100MB
SUPPORTED_REPO_TYPES = ['.py', '.md', '.ipynb', '.js', '.ts', '.json']

class GitHubRAGError(Exception):
    """Application-level error for the GitHub RAG app.

    The helpers in this module raise it with a user-presentable message;
    the UI code catches it separately from unexpected exceptions and shows
    ``str(error)`` to the user.
    """
28+
def validate_github_url(url: str) -> bool:
    """Return True when *url* looks like a GitHub repository URL.

    A URL is accepted when its scheme is http or https, its host is
    ``github.com`` (optionally ``www.github.com``, case-insensitive) and its
    path contains at least an owner and a repository segment.

    Args:
        url: The URL typed in by the user.

    Returns:
        True if the URL passes the checks above, False otherwise (including
        for non-string or unparsable input).
    """
    # Local import so this function does not depend on the file-level imports.
    from urllib.parse import urlparse

    if not isinstance(url, str):
        return False

    try:
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse raises ValueError on malformed netlocs such as "http://[".
        return False

    if parsed.scheme.lower() not in ("http", "https"):
        return False
    if host not in ("github.com", "www.github.com"):
        return False

    # A bare "https://github.com/" or an owner-only URL names no repository.
    segments = [segment for segment in parsed.path.split("/") if segment]
    return len(segments) >= 2
32+
def get_repo_name(url: str) -> str:
    """Extract the repository name (last path segment) from *url*.

    Surrounding whitespace and trailing slashes are ignored, and a single
    trailing ``.git`` suffix is removed. A ``.git`` occurring elsewhere in the
    name (e.g. ``user.github.io``) is left intact.

    Args:
        url: Repository URL; an empty string yields an empty name.

    Returns:
        The repository name, possibly empty.

    Raises:
        GitHubRAGError: If *url* is not a string.
    """
    try:
        name = url.strip().rstrip('/').split('/')[-1]
    except (AttributeError, TypeError) as e:
        raise GitHubRAGError(f"Invalid repository URL: {e}") from e

    # Strip only a *suffix*; str.replace would also mangle "user.github.io".
    suffix = '.git'
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    return name
2739
def reset_chat():
    """Reset the chat session and release memory.

    Clears ``st.session_state.messages``, sets ``st.session_state.context``
    to None and runs a garbage-collection pass. Used as the ``on_click``
    callback of the "Clear Chat" button.

    Raises:
        GitHubRAGError: If the session state could not be reset; the original
            exception is attached as ``__cause__``.
    """
    try:
        st.session_state.messages = []
        st.session_state.context = None
        gc.collect()
    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("Error resetting chat: %s", e)
        raise GitHubRAGError("Failed to reset chat session") from e
    else:
        logger.info("Chat session reset successfully")

50+
def process_with_gitingets(github_url: str) -> tuple:
    """Ingest a GitHub repository with gitingest.

    Args:
        github_url: URL of the repository, passed straight to ``ingest``.

    Returns:
        The ``(summary, tree, content)`` triple produced by ``ingest``.

    Raises:
        GitHubRAGError: If ``ingest`` raises, or if any of the three returned
            values is empty.
    """
    try:
        summary, tree, content = ingest(github_url)
    except Exception as e:
        logger.exception("Error processing repository: %s", e)
        raise GitHubRAGError(f"Failed to process repository: {e}") from e

    # Checked outside the try so this error is not caught and re-wrapped,
    # which would double the "Failed to process repository" prefix.
    if not all([summary, tree, content]):
        logger.error("Error processing repository: Missing data")
        raise GitHubRAGError("Failed to process repository: Missing data")

    return summary, tree, content

61+
def create_query_engine(content_path: str, repo_name: str, tree: Optional[str] = None) -> Any:
    """Build a streaming query engine over the files in *content_path*.

    Loads every file in the directory with ``SimpleDirectoryReader``, indexes
    the documents with a ``MarkdownNodeParser`` transformation and installs a
    repository-specific QA prompt on the resulting query engine.

    Args:
        content_path: Directory containing the ingested repository content.
        repo_name: Name of the repository (currently unused; kept for
            interface compatibility).
        tree: Optional textual repository structure substituted for the
            ``{tree}`` placeholder of the prompt. When omitted, a short
            placeholder text is used so the template variable is always bound.

    Returns:
        The configured query engine (``streaming=True``).

    Raises:
        GitHubRAGError: If loading, indexing or engine configuration fails.
    """
    try:
        docs = SimpleDirectoryReader(input_dir=content_path).load_data()
        index = VectorStoreIndex.from_documents(
            documents=docs,
            transformations=[MarkdownNodeParser()],
            show_progress=True,
        )

        # Template lines are flush-left so no code indentation leaks into the prompt.
        qa_prompt_tmpl_str = """
You are an AI assistant specialized in analyzing GitHub repositories.

Repository structure:
{tree}
---------------------

Context information from the repository:
{context_str}
---------------------

Given the repository structure and context above, provide a clear and precise answer to the query.
Focus on the repository's content, code structure, and implementation details.
If the information is not available in the context, respond with 'I don't have enough information about that aspect of the repository.'

Query: {query_str}
Answer: """

        # The synthesizer only supplies context_str and query_str, so {tree}
        # must be bound up front or it stays unresolved at query time.
        qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str).partial_format(
            tree=tree or "(repository structure not provided)"
        )

        query_engine = index.as_query_engine(streaming=True)
        query_engine.update_prompts(
            {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
        )
        return query_engine
    except Exception as e:
        logger.exception("Error creating query engine: %s", e)
        raise GitHubRAGError(f"Failed to create query engine: {e}") from e

101+
# Initialize session state. Each key is checked on its own so a partially
# populated session (e.g. "id" present but "messages" missing) is repaired
# instead of failing later in the chat-history loop.
if "id" not in st.session_state:
    st.session_state.id = uuid.uuid4()
for _state_key, _state_factory in (("file_cache", dict), ("messages", list)):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_factory()
if "context" not in st.session_state:
    # reset_chat() writes this key; give it a defined value from the start.
    st.session_state.context = None

session_id = st.session_state.id
38109
# Sidebar: collect a repository URL, ingest it, and cache a query engine for it.
with st.sidebar:
    st.header("Add your GitHub repository!")

    github_url = st.text_input(
        "Enter GitHub repository URL",
        placeholder="https://github.com/username/repo",
        help="Enter a valid GitHub repository URL",
    )

    load_repo = st.button("Load Repository", type="primary")

    if github_url and load_repo:
        # Validate URL
        if not validate_github_url(github_url):
            st.error("Please enter a valid GitHub repository URL")
            st.stop()

        # st.stop() is deliberately called outside the try block so the
        # handlers below never interfere with Streamlit's control flow.
        load_failed = False
        try:
            repo_name = get_repo_name(github_url)
            # NOTE(review): the key uses only the repository name, so two
            # repositories with the same name under different owners share
            # one cache entry within a session.
            file_key = f"{session_id}-{repo_name}"

            if file_key in st.session_state.file_cache:
                st.info("Repository already loaded. Ready to chat!")
            else:
                with st.spinner("Processing your repository..."):
                    with tempfile.TemporaryDirectory() as temp_dir:
                        _summary, _tree, content = process_with_gitingets(github_url)

                        # Write content to temporary file
                        content_path = os.path.join(temp_dir, f"{repo_name}_content.md")
                        with open(content_path, "w", encoding="utf-8") as f:
                            f.write(content)

                        # Create and cache query engine
                        query_engine = create_query_engine(temp_dir, repo_name)
                        st.session_state.file_cache[file_key] = query_engine

                st.success("Repository loaded successfully! Ready to chat.")
                logger.info(f"Successfully processed repository: {repo_name}")

        except GitHubRAGError as e:
            st.error(str(e))
            logger.error(f"Error processing repository {github_url}: {str(e)}")
            load_failed = True
        except Exception as e:
            st.error("An unexpected error occurred while processing the repository")
            logger.exception(f"Unexpected error: {str(e)}")
            load_failed = True

        if load_failed:
            st.stop()

108165
# Main content: page header with a "clear" button beside it.
col1, col2 = st.columns([6, 1])

with col1:
    st.header("Chat with GitHub using RAG </>")

with col2:
    st.button("Clear Chat ↺", on_click=reset_chat, help="Clear chat history and reset session")

# Display chat history. Streamlit re-runs the script on every interaction, so
# earlier messages are replayed here; .get() tolerates a session in which the
# "messages" key has not been created yet.
for message in st.session_state.get("messages", []):
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Chat input: record the user's message, query the cached engine for the
# repository named in the sidebar, and stream the answer into a placeholder.
if prompt := st.chat_input("What's up?"):
    try:
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})

        # Display user message
        with st.chat_message("user"):
            st.markdown(prompt)

        # Process and display assistant response
        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            full_response = ""

            try:
                repo_name = get_repo_name(github_url)
                file_key = f"{session_id}-{repo_name}"
                query_engine = st.session_state.file_cache.get(file_key)

                if query_engine is None:
                    raise GitHubRAGError("Please load a repository first!")

                response = query_engine.query(prompt)

                if hasattr(response, 'response_gen'):
                    # Streaming response: re-render with a cursor after each chunk.
                    for chunk in response.response_gen:
                        if isinstance(chunk, str):
                            full_response += chunk
                            message_placeholder.markdown(full_response + "▌")
                else:
                    full_response = str(response)

                # Single final render (also removes the streaming cursor).
                message_placeholder.markdown(full_response)
                st.session_state.messages.append({"role": "assistant", "content": full_response})

            except GitHubRAGError as e:
                st.error(str(e))
                logger.error(f"Error in chat processing: {str(e)}")
            except Exception as e:
                # If streaming failed part-way, drop the trailing cursor from
                # whatever was already rendered.
                if full_response:
                    message_placeholder.markdown(full_response)
                st.error("An unexpected error occurred while processing your query")
                logger.exception(f"Unexpected error in chat: {str(e)}")

    except Exception as e:
        st.error("An error occurred in the chat system")
        logger.exception(f"Chat system error: {str(e)}")
0 commit comments