Merge pull request #109 from chojuninengu/improve-error-handling

patchy631 · web-flow · commit 07f2d331095f · 2025-04-11T17:34:19.000+05:30
Refactor GitHub RAG application: added URL validation, improved error…
diff --git a/github-rag/app.py b/github-rag/app.py
@@ -1,174 +1,226 @@
 import os
-
 import gc
 import tempfile
 import uuid
 import pandas as pd
+from typing import Optional, Dict, Any
+import logging
 
 from gitingest import ingest
-
-from llama_index.core import Settings
-from llama_index.core import PromptTemplate
-from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
+from llama_index.core import Settings, PromptTemplate, VectorStoreIndex, SimpleDirectoryReader
 from llama_index.core.node_parser import MarkdownNodeParser
-
 import streamlit as st
-
 from dotenv import load_dotenv
 
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 load_dotenv()
 
-if "id" not in st.session_state:
-    st.session_state.id = uuid.uuid4()
-    st.session_state.file_cache = {}
+# Constants
+MAX_REPO_SIZE = 100 * 1024 * 1024  # 100MB
+SUPPORTED_REPO_TYPES = ['.py', '.md', '.ipynb', '.js', '.ts', '.json']
 
-session_id = st.session_state.id
-client = None
+class GitHubRAGError(Exception):
+    """Application-level error for the GitHub RAG app; handlers in this
+    file display ``str(error)`` to the user through ``st.error``."""
+
+def validate_github_url(url: str) -> bool:
+    """Return True if *url* looks like a GitHub repository URL.
+
+    The URL must start with ``http://github.com/`` or ``https://github.com/``
+    and its path must contain at least an owner and a repository segment, so
+    bare ``https://github.com/`` or ``https://github.com/owner`` are rejected
+    before any repository processing starts.
+    """
+    for prefix in ('https://github.com/', 'http://github.com/'):
+        if url.startswith(prefix):
+            # Drop any query string / fragment before counting path segments.
+            path = url[len(prefix):].split('?', 1)[0].split('#', 1)[0]
+            # Ignore empty segments so a trailing '/' does not count as a name.
+            segments = [part for part in path.split('/') if part]
+            return len(segments) >= 2
+    return False
+
+def get_repo_name(url: str) -> str:
+    """Extract repository name from URL.
+
+    The name is the last non-empty path segment, so a trailing '/' is
+    tolerated. A '.git' ending is stripped only when it is a suffix, which
+    keeps names such as 'my.github.io' intact.
+
+    Raises:
+        GitHubRAGError: if *url* does not support the string operations used
+            here (for example, it is None).
+    """
+    try:
+        name = url.rstrip('/').rsplit('/', 1)[-1]
+        return name[:-len('.git')] if name.endswith('.git') else name
+    except Exception as e:
+        raise GitHubRAGError(f"Invalid repository URL: {str(e)}") from e
 
 def reset_chat():
+    """Clear the chat history and stored context, then run a garbage collection.
+
+    Raises:
+        GitHubRAGError: if any step of the reset fails (the cause is logged).
+    """
+    try:
+        state = st.session_state
+        state.messages = []
+        state.context = None
+        gc.collect()
+        logger.info("Chat session reset successfully")
+    except Exception as err:
+        logger.error(f"Error resetting chat: {str(err)}")
+        raise GitHubRAGError("Failed to reset chat session")
+
+def process_with_gitingets(github_url: str) -> tuple:
+    """Process GitHub repository using gitingest.
+
+    Returns:
+        The ``(summary, tree, content)`` triple produced by ``ingest``.
+
+    Raises:
+        GitHubRAGError: if ``ingest`` raises, or if any element of the
+            returned triple is empty.
+    """
+    try:
+        summary, tree, content = ingest(github_url)
+    except Exception as e:
+        logger.error(f"Error processing repository: {str(e)}")
+        raise GitHubRAGError(f"Failed to process repository: {str(e)}") from e
+    # Checked outside the try so this error is not caught by the handler above
+    # and re-wrapped with a duplicated "Failed to process repository:" prefix.
+    if not all([summary, tree, content]):
+        logger.error("Error processing repository: Missing data")
+        raise GitHubRAGError("Failed to process repository: Missing data")
+    return summary, tree, content
+
+def create_query_engine(content_path: str, repo_name: str) -> Any:
+    """Create and configure query engine.
+
+    Loads the documents under ``content_path``, builds a ``VectorStoreIndex``
+    from them with a ``MarkdownNodeParser`` transformation, and returns a
+    streaming query engine whose text-QA prompt is replaced by the
+    repository-analysis template defined below.
+
+    Args:
+        content_path: Directory passed to ``SimpleDirectoryReader`` as
+            ``input_dir``.
+        repo_name: Not referenced anywhere in this function body.
+
+    Raises:
+        GitHubRAGError: any exception raised during loading, indexing, or
+            prompt setup is logged and re-raised as this type.
+    """
+    try:
+        loader = SimpleDirectoryReader(input_dir=content_path)
+        docs = loader.load_data()
+        node_parser = MarkdownNodeParser()
+        index = VectorStoreIndex.from_documents(
+            documents=docs, 
+            transformations=[node_parser], 
+            show_progress=True
+        )
+
+        # NOTE(review): the template references {tree}, but nothing in this
+        # function supplies a value for it — confirm how the prompt class
+        # treats an unfilled placeholder at query time.
+        qa_prompt_tmpl_str = """
+        You are an AI assistant specialized in analyzing GitHub repositories.
+
+        Repository structure:
+        {tree}
+        ---------------------
+
+        Context information from the repository:
+        {context_str}
+        ---------------------
+
+        Given the repository structure and context above, provide a clear and precise answer to the query. 
+        Focus on the repository's content, code structure, and implementation details. 
+        If the information is not available in the context, respond with 'I don't have enough information about that aspect of the repository.'
+
+        Query: {query_str}
+        Answer: """
+        
+        qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)
+        # streaming=True: the chat code later in this file iterates
+        # response.response_gen when the response provides it.
+        query_engine = index.as_query_engine(streaming=True)
+        query_engine.update_prompts(
+            {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
+        )
+        return query_engine
+    except Exception as e:
+        logger.error(f"Error creating query engine: {str(e)}")
+        raise GitHubRAGError(f"Failed to create query engine: {str(e)}")
+
+# Initialize session state once: the presence of "id" marks a session that
+# has already been set up, so later reruns skip this block.
+if "id" not in st.session_state:
+    st.session_state.id = uuid.uuid4()
+    # file_cache maps "<session id>-<repo name>" keys to query engines
+    st.session_state.file_cache = {}
     st.session_state.messages = []
-    st.session_state.context = None
-    gc.collect()
-
-def process_with_gitingets(github_url):
-    # or from URL
-    summary, tree, content = ingest(github_url)
-    return summary, tree, content
 
+# Prefix of every file_cache key built in the sidebar and chat code
+session_id = st.session_state.id
 
+# Sidebar
 with st.sidebar:
-    st.header(f"Add your GitHub repository!")
+    st.header("Add your GitHub repository!")
+    
+    github_url = st.text_input(
+        "Enter GitHub repository URL",
+        placeholder="https://github.com/username/repo",
+        help="Enter a valid GitHub repository URL"
+    )
     
-    github_url = st.text_input("Enter GitHub repository URL", placeholder="GitHub URL")
-    load_repo = st.button("Load Repository")
+    load_repo = st.button("Load Repository", type="primary")
 
     if github_url and load_repo:
         try:
-            with tempfile.TemporaryDirectory() as temp_dir:
-                st.write("Processing your repository...")
-                repo_name = github_url.split('/')[-1]
-                file_key = f"{session_id}-{repo_name}"
-                
-                if file_key not in st.session_state.get('file_cache', {}):
-
-                    if os.path.exists(temp_dir):
-                        summary, tree, content = process_with_gitingets(github_url)
-
-                        # Write summary to a markdown file in temp directory
-                        content_path = os.path.join(temp_dir, f"{repo_name}_content.md")
-                        with open(content_path, "w", encoding="utf-8") as f:
-                            f.write(content)
-                        loader = SimpleDirectoryReader(
-                            input_dir=temp_dir,
-                        )
-                    else:    
-                        st.error('Could not find the file you uploaded, please check again...')
-                        st.stop()
-                    
-                    docs = loader.load_data()
-                    node_parser = MarkdownNodeParser()
-                    index = VectorStoreIndex.from_documents(documents=docs, transformations=[node_parser], show_progress=True)
-
-                    # Create the query engine, where we use a cohere reranker on the fetched node
-                    query_engine = index.as_query_engine(streaming=True)
-
-                    # ====== Customise prompt template ======
-                    qa_prompt_tmpl_str = """
-                    You are an AI assistant specialized in analyzing GitHub repositories.
-
-                    Repository structure:
-                    {tree}
-                    ---------------------
-
-                    Context information from the repository:
-                    {context_str}
-                    ---------------------
-
-                    Given the repository structure and context above, provide a clear and precise answer to the query. 
-                    Focus on the repository's content, code structure, and implementation details. 
-                    If the information is not available in the context, respond with 'I don't have enough information about that aspect of the repository.'
-
-                    Query: {query_str}
-                    Answer: """
-                    qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)
-
-                    query_engine.update_prompts(
-                        {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
-                    )
-                    
-                    st.session_state.file_cache[file_key] = query_engine
-                else:
-                    query_engine = st.session_state.file_cache[file_key]
+            # Validate URL
+            if not validate_github_url(github_url):
+                st.error("Please enter a valid GitHub repository URL")
+                st.stop()
 
-                # Inform the user that the file is processed and Display the PDF uploaded
-                st.success("Ready to Chat!")
+            # Cache key combines the session id with the repository name
+            repo_name = get_repo_name(github_url)
+            file_key = f"{session_id}-{repo_name}"
+            
+            # Ingest and index only when this key has no cached engine yet
+            if file_key not in st.session_state.file_cache:
+                with st.spinner("Processing your repository..."):
+                    # temp_dir is deleted when this block exits; the engine cached
+                    # below is presumably self-contained by then — confirm
+                    with tempfile.TemporaryDirectory() as temp_dir:
+                        try:
+                            summary, tree, content = process_with_gitingets(github_url)
+                            
+                            # Write content to temporary file
+                            content_path = os.path.join(temp_dir, f"{repo_name}_content.md")
+                            with open(content_path, "w", encoding="utf-8") as f:
+                                f.write(content)
+                            
+                            # Create and cache query engine
+                            # (the directory, not content_path, is what the engine reads)
+                            query_engine = create_query_engine(temp_dir, repo_name)
+                            st.session_state.file_cache[file_key] = query_engine
+                            
+                            st.success("Repository loaded successfully! Ready to chat.")
+                            logger.info(f"Successfully processed repository: {repo_name}")
+                            
+                        # Known application errors: show their own message to the user
+                        except GitHubRAGError as e:
+                            st.error(str(e))
+                            logger.error(f"Error processing repository {repo_name}: {str(e)}")
+                            st.stop()
+                        # Anything else: show a generic message and keep details in the log
+                        except Exception as e:
+                            st.error("An unexpected error occurred while processing the repository")
+                            logger.error(f"Unexpected error: {str(e)}")
+                            st.stop()
+            else:
+                st.info("Repository already loaded. Ready to chat!")
+                
         except Exception as e:
-            st.error(f"An error occurred: {e}")
-            st.stop()     
+            # Outer handler for failures outside the inner try (e.g. get_repo_name)
+            st.error(f"An error occurred: {str(e)}")
+            logger.error(f"Error in repository loading process: {str(e)}")
+            st.stop()
 
+# Main content
 col1, col2 = st.columns([6, 1])
 
 with col1:
-    st.header(f"Chat with GitHub using RAG </>")
+    st.header("Chat with GitHub using RAG </>")
 
 with col2:
-    st.button("Clear ↺", on_click=reset_chat)
-
-# Initialize chat history
-if "messages" not in st.session_state:
-    reset_chat()
+    st.button("Clear Chat ↺", on_click=reset_chat, help="Clear chat history and reset session")
 
-
-# Display chat messages from history on app rerun
+# Display chat history
 for message in st.session_state.messages:
     with st.chat_message(message["role"]):
         st.markdown(message["content"])
 
-
-# Accept user input
+# Chat input
 if prompt := st.chat_input("What's up?"):
-    # Add user message to chat history
-    st.session_state.messages.append({"role": "user", "content": prompt})
-    # Display user message in chat message container
-    with st.chat_message("user"):
-        st.markdown(prompt)
-
-    # Display assistant response in chat message container
-    with st.chat_message("assistant"):
-        message_placeholder = st.empty()
-        full_response = ""
+    try:
+        # Add user message to chat history
+        st.session_state.messages.append({"role": "user", "content": prompt})
         
-        try:
-            # Get the repo name from the GitHub URL
-            repo_name = github_url.split('/')[-1]
-            file_key = f"{session_id}-{repo_name}"
-            
-            # Get query engine from session state
-            query_engine = st.session_state.file_cache.get(file_key)
+        # Display user message
+        with st.chat_message("user"):
+            st.markdown(prompt)
+
+        # Process and display assistant response
+        with st.chat_message("assistant"):
+            message_placeholder = st.empty()
+            full_response = ""
             
-            if query_engine is None:
-                st.error("Please load a repository first!")
-                st.stop()
+            try:
+                repo_name = get_repo_name(github_url)
+                file_key = f"{session_id}-{repo_name}"
+                query_engine = st.session_state.file_cache.get(file_key)
                 
-            # Use the query engine
-            response = query_engine.query(prompt)
-            
-            # Handle streaming response
-            if hasattr(response, 'response_gen'):
-                for chunk in response.response_gen:
-                    if isinstance(chunk, str):  # Only process string chunks
-                        full_response += chunk
-                        message_placeholder.markdown(full_response + "▌")
-            else:
-                # Handle non-streaming response
-                full_response = str(response)
+                # No cached engine for this session/repository key: raise so the
+                # GitHubRAGError handler below shows the message to the user
+                if query_engine is None:
+                    raise GitHubRAGError("Please load a repository first!")
+                
+                response = query_engine.query(prompt)
+                
+                # Streaming response: re-render the accumulated text after each
+                # string chunk, with a trailing "▌" cursor
+                if hasattr(response, 'response_gen'):
+                    for chunk in response.response_gen:
+                        if isinstance(chunk, str):
+                            full_response += chunk
+                            message_placeholder.markdown(full_response + "▌")
+                # Non-streaming response: render its string form directly
+                else:
+                    full_response = str(response)
+                    message_placeholder.markdown(full_response)
+                    
+                # Final render without the cursor; for the non-streaming branch
+                # this repeats the render already done above
                 message_placeholder.markdown(full_response)
-
-            message_placeholder.markdown(full_response)
-        except Exception as e:
-            st.error(f"An error occurred while processing your query: {str(e)}")
-            full_response = "Sorry, I encountered an error while processing your request."
-            message_placeholder.markdown(full_response)
-
-    # Add assistant response to chat history
-    st.session_state.messages.append({"role": "assistant", "content": full_response})
+                st.session_state.messages.append({"role": "assistant", "content": full_response})
+                
+            except GitHubRAGError as e:
+                st.error(str(e))
+                logger.error(f"Error in chat processing: {str(e)}")
+            except Exception as e:
+                st.error("An unexpected error occurred while processing your query")
+                logger.error(f"Unexpected error in chat: {str(e)}")
+                
+    except Exception as e:
+        st.error("An error occurred in the chat system")
+        logger.error(f"Chat system error: {str(e)}")