Merge pull request #110 from smokeyScraper/contributor_recommendation_tool

chandansgowda · web-flow · commit a1159d74129f · 2025-07-27T19:16:23.000+05:30
[feat]: implement github contributor recommendation tool
diff --git a/backend/app/agents/devrel/github/github_toolkit.py b/backend/app/agents/devrel/github/github_toolkit.py
@@ -6,7 +6,7 @@
 from .prompts.intent_analysis import GITHUB_INTENT_ANALYSIS_PROMPT
 from .tools.search import handle_web_search
 # TODO: Implement all tools
-# from .tools.contributor_recommendation import handle_contributor_recommendation
+from .tools.contributor_recommendation import handle_contributor_recommendation
 # from .tools.repository_query import handle_repo_query
 # from .tools.issue_creation import handle_issue_creation
 # from .tools.documentation_generation import handle_documentation_generation
@@ -102,8 +102,7 @@ async def execute(self, query: str) -> Dict[str, Any]:
             logger.info(f"Executing {classification} for query")
 
             if classification == "contributor_recommendation":
-                result = "Not implemented"
-                # result = await handle_contributor_recommendation(query)
+                result = await handle_contributor_recommendation(query)
             elif classification == "repo_support":
                 result = "Not implemented"
                 # result = await handle_repo_query(query)
diff --git a/backend/app/agents/devrel/github/prompts/contributor_recommendation/issue_summarization.py b/backend/app/agents/devrel/github/prompts/contributor_recommendation/issue_summarization.py
@@ -0,0 +1,20 @@
+# Prompt template asking an LLM to condense a GitHub issue into a technical
+# summary aimed at finding expert contributors.
+# It contains a single `str.format` placeholder, `{issue_content}`, which the
+# caller fills with the consolidated issue text.
+ISSUE_SUMMARIZATION_PROMPT = """You are a technical analyst optimizing GitHub issues for contributor search. 
+
+Analyze the provided GitHub issue and create a technical summary optimized for finding relevant expert contributors.
+
+Focus on:
+- Core technical problem or feature request
+- Specific technologies, frameworks, libraries, APIs mentioned
+- Technical skills and expertise required to solve this
+- Programming languages and tools involved
+- System components affected (frontend, backend, database, etc.)
+
+Create a summary that reads like a job requirement for finding the right technical expert.
+
+**GitHub Issue Content:**
+---
+{issue_content}
+---
+
+**Optimized Technical Summary for Contributor Search:**
+"""
diff --git a/backend/app/agents/devrel/github/prompts/contributor_recommendation/query_alignment.py b/backend/app/agents/devrel/github/prompts/contributor_recommendation/query_alignment.py
@@ -0,0 +1,37 @@
+# Prompt template instructing an LLM to turn a contributor-recommendation
+# request into a single JSON object with the keys `query_type`,
+# `aligned_query`, `keywords` and `technical_domain`.
+# The only `str.format` placeholder is `{query}`; every literal JSON brace in
+# the examples and schema is doubled (`{{` / `}}`) so that `format` emits it
+# verbatim instead of treating it as a replacement field.
+QUERY_ALIGNMENT_PROMPT = """Analyze this contributor recommendation request and process it for technical search:
+
+USER REQUEST: {query}
+
+Your task:
+1. Extract the core technical requirements 
+2. Generate a clean, technical search query optimized for finding contributor profiles
+3. Extract specific keywords that would appear in developer profiles (languages, frameworks, tools, domains)
+
+Guidelines:
+- aligned_query: Convert user request into clear technical language that matches how developers describe their skills
+- keywords: Extract 3-5 specific technical terms (React, Python, API, database, etc.)
+- Focus on technologies, not business requirements
+- Make it searchable against developer profiles and contribution history
+
+Example transformations:
+
+Input: "I need help with our Stripe payment integration issue"
+Output: {{"query_type": "general", "aligned_query": "developer with payment processing and Stripe API integration experience", "keywords": ["Stripe", "payment", "API", "integration"], "technical_domain": "backend"}}
+
+Input: "Find experts for database optimization"
+Output: {{"query_type": "general", "aligned_query": "backend developer with database performance optimization experience", "keywords": ["database", "optimization", "performance", "SQL"], "technical_domain": "backend"}}
+
+Input: "https://github.com/owner/repo/issues/123 - authentication bug"
+Output: {{"query_type": "github_issue", "aligned_query": "developer with authentication and security implementation experience", "keywords": ["authentication", "security", "OAuth", "JWT"], "technical_domain": "security"}}
+
+IMPORTANT FORMATTING RULES:
+- DO NOT use markdown formatting
+- DO NOT wrap in code blocks (```)
+- DO NOT add any text before or after the JSON
+- DO NOT add explanations
+- Return EXACTLY this format: {{"query_type": "...", "aligned_query": "...", "keywords": [...], "technical_domain": "..."}}
+
+Expected JSON schema:
+{{"query_type": "github_issue" | "general", "aligned_query": "clean technical search text", "keywords": ["keyword1", "keyword2"], "technical_domain": "frontend|backend|fullstack|ml|devops|mobile|security|other"}}
+
+Return the JSON object only:"""
diff --git a/backend/app/agents/devrel/github/prompts/intent_analysis.py b/backend/app/agents/devrel/github/prompts/intent_analysis.py
@@ -2,7 +2,7 @@
 
 AVAILABLE FUNCTIONS:
 - web_search: Search the web for information  
-- contributor_recommendation: Finding the right people to review PRs, assign issues, or collaborate
+- contributor_recommendation: Finding the right people to review PRs, assign issues, or collaborate (supports both issue URLs and general queries)
 - repo_support: Questions about codebase structure, dependencies, impact analysis, architecture
 - issue_creation: Creating bug reports, feature requests, or tracking items
 - documentation_generation: Generating docs, READMEs, API docs, guides, or explanations
@@ -12,7 +12,13 @@
 USER QUERY: {user_query}
 
 Classification guidelines:
-- contributor_recommendation: Finding reviewers, assignees, collaborators
+- contributor_recommendation: 
+  * "who should review this PR/issue?"
+  * "find experts in React/Python/ML"
+  * "recommend assignees for stripe integration"
+  * "best people for database optimization"
+  * URLs like github.com/owner/repo/issues/123
+  * "I need help with RabbitMQ, can you suggest some people?"
 - repo_support: Code structure, dependencies, impact analysis, architecture  
 - issue_creation: Creating bugs, features, tracking items
 - documentation_generation: Docs, READMEs, guides, explanations
diff --git a/backend/app/agents/devrel/github/tools/contributor_recommendation.py b/backend/app/agents/devrel/github/tools/contributor_recommendation.py
@@ -1 +1,170 @@
+import logging
+import re
+from typing import Any, Dict
+from urllib.parse import urlparse
+from langchain_core.messages import HumanMessage
+from langchain_google_genai import ChatGoogleGenerativeAI
 
+from app.core.config import settings
+from app.database.weaviate.operations import search_contributors
+from app.services.github.issue_processor import GitHubIssueProcessor
+from app.services.embedding_service.service import EmbeddingService
+from ..prompts.contributor_recommendation.query_alignment import QUERY_ALIGNMENT_PROMPT
+
+logger = logging.getLogger(__name__)
+
+class ContributorRecommendationWorkflow:
+    """
+    Contributor recommendation with proper query alignment for hybrid search.
+
+    Holds the chat model used to rewrite a free-form request into a search
+    query plus keywords, and the embedding service used by the caller to
+    vectorise that query.
+    """
+
+    def __init__(self):
+        self.query_alignment_llm = ChatGoogleGenerativeAI(
+            model=settings.github_agent_model,
+            temperature=0.1,
+            google_api_key=settings.gemini_api_key
+        )
+        self.embedding_service = EmbeddingService()
+
+    async def _align_user_request(self, query: str) -> Dict[str, Any]:
+        """
+        Align user request into optimized format for hybrid search.
+        Extract clean technical query + keywords that match contributor profiles.
+
+        If the request contains a GitHub issue URL, the issue content is
+        fetched and appended to the text sent to the model.
+
+        Args:
+            query: Raw user request.
+
+        Returns:
+            The JSON object produced by the model (expected keys:
+            ``query_type``, ``aligned_query``, ``keywords``,
+            ``technical_domain``). When the reply cannot be parsed into a
+            JSON object, a fallback dict that reuses ``query`` unchanged
+            with no keywords.
+
+        Raises:
+            Exception: Whatever ``_fetch_github_issue_content`` or the model
+                call raises; neither is caught here.
+        """
+        import json
+
+        logger.info("Aligning user request for hybrid search optimization")
+
+        url_match = re.search(r'https?://github\.com/[\w-]+/[\w.-]+/issues/\d+', query)
+
+        if url_match:
+            issue_content = await self._fetch_github_issue_content(url_match.group(0))
+            full_query = f"{query}\n\nIssue content: {issue_content}"
+        else:
+            full_query = query
+
+        prompt = QUERY_ALIGNMENT_PROMPT.format(query=full_query)
+        response = await self.query_alignment_llm.ainvoke([HumanMessage(content=prompt)])
+        logger.debug("Raw alignment response: %s", response)
+
+        fallback = {
+            "query_type": "general",
+            "aligned_query": query,
+            "keywords": [],
+            "technical_domain": "other"
+        }
+
+        raw = response.content
+        # Non-string content cannot be valid JSON text; stringify it so the
+        # parse below fails cleanly into the fallback instead of raising.
+        text = raw.strip() if isinstance(raw, str) else str(raw)
+
+        # The prompt forbids code fences, but tolerate a reply wrapped in
+        # ``` or ```json rather than discarding an otherwise valid answer.
+        fenced = re.match(r"^```(?:json)?\s*(.*?)\s*```$", text, re.DOTALL)
+        if fenced:
+            text = fenced.group(1)
+
+        try:
+            result = json.loads(text)
+        except json.JSONDecodeError:
+            logger.warning("Failed to parse alignment result, using fallback")
+            return fallback
+
+        # Callers use ``.get`` on the result, so anything other than a JSON
+        # object (list, string, number, null) is treated as a parse failure.
+        if not isinstance(result, dict):
+            logger.warning("Failed to parse alignment result, using fallback")
+            return fallback
+
+        logger.info(f"Query aligned: '{result.get('aligned_query')}' with keywords: {result.get('keywords')}")
+        return result
+
+    async def _fetch_github_issue_content(self, github_url: str) -> str:
+        """
+        Fetch GitHub issue content.
+
+        Args:
+            github_url: URL whose path has the form
+                ``/<owner>/<repo>/issues/<number>``.
+
+        Returns:
+            The text returned by ``GitHubIssueProcessor.fetch_issue_content``.
+
+        Raises:
+            ValueError: If the URL path does not have the expected form.
+            Exception: Any error from the issue processor; it is logged and
+                re-raised unchanged.
+        """
+        try:
+            path_parts = urlparse(github_url).path.strip('/').split('/')
+
+            if len(path_parts) < 4 or path_parts[2] != "issues":
+                raise ValueError("Invalid GitHub issue URL")
+
+            owner, repo, issue_number = path_parts[0], path_parts[1], int(path_parts[3])
+            processor = GitHubIssueProcessor(owner, repo, issue_number)
+            return await processor.fetch_issue_content()
+
+        except Exception as e:
+            logger.error(f"GitHub issue fetching failed: {e}")
+            raise
+
+async def handle_contributor_recommendation(query: str) -> Dict[str, Any]:
+    """
+    Main entry point with unified query processing.
+
+    Aligns the request into a search query and keywords, embeds the aligned
+    query, calls ``search_contributors`` with vector and BM25 weights, and
+    formats the returned contributors as recommendations.
+
+    Args:
+        query: Raw user request; may contain a GitHub issue URL.
+
+    Returns:
+        On success, a dict with ``status`` = ``"success"`` plus
+        ``recommendations``, ``message``, ``search_query``, ``keywords_used``
+        and ``technical_domain``; ``search_metadata`` is added when at least
+        one contributor was found. If any step raises ``Exception``, the
+        error is logged and ``{"status": "error", "message": str(e)}`` is
+        returned instead.
+    """
+    # Search tuning is named once so the values passed to the search and the
+    # values reported in ``search_metadata`` cannot drift apart.
+    result_limit = 5
+    vector_weight = 0.7  # Semantic similarity
+    bm25_weight = 0.3    # Keyword matching
+
+    logger.info(f"Processing contributor recommendation: {query[:100]}...")
+
+    try:
+        workflow = ContributorRecommendationWorkflow()
+
+        alignment_result = await workflow._align_user_request(query)
+        # ``or`` also covers a key that is present but null/empty, which
+        # would otherwise be embedded as the literal text "None".
+        search_text = alignment_result.get("aligned_query") or query
+        keywords = alignment_result.get("keywords") or []
+        technical_domain = alignment_result.get("technical_domain") or "other"
+
+        logger.info("Generating embedding for semantic search")
+        enhanced_search_text = f"Looking for contributor with expertise in: {search_text}"
+        query_embedding = await workflow.embedding_service.get_embedding(enhanced_search_text)
+        logger.info(f"Generated embedding with dimension: {len(query_embedding)}")
+
+        logger.info("Performing hybrid search (semantic + keyword matching)")
+
+        results = await search_contributors(
+            query_embedding=query_embedding,
+            keywords=keywords,
+            limit=result_limit,
+            vector_weight=vector_weight,
+            bm25_weight=bm25_weight
+        )
+
+        logger.info(f"Search complete: Found {len(results)} potential contributors")
+
+        if not results:
+            logger.info("No contributors found matching the search criteria")
+            return {
+                "status": "success",
+                "recommendations": [],
+                "message": "No suitable contributors found",
+                "search_query": search_text,
+                "keywords_used": keywords,
+                "technical_domain": technical_domain
+            }
+
+        logger.info("Formatting recommendations with scores")
+        recommendations = []
+        for contributor in results:
+            # ``or`` guards against fields stored as null, which would break
+            # ``join`` / ``round`` below.
+            languages = contributor.get('languages') or []
+            topics = contributor.get('topics') or []
+            hybrid_score = contributor.get('hybrid_score') or 0
+            vector_score = contributor.get('vector_score') or 0
+            bm25_score = contributor.get('bm25_score') or 0
+
+            reason_parts = []
+            if languages:
+                reason_parts.append(f"Expert in {', '.join(languages)}")
+            if topics:
+                reason_parts.append(f"Active in {', '.join(topics)}")
+
+            username = contributor.get("github_username")
+            recommendations.append({
+                "user": username,
+                "reason": " • ".join(reason_parts) if reason_parts else "Strong technical match",
+                "search_score": round(hybrid_score, 4),
+                "vector_score": round(vector_score, 4),
+                "keyword_score": round(bm25_score, 4),
+                "languages": languages,
+                "topics": topics
+            })
+            logger.info(
+                f"@{username} (score: {hybrid_score:.4f}) - {reason_parts[0] if reason_parts else 'Technical match'}")
+
+        logger.info(f"Successfully generated {len(recommendations)} contributor recommendations")
+
+        return {
+            "status": "success",
+            "recommendations": recommendations,
+            "message": f"Found {len(recommendations)} suitable contributors",
+            "search_query": search_text,
+            "keywords_used": keywords,
+            "technical_domain": technical_domain,
+            "search_metadata": {
+                "total_candidates": len(results),
+                "vector_weight": vector_weight,
+                "keyword_weight": bm25_weight,
+                "embedding_dimension": len(query_embedding)
+            }
+        }
+
+    except Exception as e:
+        # Boundary handler: the caller receives an error payload, never an
+        # exception; the traceback is kept in the log.
+        logger.exception("Error in contributor recommendation: %s", e)
+        return {"status": "error", "message": str(e)}
diff --git a/backend/app/agents/devrel/prompts/response_prompt.py b/backend/app/agents/devrel/prompts/response_prompt.py
@@ -26,16 +26,25 @@
 - Avoid complex markdown formatting like **bold** or *italic*
 - Use plain text with clear line breaks
 - Format links as plain URLs: https://example.com
-- Use simple emojis for visual appeal: 🔗 📚 ⚡ 
+- Use simple emojis for visual appeal
 - Keep paragraphs short and scannable
 - Use "→" for arrows instead of markdown arrows
 
+SPECIAL FORMATTING FOR CONTRIBUTOR RECOMMENDATIONS:
+If the task result contains contributor recommendations:
+- Start with "Found X Contributors" 
+- Show search query used and keywords
+- For each contributor: "1. username (Score: X.XXX)"
+- Include their expertise/reason for recommendation
+- End with metadata about search and actionable guidance
+
 Instructions:
 1. Synthesize all information - Use reasoning process, tool results, and task results together
 2. Address the user's needs - Focus on what they're trying to accomplish  
 3. Be actionable - Provide specific steps, resources, or guidance
 4. Stay DevRel-focused - Be encouraging, helpful, and community-oriented
 5. Reference sources - Mention what you researched or considered when relevant
 6. Format for readability - Clean, simple text that displays well
+7. For contributor recommendations - Use the special formatting above to show scores and details
 
 Create a helpful, comprehensive response:"""
diff --git a/backend/app/api/v1/auth.py b/backend/app/api/v1/auth.py
@@ -2,7 +2,7 @@
 from fastapi.responses import HTMLResponse
 from app.database.supabase.client import get_supabase_client
 from app.services.auth.verification import find_user_by_session_and_verify, get_verification_session_info
-from app.services.user.profiling import profile_user_from_github
+from app.services.github.user.profiling import profile_user_from_github
 from typing import Optional
 import logging
 import asyncio
diff --git a/backend/app/services/github/issue_processor.py b/backend/app/services/github/issue_processor.py
@@ -0,0 +1,83 @@
+import logging
+from typing import List
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_core.messages import HumanMessage
+
+from app.core.config import settings
+from app.services.embedding_service.service import EmbeddingService
+from app.services.github.user.profiling import GitHubUserProfiler
+from app.agents.devrel.github.prompts.contributor_recommendation.issue_summarization import ISSUE_SUMMARIZATION_PROMPT
+
+logger = logging.getLogger(__name__)
+
+
+class GitHubIssueProcessor:
+    """
+    A service to fetch, summarize, and embed a GitHub issue.
+
+    One instance is bound to a single issue, identified by ``owner``,
+    ``repo`` and ``issue_number``.
+    """
+
+    def __init__(self, owner: str, repo: str, issue_number: int):
+        self.owner = owner
+        self.repo = repo
+        self.issue_number = issue_number
+        self.summarizer_llm = ChatGoogleGenerativeAI(
+            model=settings.github_agent_model,
+            temperature=0.1,
+            google_api_key=settings.gemini_api_key
+        )
+        self.embedding_service = EmbeddingService()
+
+    async def fetch_issue_content(self) -> str:
+        """
+        Fetches and consolidates all text content from a GitHub issue.
+
+        Returns:
+            The title, the body (when present) and each non-empty comment,
+            joined by ``---`` separator lines.
+
+        Raises:
+            ValueError: If the issue request returns no data.
+        """
+        logger.info(f"Fetching content for {self.owner}/{self.repo}#{self.issue_number}")
+        async with GitHubUserProfiler() as profiler:
+            issue_url = f"{profiler.base_url}/repos/{self.owner}/{self.repo}/issues/{self.issue_number}"
+            comments_url = f"{issue_url}/comments"
+
+            issue_data = await profiler.request(issue_url)
+            if not issue_data:
+                raise ValueError("Failed to fetch issue data.")
+
+            content_parts = [f"Title: {issue_data.get('title') or ''}"]
+
+            # An issue opened without a description has a null body; skip it
+            # rather than feeding the literal text "Body: None" downstream.
+            body = issue_data.get('body')
+            if body:
+                content_parts.append(f"Body: {body}")
+
+            comments_data = await profiler.request(comments_url)
+            for comment in comments_data or []:
+                text = comment.get('body')
+                if not text:
+                    continue
+                # ``user`` can be null on a comment, so do not index into it
+                # unconditionally.
+                author = (comment.get('user') or {}).get('login') or "unknown"
+                content_parts.append(f"Comment by {author}: {text}")
+
+            return "\n\n---\n\n".join(content_parts)
+
+    async def _summarize_context(self, content: str) -> str:
+        """
+        Generates a technical summary of the issue content using an LLM.
+
+        Args:
+            content: Consolidated issue text to place into the prompt.
+
+        Returns:
+            The model's reply with surrounding whitespace removed.
+        """
+        logger.info(f"Summarizing issue content for {self.owner}/{self.repo}#{self.issue_number}")
+        prompt = ISSUE_SUMMARIZATION_PROMPT.format(issue_content=content)
+        response = await self.summarizer_llm.ainvoke([HumanMessage(content=prompt)])
+        summary = response.content.strip()
+        logger.info(f"Generated summary: {summary[:100]}")
+        return summary
+
+    async def get_embedding_for_issue(self) -> List[float]:
+        """
+        Orchestrates the entire process: fetch, summarize, and embed.
+        Returns a vector embedding representing the issue.
+
+        Raises:
+            ValueError: If no issue content could be fetched.
+            Exception: Any error from fetching, summarizing or embedding; it
+                is logged and re-raised unchanged.
+        """
+        try:
+            content = await self.fetch_issue_content()
+            if not content:
+                raise ValueError("Failed to fetch issue content.")
+
+            summary = await self._summarize_context(content)
+
+            logger.info("Embedding issue summary")
+            return await self.embedding_service.get_embedding(summary)
+        except Exception as e:
+            logger.error(f"Error processing issue {self.owner}/{self.repo}#{self.issue_number}: {str(e)}")
+            # Bare ``raise`` re-raises the active exception as-is.
+            raise
diff --git a/backend/app/services/github/user/__init__.py b/backend/app/services/github/user/__init__.py
diff --git a/backend/app/services/github/user/profiling.py b/backend/app/services/github/user/profiling.py