Skip to content

[code_review] Add approved generated comments as examples in the prompt #4984

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
May 5, 2025
123 changes: 96 additions & 27 deletions bugbug/tools/code_review.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAIEmbeddings
from libmozdata.phabricator import ConduitError
from tenacity import retry, retry_if_exception_type, stop_after_attempt
from tqdm import tqdm
from unidiff import Hunk, PatchedFile, PatchSet
Expand Down Expand Up @@ -48,6 +49,7 @@ class InlineComment:
is_done: bool | None = None
hunk_start_line: int | None = None
hunk_end_line: int | None = None
is_generated: bool | None = None


class ModelResultError(Exception):
Expand Down Expand Up @@ -121,6 +123,7 @@ class LargeDiffError(Exception):
**Valid Comment Examples**:

{comment_examples}
{approved_examples}

**Patch to Review**:

Expand Down Expand Up @@ -456,6 +459,11 @@ def target_end(hunk: Hunk) -> int:
# prioritize added lines over deleted lines because comments are more
# likely to be on added lines than deleted lines.
if len(matching_hunks) > 1:
logger.warning(
"Multiple matching hunks found for comment %s in file %s",
comment.id,
comment.filename,
)
for hunk in matching_hunks:
for line in hunk:
if line.is_added and line.target_line_no == comment.start_line:
Expand All @@ -469,11 +477,6 @@ def target_end(hunk: Hunk) -> int:
return hunk

if len(matching_hunks) != 0:
logger.warning(
"Multiple matching hunks found for comment %s in file %s",
comment.id,
comment.filename,
)
return matching_hunks[0]

elif comment.on_removed_code:
Expand All @@ -488,6 +491,10 @@ def target_end(hunk: Hunk) -> int:

def retrieve_comments_with_hunks(self):
def comment_filter(comment: InlineComment):
# We want to keep all generated comments
if comment.is_generated:
return True

comment_content = comment.content

# Ignore very short and very long comments
Expand Down Expand Up @@ -527,6 +534,9 @@ def comment_filter(comment: InlineComment):
# TODO: use log instead of print
print(f"Failed to parse {diff_id}")
continue
except ConduitError:
logger.warning("Failed to load %d", diff_id)
continue

file_map = {
patched_file.path: patched_file
Expand Down Expand Up @@ -582,10 +592,6 @@ def get_all_inline_comments(
if transaction["fields"]["replyToCommentPHID"] is not None:
continue

# Ignore bot comments
if transaction["authorPHID"] == "PHID-USER-cje4weq32o3xyuegalpj":
continue

if len(transaction["comments"]) != 1:
# Follow up: https://github.com/mozilla/bugbug/issues/4218
logger.warning(
Expand All @@ -595,10 +601,23 @@ def get_all_inline_comments(
continue

transaction_comment = transaction["comments"][0]
comment_id = transaction_comment["id"]
date_created = transaction_comment["dateCreated"]
comment_content = transaction_comment["content"]["raw"]
is_generated = (
# This includes comments generated by Review Helper, but
# excludes any comments that have been edited by the user.
"> This comment was generated automatically and has been approved by"
in comment_content
)

# Ignore bot comments, except the ones by Review Helper
if (
transaction["authorPHID"] == "PHID-USER-cje4weq32o3xyuegalpj"
and not is_generated
):
continue

comment_id = transaction_comment["id"]
date_created = transaction_comment["dateCreated"]
diff_id = transaction["fields"]["diff"]["id"]
filename = transaction["fields"]["path"]
start_line = transaction["fields"]["line"]
Expand All @@ -616,15 +635,16 @@ def get_all_inline_comments(
# TODO: we could create an extended dataclass for this
# instead of adding optional fields.
comment = InlineComment(
filename,
start_line,
end_line,
comment_content,
on_removed_code,
comment_id,
date_created,
date_modified,
is_done,
filename=filename,
start_line=start_line,
end_line=end_line,
content=comment_content,
on_removed_code=on_removed_code,
id=comment_id,
date_created=date_created,
date_modified=date_modified,
is_done=is_done,
is_generated=is_generated,
)

if not comment_filter(comment):
Expand Down Expand Up @@ -1246,6 +1266,7 @@ def _generate_suggestions(self, patch: Patch):
input=PROMPT_TEMPLATE_REVIEW.format(
patch=formatted_patch,
comment_examples=self._get_comment_examples(patch),
approved_examples=self._get_generated_examples(patch),
target_code_consistency=self.target_software or "rest of the",
)
)
Expand Down Expand Up @@ -1298,14 +1319,40 @@ def run(self, patch: Patch) -> list[InlineComment] | None:

return list(generate_processed_output(raw_output, patch.patch_set))

def _get_generated_examples(self, patch):
"""Get examples of comments that were generated by an LLM.

Since the comments are posted, it means that they were approved by a
reviewer. Thus, we can use them as examples of good comments.
"""
if not self.review_comments_db:
return ""

comment_examples = [
result.payload["comment"]["content"]
for result in self.review_comments_db.find_similar_patch_comments(
patch, limit=5, generated=True
)
]
if not comment_examples:
return ""

template = """
**Examples of comments that you suggested on other patches and developers found useful**:

- {comment_examples}
"""

return template.format(comment_examples="\n - ".join(comment_examples))

def _get_comment_examples(self, patch):
comment_examples = []

if self.review_comments_db:
comment_examples = [
result.payload
for result in self.review_comments_db.find_similar_patch_comments(
patch, limit=10
patch, limit=10, generated=False
)
]

Expand Down Expand Up @@ -1378,7 +1425,13 @@ def __init__(self, vector_db: VectorDB) -> None:
model="text-embedding-3-large", api_key=get_secret("OPENAI_API_KEY")
)

def clean_comment(self, comment):
def clean_comment(self, comment: str):
# We do not want to keep the LLM note in the comment, it is not useful
# when using the comment as examples.
llm_note_index = comment.find("> This comment was generated automatically ")
if llm_note_index != -1:
comment = comment[:llm_note_index]

# TODO: use the nav info instead of removing it
comment = self.NAV_PATTERN.sub("", comment)
comment = self.WHITESPACE_PATTERN.sub(" ", comment)
Expand All @@ -1388,6 +1441,7 @@ def clean_comment(self, comment):

def add_comments_by_hunk(self, items: Iterable[tuple[Hunk, InlineComment]]):
point_ids = set(self.vector_db.get_existing_ids())
logger.info("Will skip %d comments that already exist", len(point_ids))

def vector_points():
for hunk, comment in items:
Expand All @@ -1396,19 +1450,34 @@ def vector_points():

str_hunk = str(hunk)
vector = self.embeddings.embed_query(str_hunk)

comment_data = asdict(comment)
comment_data["content"] = self.clean_comment(comment.content)
payload = {
"hunk": str_hunk,
"comment": asdict(comment),
"comment": comment_data,
"version": 2,
}

yield VectorPoint(id=comment.id, vector=vector, payload=payload)

self.vector_db.insert(vector_points())

def find_similar_hunk_comments(self, hunk: Hunk):
return self.vector_db.search(self.embeddings.embed_query(str(hunk)))
def find_similar_hunk_comments(self, hunk: Hunk, generated: bool | None = None):
    """Search the vector DB for comments attached to hunks similar to *hunk*.

    When *generated* is not None, restrict the search to comments whose
    ``comment.is_generated`` payload field equals *generated*; when it is
    None, no filter is applied.
    """
    # Build the optional payload filter first so the search call stays flat.
    query_filter = None
    if generated is not None:
        query_filter = QueryFilter(must_match={"comment.is_generated": generated})

    query_vector = self.embeddings.embed_query(str(hunk))
    return self.vector_db.search(query_vector, filter=query_filter)

def find_similar_patch_comments(self, patch: Patch, limit: int):
def find_similar_patch_comments(
self, patch: Patch, limit: int, generated: bool | None = None
):
assert limit > 0, "Limit must be greater than 0"

patch_set = PatchSet.from_string(patch.raw_diff)
Expand All @@ -1421,7 +1490,7 @@ def find_similar_patch_comments(self, patch: Patch, limit: int):
continue

for hunk in patched_file:
for result in self.find_similar_hunk_comments(hunk):
for result in self.find_similar_hunk_comments(hunk, generated):
if result is not None and (
result.id not in max_score_per_comment
or result.score > max_score_per_comment[result.id].score
Expand Down
2 changes: 1 addition & 1 deletion bugbug/vectordb.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def get_existing_ids(self) -> Iterable[int]:
while offset is not None:
points, offset = self.client.scroll(
collection_name=self.collection_name,
limit=100,
limit=100000,
with_payload=False,
with_vectors=False,
offset=offset,
Expand Down