Skip to content

Commit 055bb47

Browse files
Merge pull request #645 from Ayyanaruto/filter-bots
feat(analytics): add filtering of bot events in create_pr_metrics and…
2 parents 8a545c0 + ca98718 commit 055bb47

File tree

6 files changed

+223
-5
lines changed

6 files changed

+223
-5
lines changed

backend/analytics_server/mhq/service/code/sync/etl_code_analytics.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
PullRequestState,
1111
)
1212
from mhq.utils.time import Interval
13+
from mhq.utils.string import is_bot_name
1314

1415

1516
class CodeETLAnalyticsService:
@@ -22,7 +23,8 @@ def create_pr_metrics(
2223
if pr.state == PullRequestState.OPEN:
2324
return pr
2425

25-
pr_performance = self.get_pr_performance(pr, pr_events)
26+
non_bot_pr_events = self.filter_non_bot_events(pr_events)
27+
pr_performance = self.get_pr_performance(pr, non_bot_pr_events)
2628

2729
pr.first_response_time = (
2830
pr_performance.first_review_time
@@ -39,11 +41,15 @@ def create_pr_metrics(
3941
pr_performance.cycle_time if pr_performance.cycle_time != -1 else None
4042
)
4143
pr.reviewers = list(
42-
{e.actor_username for e in pr_events if e.actor_username != pr.author}
44+
{
45+
e.actor_username
46+
for e in non_bot_pr_events
47+
if e.actor_username != pr.author
48+
}
4349
)
4450

4551
if pr_commits:
46-
pr.rework_cycles = self.get_rework_cycles(pr, pr_events, pr_commits)
52+
pr.rework_cycles = self.get_rework_cycles(pr, non_bot_pr_events, pr_commits)
4753
pr_commits.sort(key=lambda x: x.created_at)
4854
first_commit_to_open = pr.created_at - pr_commits[0].created_at
4955
if isinstance(first_commit_to_open, timedelta):
@@ -173,3 +179,14 @@ def get_rework_cycles(
173179
rework_cycles += 1
174180

175181
return rework_cycles
182+
183+
def filter_non_bot_events(
    self, pr_events: List[PullRequestEvent]
) -> List[PullRequestEvent]:
    """Return the events whose actor username does not look like a bot.

    An event is kept only when it has an ``actor_username`` and
    ``is_bot_name`` does not classify that username as a bot. Events with
    no username are dropped. The relative order of the kept events is
    unchanged and the input list is not modified.
    """
    human_events: List[PullRequestEvent] = []
    for pr_event in pr_events:
        username = pr_event.actor_username
        # Events without an actor, or with a bot-looking actor, are skipped.
        if username is None or is_bot_name(username):
            continue
        human_events.append(pr_event)
    return human_events

backend/analytics_server/mhq/service/code/sync/etl_github_handler.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,9 @@ def process_pr(
183183
pr_commits_model_list: List[PullRequestCommit] = self._to_pr_commits(
184184
commits, pr_model
185185
)
186-
186+
non_bot_pr_events_model_list = self._github_bot_filter(pr_events_model_list)
187187
pr_model = self.code_etl_analytics_service.create_pr_metrics(
188-
pr_model, pr_events_model_list, pr_commits_model_list
188+
pr_model, non_bot_pr_events_model_list, pr_commits_model_list
189189
)
190190

191191
return pr_model, pr_events_model_list, pr_commits_model_list
@@ -356,6 +356,17 @@ def _dt_from_github_dt_string(dt_string: str) -> datetime:
356356
dt_without_timezone = datetime.strptime(dt_string, ISO_8601_DATE_FORMAT)
357357
return dt_without_timezone.replace(tzinfo=pytz.UTC)
358358

359+
@staticmethod
def _github_bot_filter(pr_events: List[PullRequestEvent]) -> List[PullRequestEvent]:
    """Drop events whose payload marks the author as a GitHub bot.

    The author object is taken from the ``"user"`` entry of the event's
    ``data`` payload, falling back to ``"actor"`` and finally to an empty
    dict when both are missing or falsy. An event is discarded only when
    that object's ``"type"`` equals ``"Bot"``; all other events are
    returned in their original order.
    """

    def _author_type(pr_event):
        # Resolve the author object, then read its account type (may be None).
        payload = pr_event.data
        author = payload.get("user") or payload.get("actor") or {}
        return author.get("type")

    return [pr_event for pr_event in pr_events if _author_type(pr_event) != "Bot"]
369+
359370

360371
def get_github_etl_handler(org_id: str) -> GithubETLHandler:
361372
def _get_access_token():
backend/analytics_server/mhq/utils/string.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
from uuid import uuid4
2+
import re
23

34

45
def uuid4_str():
    """Return a newly generated version-4 UUID rendered as a string."""
    fresh_uuid = uuid4()
    return str(fresh_uuid)
7+
8+
9+
# Compiled once at import time: is_bot_name runs once per event, so the
# pattern should not be rebuilt (or re-fetched from re's cache) on every call.
_BOT_NAME_PATTERN = re.compile(
    r"(?i)(\b[\w@-]*[-_\[\]@ ]+bot[-_\d\[\]]*\b|\[bot\]|_bot_|_bot$|^bot_)"
)


def is_bot_name(name: str) -> bool:
    """Return True when ``name`` looks like a bot account name.

    Matching is case-insensitive and succeeds when "bot" appears as a
    delimited token:

    * "bot" preceded by at least one separator (``-``, ``_``, ``[``, ``]``,
      ``@`` or a space) and ending on a word boundary, optionally after
      trailing digits or separators (``jenkins-bot``, ``tool-bot-123``);
    * the literal ``[bot]`` (``renovate[bot]``);
    * ``_bot_`` anywhere, a trailing ``_bot``, or a leading ``bot_``.

    Names that merely contain the letters "bot" (``robotics``, ``botany``,
    ``about``, ``bots``) are not matched.

    Args:
        name: The username to classify.

    Returns:
        True if the name matches one of the bot patterns; False otherwise,
        including for an empty (or ``None``) name.
    """
    # An empty name can never match; guarding also avoids a TypeError on None.
    if not name:
        return False
    return bool(_BOT_NAME_PATTERN.search(name))

backend/analytics_server/tests/service/code/sync/test_etl_code_analytics.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,3 +500,108 @@ def test_rework_cycles_returs_1_for_multiple_approvals():
500500
)
501501
== 1
502502
)
503+
504+
505+
def test_filter_non_bot_events_common_patterns():
    """Common bot-style usernames are dropped; look-alike human names stay."""
    pr_service = CodeETLAnalyticsService()

    bot_names = (
        "github-actions[bot]",
        "jenkins-bot",
        "renovate[bot]",
        "test_bot_service",
        "my_bot",
        "bot_user",
        "SomeService-bot-123",
        "CI-BOT",
        "bot-123[bot]",
        "helper_bot_v2",
    )
    human_names = (
        "john_doe",
        "robotics_expert",
        "sabotage",
        "lobotomy",
        "cubot",
        "botany",
    )
    bot_events = [get_pull_request_event(reviewer=name) for name in bot_names]
    human_events = [get_pull_request_event(reviewer=name) for name in human_names]

    kept_events = pr_service.filter_non_bot_events(bot_events + human_events)
    kept_usernames = [kept.actor_username for kept in kept_events]

    # Exactly the human events survive the filter.
    assert len(kept_events) == len(human_events)
    assert all(bot.actor_username not in kept_usernames for bot in bot_events)
    assert all(human.actor_username in kept_usernames for human in human_events)
540+
541+
542+
def test_filter_non_bot_events_edge_cases():
    """Unusual separators still flag bots; names embedding "bot" do not."""
    pr_service = CodeETLAnalyticsService()

    usernames = (
        "test-bot[123]",
        "deploy bot",
        "special@bot@chars",
        "robo",
        "botanic",
        "robot",
        "abot",
    )
    events = [get_pull_request_event(reviewer=name) for name in usernames]

    kept_events = pr_service.filter_non_bot_events(events)
    kept_usernames = [kept.actor_username for kept in kept_events]

    assert len(kept_events) == 4
    assert all(
        expected in kept_usernames
        for expected in ("robo", "botanic", "robot", "abot")
    )
563+
564+
565+
def test_create_pr_metrics_bot_detection_in_review_events():
    """Only the human reviewer ends up in ``reviewers`` on a merged PR."""
    pr_service = CodeETLAnalyticsService()
    opened_at = time_now()
    pr = get_pull_request(
        state=PullRequestState.MERGED, created_at=opened_at, updated_at=opened_at
    )

    bot_reviewers = [
        "github-actions[bot]",
        "dependabot-preview[bot]",
        "Jenkins_Bot",
        "ci_bot_service",
        "bot_reviewer",
        "tool-bot-123",
        "_bot_helper",
        "helper_bot",
    ]

    # One bot comment per minute after the PR was opened.
    bot_events = [
        get_pull_request_event(
            pull_request_id=pr.id,
            reviewer=bot_name,
            state=PullRequestEventState.COMMENTED.value,
            created_at=opened_at + timedelta(minutes=minute_offset),
        )
        for minute_offset, bot_name in enumerate(bot_reviewers, start=1)
    ]

    human_event = get_pull_request_event(
        pull_request_id=pr.id,
        reviewer="human_reviewer",
        state=PullRequestEventState.APPROVED.value,
        created_at=opened_at + timedelta(hours=1),
    )

    pr_metrics = pr_service.create_pr_metrics(pr, [*bot_events, human_event], [])

    assert len(pr_metrics.reviewers) == 1
    assert "human_reviewer" in pr_metrics.reviewers
    assert not any(bot_name in pr_metrics.reviewers for bot_name in bot_reviewers)

backend/analytics_server/tests/service/code/sync/test_etl_github_handler.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,3 +351,32 @@ def test__dt_from_github_dt_string_given_date_string_returns_correct_datetime():
351351
date_string = "2024-04-18T10:53:15Z"
352352
expected = datetime(2024, 4, 18, 10, 53, 15, tzinfo=pytz.UTC)
353353
assert GithubETLHandler._dt_from_github_dt_string(date_string) == expected
354+
355+
356+
def test__github_bot_filter_given_bot_events_returns_empty_list():
    """An event whose payload ``user`` has type "Bot" is filtered out."""
    handler = GithubETLHandler(ORG_ID, None, None, None, None)
    dependabot_event = get_pull_request_event()
    dependabot_event.data = {"user": {"type": "Bot", "login": "dependabot"}}

    remaining = handler._github_bot_filter([dependabot_event])

    assert remaining == []
362+
363+
364+
def test__github_bot_filter_given_non_bot_events_returns_same_list():
    """An event whose payload ``user`` has type "User" passes through."""
    handler = GithubETLHandler(ORG_ID, None, None, None, None)
    john_event = get_pull_request_event()
    john_event.data = {"user": {"type": "User", "login": "john_doe"}}

    remaining = handler._github_bot_filter([john_event])

    assert remaining == [john_event]
372+
373+
374+
def test__github_bot_filter_given_mixed_events_returns_non_bot_events():
    """A bot identified via ``actor`` is removed; the human event is kept."""
    handler = GithubETLHandler(ORG_ID, None, None, None, None)

    dependabot_event = get_pull_request_event()
    dependabot_event.data = {"actor": {"type": "Bot", "login": "dependabot"}}
    john_event = get_pull_request_event()
    john_event.data = {"user": {"type": "User", "login": "john_doe"}}

    remaining = handler._github_bot_filter([dependabot_event, john_event])

    assert remaining == [john_event]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from mhq.utils.string import is_bot_name
2+
3+
4+
def test_simple_bot_names():
    """A plain ``<name><separator>bot`` username is detected."""
    for candidate in ("test_bot", "test-bot"):
        assert is_bot_name(candidate)
7+
8+
9+
def test_bot_with_prefixes_and_suffixes():
    """Underscore, hyphen and space separators work with or without a suffix."""
    candidates = (
        "my_bot",
        "my-bot",
        "my bot",
        "test_bot_123",
        "test-bot-123",
        "test bot 123",
    )
    for candidate in candidates:
        assert is_bot_name(candidate)
16+
17+
18+
def test_special_patterns():
    """Leading, trailing and embedded ``bot`` tokens are all detected."""
    candidates = ("name_bot_suffix", "name_bot", "bot_name", "my_bot_is_cool")
    for candidate in candidates:
        assert is_bot_name(candidate)
23+
24+
25+
def test_case_insensitivity():
    """Detection does not depend on letter case."""
    for candidate in ("my_BOT", "MY-bot", "My Bot"):
        assert is_bot_name(candidate)
29+
30+
31+
def test_special_characters():
    """An ``@`` separator and a bare ``[bot]`` marker are detected."""
    for candidate in ("test@bot", "[bot]"):
        assert is_bot_name(candidate)
34+
35+
36+
def test_negative_cases():
    """Ordinary words that merely contain "bot" are not flagged."""
    ordinary_words = ("robotics", "lobotomy", "botany", "about", "robotic", "bots")
    for word in ordinary_words:
        assert not is_bot_name(word)
43+
44+
45+
def test_edge_cases():
    """Empty, whitespace-only and purely numeric names are not bots."""
    for candidate in ("", " ", "12345"):
        assert not is_bot_name(candidate)

0 commit comments

Comments
 (0)