-
-
Notifications
You must be signed in to change notification settings - Fork 4.6k
chore: Split explorer index projects over 23h #106563
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,7 +18,6 @@ | |
| from sentry.tasks.base import instrumented_task | ||
| from sentry.tasks.statistical_detectors import compute_delay | ||
| from sentry.taskworker.namespaces import seer_tasks | ||
| from sentry.utils.cache import cache | ||
| from sentry.utils.query import RangeQuerySetWrapper | ||
|
|
||
| logger = logging.getLogger("sentry.tasks.seer_explorer_indexer") | ||
|
|
@@ -27,8 +26,8 @@ | |
| LAST_RUN_CACHE_TIMEOUT = 24 * 60 * 60 # 24 hours | ||
|
|
||
| EXPLORER_INDEX_PROJECTS_PER_BATCH = 100 | ||
| # Use a larger prime number to spread indexing tasks throughout the day | ||
| EXPLORER_INDEX_DISPATCH_STEP = timedelta(seconds=127) | ||
| # Use a larger prime number to prevent thundering | ||
| EXPLORER_INDEX_DISPATCH_STEP = timedelta(seconds=37) | ||
|
|
||
| FEATURE_NAMES = ["organizations:gen-ai-features", "organizations:seer-explorer-index"] | ||
|
|
||
|
|
@@ -41,6 +40,7 @@ def get_seer_explorer_enabled_projects() -> Generator[tuple[int, int]]: | |
| Tuple of (project_id, organization_id) | ||
| """ | ||
| projects = Project.objects.filter(status=ObjectStatus.ACTIVE).select_related("organization") | ||
| current_hour = django_timezone.now().hour | ||
|
|
||
| for project in RangeQuerySetWrapper( | ||
| projects, | ||
|
|
@@ -50,6 +50,12 @@ def get_seer_explorer_enabled_projects() -> Generator[tuple[int, int]]: | |
| logger.info("seer.explorer_index.killswitch.enable flag enabled, skipping") | ||
| return | ||
|
|
||
| if not project.flags.has_transactions: | ||
| continue | ||
|
|
||
| if project.id % 23 != current_hour: | ||
| continue | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hour sharding uses modulo 23 but hours range 0-23Medium Severity The sharding logic
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hour check evaluated repeatedly can cause race conditionMedium Severity The
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems unlikely to happen in prod but probably good practice in case it cases instability in tests. |
||
|
|
||
| with sentry_sdk.start_span(op="seer_explorer_index.has_feature"): | ||
| batch_result = features.batch_has(FEATURE_NAMES, organization=project.organization) | ||
|
|
||
|
|
@@ -90,17 +96,6 @@ def schedule_explorer_index() -> None: | |
| logger.info("seer.explorer_index.enable flag is disabled") | ||
| return | ||
|
|
||
| last_run = cache.get(LAST_RUN_CACHE_KEY) | ||
|
|
||
| # Default is 24 hours, making it an option so we can trigger | ||
| # the run at higher frequencies as we roll it out. | ||
| explorer_index_run_frequency = timedelta( | ||
| minutes=options.get("seer.explorer_index.run_frequency.minutes") | ||
| ) | ||
| if last_run and last_run > django_timezone.now() - explorer_index_run_frequency: | ||
| logger.info("Index updated less than 24 hours ago, skiping") | ||
| return | ||
|
|
||
| now = django_timezone.now() | ||
|
|
||
| projects = get_seer_explorer_enabled_projects() | ||
|
|
@@ -111,8 +106,6 @@ def schedule_explorer_index() -> None: | |
| for _ in projects: | ||
| scheduled_count += 1 | ||
|
|
||
| cache.set(LAST_RUN_CACHE_KEY, now, LAST_RUN_CACHE_TIMEOUT) | ||
|
|
||
| logger.info( | ||
| "Successfully scheduled explorer index jobs for valid projects", | ||
| extra={ | ||
|
|
@@ -139,9 +132,7 @@ def dispatch_explorer_index_projects( | |
| batch: list[tuple[int, int]] = [] | ||
| count = 0 | ||
|
|
||
| explorer_index_run_frequency = timedelta( | ||
| minutes=options.get("seer.explorer_index.run_frequency.minutes") | ||
| ) | ||
| explorer_index_run_frequency = timedelta(hours=1) | ||
|
|
||
| for project_id, org_id in all_projects: | ||
| batch.append((project_id, org_id)) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Bug: The sharding logic
project.id % 23will never equalcurrent_hourwhen the hour is 23, causing projects for that hour to be skipped.Severity: MEDIUM
Suggested Fix
To ensure all 24 hours are utilized, change the modulo divisor from 23 to 24. The condition should be updated to
project.id % 24 == current_hour.Prompt for AI Agent
Did we get this right? 👍 / 👎 to inform future reviews.