Skip to content

Commit 5e34163

Browse files
authored
ref(grouping): Use varying grouphash cache retention (#104460)
When we began to talk about caching grouphashes during ingest, one of the considerations was how much space in the cache we'd end up using. To give ourselves a ceiling, we took the worst-case scenario and made sure it wasn't going to be a problem. According to ops it wasn't, so we proceeded. In that worst-case scenario, we never get a cache hit, every grouphash is eligible to be cached (has a group already assigned), and every event has two different grouphashes associated with it. In that case, every event we see would lead to two new hashes being added to the cache. We knew that wasn't actually going to be true - we definitely _will_ get hits, we definitely _will_ encounter new grouphashes (which we won't put in the cache because they aren't yet assigned to a group), and not every event has two grouphashes. That said, until we started doing the actual caching, we didn't know just how far from the truth that worst-case scenario was going to be. Now that we have rolled it out, we can see that in fact hashes are being added to the cache only about 12% of the time. This means we can safely play around with longer cache retention times to see if it will meaningfully improve our hit rate. To make such experimentation easier, this PR introduces the ability to have the retention time be variable, so that we can compare different values against each other. Instead of a fixed number, we now will be choosing from a list of possible times. So that we can accurately report the expiry time which was used, we're also for now going to cache the time alongside the value itself. (I tried to cache them together as a tuple, but that ended up not working, so instead the time is cached separately with `_expiry` added onto the end of the cache key.) Once we find the happy medium between hit rate and storage space, we can back out these changes and go back to using a fixed number. I've therefore left TODOs explaining how to do that so it will be easy to switch back.
1 parent 86a8682 commit 5e34163

File tree

3 files changed

+60
-9
lines changed

3 files changed

+60
-9
lines changed

src/sentry/grouping/ingest/hashing.py

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import copy
44
import logging
55
from collections.abc import Iterable, Sequence
6-
from typing import TYPE_CHECKING
6+
from typing import TYPE_CHECKING, Literal
77

88
import sentry_sdk
99
from django.core.cache import cache
@@ -209,28 +209,41 @@ def find_grouphash_with_group(
209209
return None
210210

211211

212+
# TODO: This can go once we've settled on an expiry time for each cache
213+
def _get_cache_expiry(cache_key: str, cache_type: Literal["existence", "object"]) -> int:
214+
option_name = f"grouping.ingest_grouphash_{cache_type}_cache_expiry.trial_values"
215+
possible_cache_expiries = options.get(option_name)
216+
return possible_cache_expiries[hash(cache_key) % len(possible_cache_expiries)]
217+
218+
212219
def _grouphash_exists_for_hash_value(hash_value: str, project: Project, use_caching: bool) -> bool:
213220
"""
214221
Check whether a given hash value has a corresponding `GroupHash` record in the database.
215222
216223
If `use_caching` is True, cache the boolean result. Cache retention is controlled by the
217224
`grouping.ingest_grouphash_existence_cache_expiry` option.
225+
226+
TODO: That last sentence is temporarily untrue. While we're experimenting with retention
227+
periods, cache retention is actually controlled by the helper `_get_cache_expiry`.
218228
"""
219229
with metrics.timer(
220230
"grouping.get_or_create_grouphashes.check_secondary_hash_existence"
221231
) as metrics_tags:
222232
if use_caching:
223233
cache_key = get_grouphash_existence_cache_key(hash_value, project.id)
224-
cache_expiry = options.get("grouping.ingest_grouphash_existence_cache_expiry")
225234

226235
grouphash_exists = cache.get(cache_key)
236+
cache_expiry_used_when_setting = cache.get(
237+
cache_key + "_expiry",
238+
default=options.get("grouping.ingest_grouphash_existence_cache_expiry"),
239+
)
227240
got_cache_hit = grouphash_exists is not None
228241
metrics_tags["cache_result"] = "hit" if got_cache_hit else "miss"
229242

230243
if got_cache_hit:
231244
metrics_tags["grouphash_exists"] = grouphash_exists
232245
# TODO: Temporary tag to let us compare hit rates across different retention periods
233-
metrics_tags["expiry_seconds"] = cache_expiry
246+
metrics_tags["expiry_seconds"] = cache_expiry_used_when_setting
234247

235248
return grouphash_exists
236249

@@ -240,7 +253,13 @@ def _grouphash_exists_for_hash_value(hash_value: str, project: Project, use_cach
240253
metrics_tags["grouphash_exists"] = grouphash_exists
241254
metrics_tags["cache_set"] = True
242255

256+
# TODO: This can go back to being just
257+
# cache_expiry = options.get("grouping.ingest_grouphash_existence_cache_expiry")
258+
# cache.set(cache_key, grouphash_exists, cache_expiry)
259+
# once we've settled on a good retention period
260+
cache_expiry = _get_cache_expiry(cache_key, cache_type="existence")
243261
cache.set(cache_key, grouphash_exists, cache_expiry)
262+
cache.set(cache_key + "_expiry", cache_expiry, cache_expiry)
244263

245264
return grouphash_exists
246265

@@ -255,21 +274,27 @@ def _get_or_create_single_grouphash(
255274
`GroupHash` object. (Grouphashes without a group aren't cached because their data is about to
256275
change when a group is assigned.) Cache retention is controlled by the
257276
`grouping.ingest_grouphash_object_cache_expiry` option.
277+
278+
TODO: That last sentence is temporarily untrue. While we're experimenting with retention
279+
periods, cache retention is actually controlled by the helper `_get_cache_expiry`.
258280
"""
259281
with metrics.timer(
260282
"grouping.get_or_create_grouphashes.get_or_create_grouphash"
261283
) as metrics_tags:
262284
if use_caching:
263285
cache_key = get_grouphash_object_cache_key(hash_value, project.id)
264-
cache_expiry = options.get("grouping.ingest_grouphash_object_cache_expiry")
265286

266287
grouphash = cache.get(cache_key)
288+
cache_expiry_used_when_setting = cache.get(
289+
cache_key + "_expiry",
290+
default=options.get("grouping.ingest_grouphash_object_cache_expiry"),
291+
)
267292
got_cache_hit = grouphash is not None
268293
metrics_tags["cache_result"] = "hit" if got_cache_hit else "miss"
269294

270295
if got_cache_hit:
271296
# TODO: Temporary tag to let us compare hit rates across different retention periods
272-
metrics_tags["expiry_seconds"] = cache_expiry
297+
metrics_tags["expiry_seconds"] = cache_expiry_used_when_setting
273298

274299
return (grouphash, False)
275300

@@ -281,7 +306,13 @@ def _get_or_create_single_grouphash(
281306
if use_caching and grouphash.group_id is not None:
282307
metrics_tags["cache_set"] = True
283308

309+
# TODO: This can go back to being just
310+
# cache_expiry = options.get("grouping.ingest_grouphash_object_cache_expiry")
311+
# cache.set(cache_key, grouphash, cache_expiry)
312+
# once we've settled on a good retention period
313+
cache_expiry = _get_cache_expiry(cache_key, cache_type="object")
284314
cache.set(cache_key, grouphash, cache_expiry)
315+
cache.set(cache_key + "_expiry", cache_expiry, cache_expiry)
285316

286317
return (grouphash, created)
287318

src/sentry/options/defaults.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2914,6 +2914,22 @@
29142914
flags=FLAG_AUTOMATOR_MODIFIABLE,
29152915
)
29162916

2917+
# TODO: Temporary options to let us play around with expiry times and see what hit rates they give
2918+
# us. Once we've decided, we can stick our values into the two expiry options above and get rid of
2919+
# these two options.
2920+
register(
2921+
"grouping.ingest_grouphash_existence_cache_expiry.trial_values",
2922+
type=Sequence,
2923+
default=[60, 120, 600],
2924+
flags=FLAG_AUTOMATOR_MODIFIABLE,
2925+
)
2926+
register(
2927+
"grouping.ingest_grouphash_object_cache_expiry.trial_values",
2928+
type=Sequence,
2929+
default=[60, 120, 600],
2930+
flags=FLAG_AUTOMATOR_MODIFIABLE,
2931+
)
2932+
29172933

29182934
# Sample rate for double writing to experimental dsn
29192935
register(

tests/sentry/event_manager/test_event_manager_grouping.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import pytest
1111
from django.core.cache import cache
1212

13-
from sentry import audit_log, options
13+
from sentry import audit_log
1414
from sentry.conf.server import DEFAULT_GROUPING_CONFIG, SENTRY_GROUPING_CONFIG_TRANSITION_DURATION
1515
from sentry.event_manager import _get_updated_group_title
1616
from sentry.eventtypes.base import DefaultEvent
@@ -20,7 +20,7 @@
2020
get_grouphash_object_cache_key,
2121
)
2222
from sentry.grouping.ingest.config import update_or_set_grouping_config_if_needed
23-
from sentry.grouping.ingest.hashing import get_or_create_grouphashes
23+
from sentry.grouping.ingest.hashing import _get_cache_expiry, get_or_create_grouphashes
2424
from sentry.models.auditlogentry import AuditLogEntry
2525
from sentry.models.group import Group
2626
from sentry.models.grouphash import GroupHash
@@ -400,13 +400,17 @@ def run_test(
400400
grouping_config_id = "old_config"
401401
grouping_config_option = "sentry:secondary_grouping_config"
402402
cache_key = get_grouphash_existence_cache_key(hash_value, self.project.id)
403-
cache_expiry = options.get("grouping.ingest_grouphash_existence_cache_expiry")
403+
# TODO: This can go back to being `options.get("grouping.ingest_grouphash_existence_cache_expiry")`
404+
# once we've settled on a retention period
405+
cache_expiry = _get_cache_expiry(cache_key, cache_type="existence")
404406
cached_value: Any = grouphash_exists
405407
else:
406408
grouping_config_id = "new_config"
407409
grouping_config_option = "sentry:grouping_config"
408410
cache_key = get_grouphash_object_cache_key(hash_value, self.project.id)
409-
cache_expiry = options.get("grouping.ingest_grouphash_object_cache_expiry")
411+
# TODO: This can go back to being `options.get("grouping.ingest_grouphash_object_cache_expiry")`
412+
# once we've settled on a retention period
413+
cache_expiry = _get_cache_expiry(cache_key, cache_type="object")
410414
cached_value = grouphash
411415

412416
self.project.update_option(grouping_config_option, grouping_config_id)

0 commit comments

Comments
 (0)