This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 0b87eb8

Make DictionaryCache have better expiry properties (#13292)
1 parent 13341dd commit 0b87eb8

7 files changed: +358 -43 lines

changelog.d/13292.misc (+1)

@@ -0,0 +1 @@
+Make `DictionaryCache` expire full entries if they haven't been queried in a while, even if specific keys have been queried recently.

synapse/storage/databases/state/store.py (+8 -1)

@@ -202,7 +202,14 @@ def _get_state_for_group_using_cache(
                 requests state from the cache, if False we need to query the DB for the
                 missing state.
         """
-        cache_entry = cache.get(group)
+        # If we are asked explicitly for a subset of keys, we only ask for those
+        # from the cache. This ensures that the `DictionaryCache` can make
+        # better decisions about what to cache and what to expire.
+        dict_keys = None
+        if not state_filter.has_wildcards():
+            dict_keys = state_filter.concrete_types()
+
+        cache_entry = cache.get(group, dict_keys=dict_keys)
         state_dict_ids = cache_entry.value

         if cache_entry.full or state_filter.is_full():
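
As an aside, a rough sketch of the two branches above (not part of the commit; `StateFilter.from_types` and the example state keys are assumptions, and the import path varies between Synapse versions):

    # Illustrative only: example values, not taken from this commit.
    from synapse.storage.state import StateFilter  # import path may differ by version

    # No wildcards: ask the DictionaryCache only for the concrete
    # (event type, state key) pairs.
    sf = StateFilter.from_types([("m.room.member", "@alice:example.com")])
    assert not sf.has_wildcards()
    dict_keys = sf.concrete_types()  # e.g. [("m.room.member", "@alice:example.com")]

    # A wildcard state key means the keys can't be enumerated, so the code
    # above leaves dict_keys as None and requests the full dict instead.
    sf = StateFilter.from_types([("m.room.member", None)])
    assert sf.has_wildcards()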

synapse/util/caches/dictionary_cache.py (+187 -31)

@@ -14,11 +14,13 @@
 import enum
 import logging
 import threading
-from typing import Any, Dict, Generic, Iterable, Optional, Set, TypeVar
+from typing import Any, Dict, Generic, Iterable, Optional, Set, Tuple, TypeVar, Union

 import attr
+from typing_extensions import Literal

 from synapse.util.caches.lrucache import LruCache
+from synapse.util.caches.treecache import TreeCache

 logger = logging.getLogger(__name__)

@@ -33,10 +35,12 @@

 # This class can't be generic because it uses slots with attrs.
 # See: https://github.com/python-attrs/attrs/issues/313
-@attr.s(slots=True, auto_attribs=True)
+@attr.s(slots=True, frozen=True, auto_attribs=True)
 class DictionaryEntry:  # should be: Generic[DKT, DV].
     """Returned when getting an entry from the cache

+    If `full` is true then `known_absent` will be the empty set.
+
     Attributes:
         full: Whether the cache has the full or dict or just some keys.
             If not full then not all requested keys will necessarily be present
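
For illustration (not part of the commit), a minimal sketch of what making `DictionaryEntry` frozen means for callers:

    # Illustrative sketch: DictionaryEntry instances are now immutable.
    from synapse.util.caches.dictionary_cache import DictionaryEntry

    entry = DictionaryEntry(full=False, known_absent={3}, value={1: "a"})
    # entry.value = {}  # would now raise attr.exceptions.FrozenInstanceError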

@@ -53,20 +57,90 @@ def __len__(self) -> int:
         return len(self.value)


+class _FullCacheKey(enum.Enum):
+    """The key we use to cache the full dict."""
+
+    KEY = object()
+
+
 class _Sentinel(enum.Enum):
     # defining a sentinel in this way allows mypy to correctly handle the
     # type of a dictionary lookup.
     sentinel = object()


+class _PerKeyValue(Generic[DV]):
+    """The cached value of a dictionary key. If `value` is the sentinel,
+    indicates that the requested key is known to *not* be in the full dict.
+    """
+
+    __slots__ = ["value"]
+
+    def __init__(self, value: Union[DV, Literal[_Sentinel.sentinel]]) -> None:
+        self.value = value
+
+    def __len__(self) -> int:
+        # We add a `__len__` implementation as we use this class in a cache
+        # where the values are variable length.
+        return 1
+
+
 class DictionaryCache(Generic[KT, DKT, DV]):
     """Caches key -> dictionary lookups, supporting caching partial dicts, i.e.
     fetching a subset of dictionary keys for a particular key.
+
+    This cache has two levels of key. First there is the "cache key" (of type
+    `KT`), which maps to a dict. The keys to that dict are the "dict key" (of
+    type `DKT`). The overall structure is therefore `KT->DKT->DV`. For
+    example, it might look like:
+
+        {
+            1: { 1: "a", 2: "b" },
+            2: { 1: "c" },
+        }
+
+    It is possible to look up either individual dict keys, or the *complete*
+    dict for a given cache key.
+
+    Each dict item, and the complete dict, is treated as a separate LRU
+    entry for the purpose of cache expiry. For example, given:
+        dict_cache.get(1, None)  -> DictionaryEntry({1: "a", 2: "b"})
+        dict_cache.get(1, [1])   -> DictionaryEntry({1: "a"})
+        dict_cache.get(1, [2])   -> DictionaryEntry({2: "b"})
+
+    ... then the cache entry for the complete dict will expire first,
+    followed by the cache entry for the '1' dict key, and finally that
+    for the '2' dict key.
     """

     def __init__(self, name: str, max_entries: int = 1000):
-        self.cache: LruCache[KT, DictionaryEntry] = LruCache(
-            max_size=max_entries, cache_name=name, size_callback=len
+        # We use a single LruCache to store two different types of entries:
+        #   1. Map from (key, dict_key) -> dict value (or sentinel, indicating
+        #      the key doesn't exist in the dict); and
+        #   2. Map from (key, _FullCacheKey.KEY) -> full dict.
+        #
+        # The former is used when explicit keys of the dictionary are looked up,
+        # and the latter when the full dictionary is requested.
+        #
+        # When explicit keys are requested and not in the cache, we then look
+        # to see if we have the full dict and use that if we do. If found in the
+        # full dict, each key is added into the cache.
+        #
+        # This set-up allows the `LruCache` to prune the full dict entries if
+        # they haven't been used in a while, even when there have been recent
+        # queries for subsets of the dict.
+        #
+        # Typing:
+        #   * A key of `(KT, DKT)` has a value of `_PerKeyValue`
+        #   * A key of `(KT, _FullCacheKey.KEY)` has a value of `Dict[DKT, DV]`
+        self.cache: LruCache[
+            Tuple[KT, Union[DKT, Literal[_FullCacheKey.KEY]]],
+            Union[_PerKeyValue, Dict[DKT, DV]],
+        ] = LruCache(
+            max_size=max_entries,
+            cache_name=name,
+            cache_type=TreeCache,
+            size_callback=len,
         )

         self.name = name
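
To make the comment above concrete, a small sketch (not part of the commit) of how entries end up laid out in the underlying `LruCache`; the toy values match the class docstring:

    # Illustrative sketch: the full dict and the per-key values are separate
    # LRU entries in the same cache, keyed by tuples.
    from synapse.util.caches.dictionary_cache import DictionaryCache

    cache: DictionaryCache[int, int, str] = DictionaryCache("example_layout")
    cache.update(cache.sequence, key=1, value={1: "a", 2: "b"})  # caches the full dict

    # Internally this is stored roughly as:
    #   (1, _FullCacheKey.KEY) -> {1: "a", 2: "b"}
    # and a later cache.get(1, dict_keys=[1, 3]) would additionally populate:
    #   (1, 1) -> _PerKeyValue("a")
    #   (1, 3) -> _PerKeyValue(_Sentinel.sentinel)  # known to be absent
    # Each entry ages out of the LRU independently, which is the point of the change.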

@@ -91,23 +165,83 @@ def get(
         Args:
             key
             dict_keys: If given a set of keys then return only those keys
-                that exist in the cache.
+                that exist in the cache. If None then returns the full dict
+                if it is in the cache.

         Returns:
-            DictionaryEntry
+            DictionaryEntry: If `dict_keys` is not None then `DictionaryEntry`
+            will contain the keys that are in the cache. If None then will
+            either return the full dict if in the cache, or the empty dict
+            (with `full` set to False) if it isn't.
         """
-        entry = self.cache.get(key, _Sentinel.sentinel)
-        if entry is not _Sentinel.sentinel:
-            if dict_keys is None:
-                return DictionaryEntry(
-                    entry.full, entry.known_absent, dict(entry.value)
-                )
+        if dict_keys is None:
+            # The caller wants the full set of dictionary keys for this cache key
+            return self._get_full_dict(key)
+
+        # We are being asked for a subset of keys.
+
+        # First go and check for each requested dict key in the cache, tracking
+        # which we couldn't find.
+        values = {}
+        known_absent = set()
+        missing = []
+        for dict_key in dict_keys:
+            entry = self.cache.get((key, dict_key), _Sentinel.sentinel)
+            if entry is _Sentinel.sentinel:
+                missing.append(dict_key)
+                continue
+
+            assert isinstance(entry, _PerKeyValue)
+
+            if entry.value is _Sentinel.sentinel:
+                known_absent.add(dict_key)
             else:
-                return DictionaryEntry(
-                    entry.full,
-                    entry.known_absent,
-                    {k: entry.value[k] for k in dict_keys if k in entry.value},
-                )
+                values[dict_key] = entry.value
+
+        # If we found everything we can return immediately.
+        if not missing:
+            return DictionaryEntry(False, known_absent, values)
+
+        # We are missing some keys, so check if we happen to have the full dict in
+        # the cache.
+        #
+        # We don't update the last access time for this cache fetch, as we
+        # aren't explicitly interested in the full dict and so we don't want
+        # requests for explicit dict keys to keep the full dict in the cache.
+        entry = self.cache.get(
+            (key, _FullCacheKey.KEY),
+            _Sentinel.sentinel,
+            update_last_access=False,
+        )
+        if entry is _Sentinel.sentinel:
+            # Not in the cache, return the subset of keys we found.
+            return DictionaryEntry(False, known_absent, values)
+
+        # We have the full dict!
+        assert isinstance(entry, dict)
+
+        for dict_key in missing:
+            # We explicitly add each dict key to the cache, so that cache hit
+            # rates and LRU times for each key can be tracked separately.
+            value = entry.get(dict_key, _Sentinel.sentinel)  # type: ignore[arg-type]
+            self.cache[(key, dict_key)] = _PerKeyValue(value)
+
+            if value is not _Sentinel.sentinel:
+                values[dict_key] = value
+
+        return DictionaryEntry(True, set(), values)
+
+    def _get_full_dict(
+        self,
+        key: KT,
+    ) -> DictionaryEntry:
+        """Fetch the full dict for the given key."""
+
+        # First we check if we have cached the full dict.
+        entry = self.cache.get((key, _FullCacheKey.KEY), _Sentinel.sentinel)
+        if entry is not _Sentinel.sentinel:
+            assert isinstance(entry, dict)
+            return DictionaryEntry(True, set(), entry)

         return DictionaryEntry(False, set(), {})
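
A short usage sketch of the subset path above (not part of the commit; the data is made up, and the behaviour described follows from the code in this hunk):

    # Illustrative sketch: subset lookups, fallback to the full dict, and backfill.
    from synapse.util.caches.dictionary_cache import DictionaryCache

    cache: DictionaryCache[int, int, str] = DictionaryCache("example_get")
    cache.update(cache.sequence, key=1, value={1: "a", 2: "b"})  # full dict cached

    # Per-key entries are missing, so get() falls back to the cached full dict
    # (without bumping its LRU position), backfills (1, 1) and (1, 3), and
    # reports full=True because the full dict was consulted.
    entry = cache.get(1, dict_keys=[1, 3])
    assert entry.full
    assert entry.value == {1: "a"}

    # The same request again is served purely from the per-key entries, so
    # full=False and key 3 is reported as known absent.
    entry = cache.get(1, dict_keys=[1, 3])
    assert not entry.full
    assert entry.value == {1: "a"}
    assert entry.known_absent == {3}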

@@ -117,7 +251,13 @@ def invalidate(self, key: KT) -> None:
         # Increment the sequence number so that any SELECT statements that
         # raced with the INSERT don't update the cache (SYN-369)
         self.sequence += 1
-        self.cache.pop(key, None)
+
+        # We want to drop all information about the dict for the given key, so
+        # we use `del_multi` to delete it all in one go.
+        #
+        # We ignore the type error here: `del_multi` accepts a truncated key
+        # (when the key type is a tuple).
+        self.cache.del_multi((key,))  # type: ignore[arg-type]

     def invalidate_all(self) -> None:
         self.check_thread()
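
For illustration (not part of the commit), a sketch of why the truncated tuple key is enough: with the `TreeCache` backing, `(key,)` names the whole subtree of entries for that cache key, so one `del_multi` drops the full dict and every per-key entry:

    # Illustrative sketch: invalidate() drops all entries for the cache key.
    from synapse.util.caches.dictionary_cache import DictionaryCache

    cache: DictionaryCache[int, int, str] = DictionaryCache("example_invalidate")
    cache.update(cache.sequence, key=1, value={1: "a", 2: "b"})
    cache.get(1, dict_keys=[1])  # backfills the (1, 1) per-key entry

    cache.invalidate(1)          # removes (1, _FullCacheKey.KEY) and (1, 1)

    entry = cache.get(1, dict_keys=[1])
    assert not entry.full
    assert entry.value == {} and entry.known_absent == set()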

@@ -131,7 +271,16 @@ def update(
         value: Dict[DKT, DV],
         fetched_keys: Optional[Iterable[DKT]] = None,
     ) -> None:
-        """Updates the entry in the cache
+        """Updates the entry in the cache.
+
+        Note: This does *not* invalidate any existing entries for the `key`.
+        In particular, if we add an entry for the cached "full dict" with
+        `fetched_keys=None`, existing entries for individual dict keys are
+        not invalidated. Likewise, adding entries for individual keys does
+        not invalidate any cached value for the full dict.
+
+        In other words: if the underlying data is *changed*, the cache must
+        be explicitly invalidated via `.invalidate()`.

         Args:
             sequence

@@ -149,20 +298,27 @@ def update(
             # Only update the cache if the caches sequence number matches the
             # number that the cache had before the SELECT was started (SYN-369)
             if fetched_keys is None:
-                self._insert(key, value, set())
+                self.cache[(key, _FullCacheKey.KEY)] = value
             else:
-                self._update_or_insert(key, value, fetched_keys)
+                self._update_subset(key, value, fetched_keys)

-    def _update_or_insert(
-        self, key: KT, value: Dict[DKT, DV], known_absent: Iterable[DKT]
+    def _update_subset(
+        self, key: KT, value: Dict[DKT, DV], fetched_keys: Iterable[DKT]
     ) -> None:
-        # We pop and reinsert as we need to tell the cache the size may have
-        # changed
+        """Add the given dictionary values as explicit keys in the cache.
+
+        Args:
+            key: top-level cache key
+            value: The dictionary with all the values that we should cache
+            fetched_keys: The full set of dict keys that were looked up. Any keys
+                here not in `value` should be marked as "known absent".
+        """
+
+        for dict_key, dict_value in value.items():
+            self.cache[(key, dict_key)] = _PerKeyValue(dict_value)

-        entry: DictionaryEntry = self.cache.pop(key, DictionaryEntry(False, set(), {}))
-        entry.value.update(value)
-        entry.known_absent.update(known_absent)
-        self.cache[key] = entry
+        for dict_key in fetched_keys:
+            if dict_key in value:
+                continue

-    def _insert(self, key: KT, value: Dict[DKT, DV], known_absent: Set[DKT]) -> None:
-        self.cache[key] = DictionaryEntry(True, known_absent, value)
+            self.cache[(key, dict_key)] = _PerKeyValue(_Sentinel.sentinel)
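
Finally, a brief sketch (not part of the commit) of how `_update_subset` records "known absent" keys, and of the invalidation caveat from the `update` docstring:

    # Illustrative sketch: caching a partial fetch. Keys {1, 2, 3} were looked
    # up but only key 1 existed, so keys 2 and 3 become "known absent".
    from synapse.util.caches.dictionary_cache import DictionaryCache

    cache: DictionaryCache[int, int, str] = DictionaryCache("example_update")
    cache.update(cache.sequence, key=1, value={1: "a"}, fetched_keys={1, 2, 3})

    entry = cache.get(1, dict_keys=[2, 3])
    assert not entry.full
    assert entry.value == {}
    assert entry.known_absent == {2, 3}

    # As the docstring above notes, update() never invalidates existing
    # entries: if the underlying data changes, call cache.invalidate(1) first.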
