Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit ac5c221

Browse files
Stagger send presence to remotes (#10398)
This helps with performance: trying to connect to thousands of hosts at once can consume a lot of CPU (due to TLS, etc.). Co-authored-by: Brendan Abolivier <babolivier@matrix.org>
1 parent 5ecad4e commit ac5c221

File tree

4 files changed

+116
-5
lines changed

4 files changed

+116
-5
lines changed

changelog.d/10398.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Stagger sending of presence updates to remote servers, reducing CPU spikes caused by starting many connections to remote servers at once.

synapse/federation/sender/__init__.py

Lines changed: 94 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,12 @@
1414

1515
import abc
1616
import logging
17+
from collections import OrderedDict
1718
from typing import TYPE_CHECKING, Dict, Hashable, Iterable, List, Optional, Set, Tuple
1819

20+
import attr
1921
from prometheus_client import Counter
22+
from typing_extensions import Literal
2023

2124
from twisted.internet import defer
2225

@@ -33,8 +36,12 @@
3336
event_processing_loop_room_count,
3437
events_processed_counter,
3538
)
36-
from synapse.metrics.background_process_metrics import run_as_background_process
39+
from synapse.metrics.background_process_metrics import (
40+
run_as_background_process,
41+
wrap_as_background_process,
42+
)
3743
from synapse.types import JsonDict, ReadReceipt, RoomStreamToken
44+
from synapse.util import Clock
3845
from synapse.util.metrics import Measure
3946

4047
if TYPE_CHECKING:
@@ -137,6 +144,84 @@ async def get_replication_rows(
137144
raise NotImplementedError()
138145

139146

147+
@attr.s
class _PresenceQueue:
    """Ordered set of destinations whose per-destination queues hold new
    presence updates and still need to be woken up.

    Destinations are woken one at a time with a short pause in between, rather
    than all together, so that we don't try to open TLS connections to many
    hosts at the same moment and pin the CPU.
    """

    # Upper bound, in seconds, on how long a destination should sit in the
    # queue before its per-destination queue is woken up.
    _MAX_TIME_IN_QUEUE = 30.0

    # Upper bound, in seconds, on the pause between waking up two consecutive
    # per-destination queues.
    _MAX_DELAY = 0.1

    sender: "FederationSender" = attr.ib()
    clock: Clock = attr.ib()
    queue: "OrderedDict[str, Literal[None]]" = attr.ib(factory=OrderedDict)
    processing: bool = attr.ib(default=False)

    def add_to_queue(self, destination: str) -> None:
        """Record that a destination needs waking up, starting the drain loop
        if it is not already running.

        Args:
            destination: The remote server whose queue should be woken up.
        """
        # The OrderedDict is used as an ordered set; the value is irrelevant.
        self.queue[destination] = None

        if self.processing:
            # The running drain loop will pick this destination up.
            return

        self._handle()

    @wrap_as_background_process("_PresenceQueue.handle")
    async def _handle(self) -> None:
        """Background process that wakes up queued destinations one by one
        until the queue is empty.
        """
        if not self.queue:
            return

        assert not self.processing
        self.processing = True

        try:
            # Pick a pause short enough that everything currently queued is
            # handled within _MAX_TIME_IN_QUEUE seconds, but never longer than
            # _MAX_DELAY so that a nearly empty queue still drains promptly.
            delay = min(self._MAX_DELAY, self._MAX_TIME_IN_QUEUE / len(self.queue))

            while self.queue:
                # Take the oldest entry first (FIFO order).
                destination, _ = self.queue.popitem(last=False)
                per_destination_queue = self.sender._get_per_destination_queue(
                    destination
                )

                # If the flag is unset the per-destination queue has already
                # been woken up, so move straight on without pausing.
                if per_destination_queue._new_data_to_send:
                    per_destination_queue.attempt_new_transaction()

                    await self.clock.sleep(delay)

                    if self.queue:
                        # Destinations may have been added while we slept, so
                        # the pause may need to shrink (it never grows) to
                        # keep everything within _MAX_TIME_IN_QUEUE seconds.
                        delay = min(
                            delay, self._MAX_TIME_IN_QUEUE / len(self.queue)
                        )
        finally:
            self.processing = False
223+
224+
140225
class FederationSender(AbstractFederationSender):
141226
def __init__(self, hs: "HomeServer"):
142227
self.hs = hs
@@ -208,6 +293,8 @@ def __init__(self, hs: "HomeServer"):
208293

209294
self._external_cache = hs.get_external_cache()
210295

296+
self._presence_queue = _PresenceQueue(self, self.clock)
297+
211298
def _get_per_destination_queue(self, destination: str) -> PerDestinationQueue:
212299
"""Get or create a PerDestinationQueue for the given destination
213300
@@ -517,7 +604,12 @@ def send_presence_to_destinations(
517604
self._instance_name, destination
518605
):
519606
continue
520-
self._get_per_destination_queue(destination).send_presence(states)
607+
608+
self._get_per_destination_queue(destination).send_presence(
609+
states, start_loop=False
610+
)
611+
612+
self._presence_queue.add_to_queue(destination)
521613

522614
def build_and_send_edu(
523615
self,

synapse/federation/sender/per_destination_queue.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,14 +171,24 @@ def send_pdu(self, pdu: EventBase) -> None:
171171

172172
self.attempt_new_transaction()
173173

174-
def send_presence(self, states: Iterable[UserPresenceState]) -> None:
175-
"""Add presence updates to the queue. Start the transmission loop if necessary.
174+
def send_presence(
175+
self, states: Iterable[UserPresenceState], start_loop: bool = True
176+
) -> None:
177+
"""Add presence updates to the queue.
178+
179+
Args:
180+
states: Presence updates to send
181+
start_loop: Whether to start the transmission loop if not already
182+
running.
176183
177184
Args:
178185
states: presence to send
179186
"""
180187
self._pending_presence.update({state.user_id: state for state in states})
181-
self.attempt_new_transaction()
188+
self._new_data_to_send = True
189+
190+
if start_loop:
191+
self.attempt_new_transaction()
182192

183193
def queue_read_receipt(self, receipt: ReadReceipt) -> None:
184194
"""Add a RR to the list to be sent. Doesn't start the transmission loop yet

tests/events/test_presence_router.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,10 @@ def test_send_local_online_presence_to_with_module(self):
285285
presence_updates, _ = sync_presence(self, self.presence_receiving_user_two_id)
286286
self.assertEqual(len(presence_updates), 3)
287287

288+
# We stagger sending of presence, so we need to wait a bit for them to
289+
# get sent out.
290+
self.reactor.advance(60)
291+
288292
# Test that sending to a remote user works
289293
remote_user_id = "@far_away_person:island"
290294

@@ -301,6 +305,10 @@ def test_send_local_online_presence_to_with_module(self):
301305
self.module_api.send_local_online_presence_to([remote_user_id])
302306
)
303307

308+
# We stagger sending of presence, so we need to wait a bit for them to
309+
# get sent out.
310+
self.reactor.advance(60)
311+
304312
# Check that the expected presence updates were sent
305313
# We explicitly compare using sets as we expect that calling
306314
# module_api.send_local_online_presence_to will create a presence

0 commit comments

Comments
 (0)