Skip to content

Commit a8d7f54

Browse files
erikjohnstonphil-flex
authored andcommitted
Don't relay REMOTE_SERVER_UP cmds to same conn. (matrix-org#7352)
For direct TCP connections we need the master to relay REMOTE_SERVER_UP commands to the other connections so that all instances get notified about it. The old implementation just relayed to all connections, assuming that sending back to the original sender of the command was safe. This is not true for redis, where commands sent get echoed back to the sender, which was causing master to effectively infinite loop sending and then re-receiving REMOTE_SERVER_UP commands that it sent. The fix is to ensure that we only relay to *other* connections and not to the connection we received the notification from. Fixes matrix-org#7334.
1 parent df018db commit a8d7f54

File tree

6 files changed

+114
-25
lines changed

6 files changed

+114
-25
lines changed

changelog.d/7352.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add support for running replication over Redis when using workers.

synapse/notifier.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -220,12 +220,6 @@ def add_replication_callback(self, cb: Callable[[], None]):
220220
"""
221221
self.replication_callbacks.append(cb)
222222

223-
def add_remote_server_up_callback(self, cb: Callable[[str], None]):
224-
"""Add a callback that will be called when synapse detects a server
225-
has been
226-
"""
227-
self.remote_server_up_callbacks.append(cb)
228-
229223
def on_new_room_event(
230224
self, event, room_stream_id, max_room_stream_id, extra_users=[]
231225
):
@@ -544,6 +538,3 @@ def notify_remote_server_up(self, server: str):
544538
# circular dependencies.
545539
if self.federation_sender:
546540
self.federation_sender.wake_destination(server)
547-
548-
for cb in self.remote_server_up_callbacks:
549-
cb(server)

synapse/replication/tcp/handler.py

Lines changed: 49 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,6 @@ def __init__(self, hs):
117117
self._server_notices_sender = None
118118
if self._is_master:
119119
self._server_notices_sender = hs.get_server_notices_sender()
120-
self._notifier.add_remote_server_up_callback(self.send_remote_server_up)
121120

122121
def start_replication(self, hs):
123122
"""Helper method to start a replication connection to the remote server
@@ -163,7 +162,7 @@ def start_replication(self, hs):
163162
port = hs.config.worker_replication_port
164163
hs.get_reactor().connectTCP(host, port, self._factory)
165164

166-
async def on_REPLICATE(self, cmd: ReplicateCommand):
165+
async def on_REPLICATE(self, conn: AbstractConnection, cmd: ReplicateCommand):
167166
# We only want to announce positions by the writer of the streams.
168167
# Currently this is just the master process.
169168
if not self._is_master:
@@ -173,25 +172,31 @@ async def on_REPLICATE(self, cmd: ReplicateCommand):
173172
current_token = stream.current_token()
174173
self.send_command(PositionCommand(stream_name, current_token))
175174

176-
async def on_USER_SYNC(self, cmd: UserSyncCommand):
175+
async def on_USER_SYNC(self, conn: AbstractConnection, cmd: UserSyncCommand):
177176
user_sync_counter.inc()
178177

179178
if self._is_master:
180179
await self._presence_handler.update_external_syncs_row(
181180
cmd.instance_id, cmd.user_id, cmd.is_syncing, cmd.last_sync_ms
182181
)
183182

184-
async def on_CLEAR_USER_SYNC(self, cmd: ClearUserSyncsCommand):
183+
async def on_CLEAR_USER_SYNC(
184+
self, conn: AbstractConnection, cmd: ClearUserSyncsCommand
185+
):
185186
if self._is_master:
186187
await self._presence_handler.update_external_syncs_clear(cmd.instance_id)
187188

188-
async def on_FEDERATION_ACK(self, cmd: FederationAckCommand):
189+
async def on_FEDERATION_ACK(
190+
self, conn: AbstractConnection, cmd: FederationAckCommand
191+
):
189192
federation_ack_counter.inc()
190193

191194
if self._federation_sender:
192195
self._federation_sender.federation_ack(cmd.token)
193196

194-
async def on_REMOVE_PUSHER(self, cmd: RemovePusherCommand):
197+
async def on_REMOVE_PUSHER(
198+
self, conn: AbstractConnection, cmd: RemovePusherCommand
199+
):
195200
remove_pusher_counter.inc()
196201

197202
if self._is_master:
@@ -201,7 +206,9 @@ async def on_REMOVE_PUSHER(self, cmd: RemovePusherCommand):
201206

202207
self._notifier.on_new_replication_data()
203208

204-
async def on_INVALIDATE_CACHE(self, cmd: InvalidateCacheCommand):
209+
async def on_INVALIDATE_CACHE(
210+
self, conn: AbstractConnection, cmd: InvalidateCacheCommand
211+
):
205212
invalidate_cache_counter.inc()
206213

207214
if self._is_master:
@@ -211,7 +218,7 @@ async def on_INVALIDATE_CACHE(self, cmd: InvalidateCacheCommand):
211218
cmd.cache_func, tuple(cmd.keys)
212219
)
213220

214-
async def on_USER_IP(self, cmd: UserIpCommand):
221+
async def on_USER_IP(self, conn: AbstractConnection, cmd: UserIpCommand):
215222
user_ip_cache_counter.inc()
216223

217224
if self._is_master:
@@ -227,7 +234,7 @@ async def on_USER_IP(self, cmd: UserIpCommand):
227234
if self._server_notices_sender:
228235
await self._server_notices_sender.on_user_ip(cmd.user_id)
229236

230-
async def on_RDATA(self, cmd: RdataCommand):
237+
async def on_RDATA(self, conn: AbstractConnection, cmd: RdataCommand):
231238
stream_name = cmd.stream_name
232239
inbound_rdata_count.labels(stream_name).inc()
233240

@@ -278,7 +285,7 @@ async def on_rdata(self, stream_name: str, token: int, rows: list):
278285
logger.debug("Received rdata %s -> %s", stream_name, token)
279286
await self._replication_data_handler.on_rdata(stream_name, token, rows)
280287

281-
async def on_POSITION(self, cmd: PositionCommand):
288+
async def on_POSITION(self, conn: AbstractConnection, cmd: PositionCommand):
282289
stream = self._streams.get(cmd.stream_name)
283290
if not stream:
284291
logger.error("Got POSITION for unknown stream: %s", cmd.stream_name)
@@ -332,12 +339,30 @@ async def on_POSITION(self, cmd: PositionCommand):
332339

333340
self._streams_connected.add(cmd.stream_name)
334341

335-
async def on_REMOTE_SERVER_UP(self, cmd: RemoteServerUpCommand):
342+
async def on_REMOTE_SERVER_UP(
343+
self, conn: AbstractConnection, cmd: RemoteServerUpCommand
344+
):
336345
""""Called when get a new REMOTE_SERVER_UP command."""
337346
self._replication_data_handler.on_remote_server_up(cmd.data)
338347

339-
if self._is_master:
340-
self._notifier.notify_remote_server_up(cmd.data)
348+
self._notifier.notify_remote_server_up(cmd.data)
349+
350+
# We relay to all other connections to ensure every instance gets the
351+
# notification.
352+
#
353+
# When configured to use redis we'll always only have one connection and
354+
# so this is a no-op (all instances will have already received the same
355+
# REMOTE_SERVER_UP command).
356+
#
357+
# For direct TCP connections this will relay to all other connections
358+
# connected to us. When on master this will correctly fan out to all
359+
# other direct TCP clients and on workers there'll only be the one
360+
# connection to master.
361+
#
362+
# (The logic here should also be sound if we have a mix of Redis and
363+
# direct TCP connections so long as there is only one traffic route
364+
# between two instances, but that is not currently supported).
365+
self.send_command(cmd, ignore_conn=conn)
341366

342367
def new_connection(self, connection: AbstractConnection):
343368
"""Called when we have a new connection.
@@ -382,11 +407,21 @@ def connected(self) -> bool:
382407
"""
383408
return bool(self._connections)
384409

385-
def send_command(self, cmd: Command):
410+
def send_command(
411+
self, cmd: Command, ignore_conn: Optional[AbstractConnection] = None
412+
):
386413
"""Send a command to all connected connections.
414+
415+
Args:
416+
cmd
417+
ignore_conn: If set don't send command to the given connection.
418+
Used when relaying commands from one connection to all others.
387419
"""
388420
if self._connections:
389421
for connection in self._connections:
422+
if connection == ignore_conn:
423+
continue
424+
390425
try:
391426
connection.send_command(cmd)
392427
except Exception:

synapse/replication/tcp/protocol.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ async def handle_command(self, cmd: Command):
260260
# Then call out to the handler.
261261
cmd_func = getattr(self.command_handler, "on_%s" % (cmd.NAME,), None)
262262
if cmd_func:
263-
await cmd_func(cmd)
263+
await cmd_func(self, cmd)
264264
handled = True
265265

266266
if not handled:

synapse/replication/tcp/redis.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ async def handle_command(self, cmd: Command):
112112
# Then call out to the handler.
113113
cmd_func = getattr(self.handler, "on_%s" % (cmd.NAME,), None)
114114
if cmd_func:
115-
await cmd_func(cmd)
115+
await cmd_func(self, cmd)
116116
handled = True
117117

118118
if not handled:
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright 2020 The Matrix.org Foundation C.I.C.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
from typing import Tuple
17+
18+
from twisted.internet.interfaces import IProtocol
19+
from twisted.test.proto_helpers import StringTransport
20+
21+
from synapse.replication.tcp.resource import ReplicationStreamProtocolFactory
22+
23+
from tests.unittest import HomeserverTestCase
24+
25+
26+
class RemoteServerUpTestCase(HomeserverTestCase):
27+
def prepare(self, reactor, clock, hs):
28+
self.factory = ReplicationStreamProtocolFactory(hs)
29+
30+
def _make_client(self) -> Tuple[IProtocol, StringTransport]:
31+
"""Create a new direct TCP replication connection
32+
"""
33+
34+
proto = self.factory.buildProtocol(("127.0.0.1", 0))
35+
transport = StringTransport()
36+
proto.makeConnection(transport)
37+
38+
# We can safely ignore the commands received during connection.
39+
self.pump()
40+
transport.clear()
41+
42+
return proto, transport
43+
44+
def test_relay(self):
45+
"""Test that Synapse will relay REMOTE_SERVER_UP commands to all
46+
other connections, but not the one that sent it.
47+
"""
48+
49+
proto1, transport1 = self._make_client()
50+
51+
# We shouldn't receive an echo.
52+
proto1.dataReceived(b"REMOTE_SERVER_UP example.com\n")
53+
self.pump()
54+
self.assertEqual(transport1.value(), b"")
55+
56+
# But we should see an echo if we connect another client
57+
proto2, transport2 = self._make_client()
58+
proto1.dataReceived(b"REMOTE_SERVER_UP example.com\n")
59+
60+
self.pump()
61+
self.assertEqual(transport1.value(), b"")
62+
self.assertEqual(transport2.value(), b"REMOTE_SERVER_UP example.com\n")

0 commit comments

Comments
 (0)