Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Faster room joins: Try other destinations when resyncing the state of a partial-state room #12812

Merged
Merged
3 changes: 3 additions & 0 deletions synapse/federation/federation_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,9 @@ async def get_room_state_ids(

Returns:
a tuple of (state event_ids, auth event_ids)

Raises:
InvalidResponseError: if fields in the response have the wrong type.
Comment on lines +409 to +410
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this probably needs chasing up the call stack (through _get_state_ids_after_missing_prev_event, _resolve_state_at_missing_prevs, etc) too.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, I'll add a note to _get_state_ids_after_missing_prev_event.
_resolve_state_at_missing_prevs replaces it with a FederationError, which is already noted, so I'll leave that one alone.

"""
result = await self.transport_layer.get_room_state_ids(
destination, room_id, event_id=event_id
Expand Down
50 changes: 33 additions & 17 deletions synapse/handlers/federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,8 +555,8 @@ async def do_invite_join(
run_as_background_process(
desc="sync_partial_state_room",
func=self._sync_partial_state_room,
destination=origin,
destinations=ret.servers_in_room,
initial_destination=origin,
other_destinations=ret.servers_in_room,
room_id=room_id,
)

Expand Down Expand Up @@ -1461,16 +1461,16 @@ async def get_room_complexity(

async def _sync_partial_state_room(
self,
destination: Optional[str],
destinations: Collection[str],
initial_destination: Optional[str],
other_destinations: Collection[str],
room_id: str,
) -> None:
"""Background process to resync the state of a partial-state room

Args:
destination: the initial homeserver to pull the state from
destinations: other homeservers to try to pull the state from, if
`destination` is unavailable
initial_destination: the initial homeserver to pull the state from
other_destinations: other homeservers to try to pull the state from, if
`initial_destination` is unavailable
room_id: room to be resynced
"""

Expand All @@ -1481,18 +1481,29 @@ async def _sync_partial_state_room(
# TODO(faster_joins): what happens if we leave the room during a resync? if we
# really leave, that might mean we have difficulty getting the room state over
# federation.
#
# TODO(faster_joins): we need some way of prioritising which homeservers in
# `other_destinations` to try first, otherwise we'll spend ages trying dead
# homeservers for large rooms.

if initial_destination is None and len(other_destinations) == 0:
raise ValueError(
f"Cannot resync state of {room_id}: no destinations provided"
)

# Make an infinite iterator of destinations to try. Once we find a working
# destination, we'll stick with it until it flakes.
if destination is not None:
# Move `destination` to the front of the list.
destinations = list(destinations)
if destination in destinations:
destinations.remove(destination)
destinations = [destination] + destinations
destination_iter = itertools.cycle(destinations)

# `destination` is now the current remote homeserver we're pulling from.
if initial_destination is not None:
# Move `initial_destination` to the front of the list.
destinations = list(other_destinations)
if initial_destination in destinations:
destinations.remove(initial_destination)
destinations = [initial_destination] + destinations
destination_iter = itertools.cycle(destinations)
else:
destination_iter = itertools.cycle(other_destinations)

# `destination` is the current remote homeserver we're pulling from.
destination = next(destination_iter)
logger.info("Syncing state for room %s via %s", room_id, destination)

Expand Down Expand Up @@ -1529,7 +1540,7 @@ async def _sync_partial_state_room(
allow_rejected=True,
)
for event in events:
for attempt in range(len(destinations)):
for attempt in itertools.count():
try:
await self._federation_event_handler.update_state_for_partial_state_event(
destination, event
Expand All @@ -1538,6 +1549,11 @@ async def _sync_partial_state_room(
except FederationError as e:
if attempt == len(destinations) - 1:
# We have tried every remote server for this event. Give up.
# TODO(faster_joins) giving up isn't the right thing to do
# if there's a temporary network outage. retrying
# indefinitely is also not the right thing to do if we can
# reach all homeservers and they all claim they don't have
# the state we want.
logger.error(
"Failed to get state for %s at %s from %s because %s, "
"giving up!",
Expand Down
4 changes: 2 additions & 2 deletions synapse/handlers/federation_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -814,8 +814,8 @@ async def _resolve_state_at_missing_prevs(
event: an event to check for missing prevs.

Returns:
if we already had all the prev events, `None`. Otherwise, returns a list of
the events in the state at `event`.
if we already had all the prev events, `None`. Otherwise, returns
the event ids of the state at `event`.

Raises:
FederationError if we fail to get the state from the remote server after any
Expand Down