This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Time how long it takes us to do backfill processing #13535

Merged: 4 commits, Aug 17, 2022
Changes from all commits
1 change: 1 addition & 0 deletions changelog.d/13535.misc
@@ -0,0 +1 @@
Add metrics to time how long it takes us to do backfill processing (`synapse_federation_backfill_processing_before_time_seconds`, `synapse_federation_backfill_processing_after_time_seconds`).
56 changes: 54 additions & 2 deletions synapse/handlers/federation.py
@@ -32,6 +32,7 @@
)

import attr
from prometheus_client import Histogram
from signedjson.key import decode_verify_key_bytes
from signedjson.sign import verify_signed_json
from unpaddedbase64 import decode_base64
@@ -79,6 +80,24 @@

logger = logging.getLogger(__name__)

# Added to debug performance and track progress on optimizations
backfill_processing_before_timer = Histogram(
"synapse_federation_backfill_processing_before_time_seconds",
"sec",
[],
buckets=(
1.0,
5.0,
10.0,
20.0,
30.0,
40.0,
60.0,
80.0,
"+Inf",
),
)

MadLittleMods (Contributor, Author):

We will probably want to adjust the fidelity of the buckets once we have some real data.

I've seen the linearizer lock take 45s and calculating the likely_domains take 20s, for example.
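For context on how these buckets behave: prometheus_client histograms are cumulative, so a single observation increments every bucket whose le upper bound is at or above the observed value. A minimal standalone sketch (the metric is renamed here so it does not clash with the real one):

from prometheus_client import Histogram

# Same bounds as the "before" timer above; float("inf") is equivalent to "+Inf".
sketch_before_timer = Histogram(
    "sketch_backfill_processing_before_time_seconds",
    "sec",
    buckets=(1.0, 5.0, 10.0, 20.0, 30.0, 40.0, 60.0, 80.0, float("inf")),
)

sketch_before_timer.observe(45.0)  # e.g. the 45s linearizer wait mentioned above
sketch_before_timer.observe(20.0)  # e.g. the 20s likely_domains calculation

# Cumulative counts: le="20.0" through le="40.0" read 1.0 (only the 20s
# observation fits), while le="60.0", le="80.0", and le="+Inf" read 2.0.
for metric in sketch_before_timer.collect():
    for sample in metric.samples:
        if sample.name.endswith("_bucket"):
            print(sample.labels["le"], sample.value)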


def get_domains_from_state(state: StateMap[EventBase]) -> List[Tuple[str, int]]:
"""Get joined domains from state
@@ -138,6 +157,7 @@ class FederationHandler:
def __init__(self, hs: "HomeServer"):
self.hs = hs

self.clock = hs.get_clock()
self.store = hs.get_datastores().main
self._storage_controllers = hs.get_storage_controllers()
self._state_storage_controller = self._storage_controllers.state
@@ -197,12 +217,39 @@ async def maybe_backfill(
return. This is used as part of the heuristic to decide if we
should back paginate.
"""
# Starting the processing time here so we can include the room backfill
# linearizer lock queue in the timing
processing_start_time = self.clock.time_msec()
MadLittleMods (Contributor, Author), Aug 16, 2022:

We start the processing time here so we can include the room backfill linearizer lock in the timing.

The linearizer is only used to avoid repeating work at the same time, but it is very simplistic. We can optimize how it works in the future, so it is good to have the lock wait as part of the "before" timing.
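A runnable toy version of that pattern (stand-in names, not Synapse's actual classes): the start timestamp is captured before queueing on the lock, so time spent waiting in the queue is counted.

import asyncio
import time

class SketchClock:
    """Stand-in for Synapse's Clock; assumes time_msec() returns epoch milliseconds."""

    def time_msec(self) -> int:
        return int(time.time() * 1000)

async def main() -> None:
    clock = SketchClock()
    lock = asyncio.Lock()  # stand-in for the per-room backfill Linearizer

    start_ms = clock.time_msec()  # taken *before* waiting on the lock
    async with lock:
        await asyncio.sleep(0.05)  # the backfill work itself
    # Any time spent queued behind the lock is included in this duration.
    print("before-processing seconds:", (clock.time_msec() - start_ms) / 1000)

asyncio.run(main())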


        async with self._room_backfill.queue(room_id):
-            return await self._maybe_backfill_inner(room_id, current_depth, limit)
+            return await self._maybe_backfill_inner(
+                room_id,
+                current_depth,
+                limit,
+                processing_start_time=processing_start_time,
+            )

    async def _maybe_backfill_inner(
-        self, room_id: str, current_depth: int, limit: int
+        self,
+        room_id: str,
+        current_depth: int,
+        limit: int,
+        *,
+        processing_start_time: int,
    ) -> bool:
"""
Checks whether the `current_depth` is at or approaching any backfill
points in the room and if so, will backfill. We only care about
checking backfill points that happened before the `current_depth`
(meaning less than or equal to the `current_depth`).

Args:
room_id: The room to backfill in.
current_depth: The depth to check at for any upcoming backfill points.
limit: The max number of events to request from the remote federated server.
processing_start_time: The time when `maybe_backfill` started
processing. Only used for timing.
"""
backwards_extremities = [
_BackfillPoint(event_id, depth, _BackfillPointType.BACKWARDS_EXTREMITY)
for event_id, depth in await self.store.get_oldest_event_ids_with_depth_in_room(
@@ -425,6 +472,11 @@ async def try_backfill(domains: List[str]) -> bool:

return False

processing_end_time = self.clock.time_msec()
backfill_processing_before_timer.observe(
(processing_start_time - processing_end_time) / 1000
)
Comment on lines +475 to +478
MadLittleMods (Contributor, Author), Aug 18, 2022:

Same problem as #13533 (comment).

I don't think this is working how we expect. All of the buckets have the same values, which means it's saying every record takes less than 1.0s. That's possible, but did we get the ms to s conversion wrong?

https://prometheus.matrix.org/graph?g0.expr=synapse_federation_backfill_processing_before_time_seconds_bucket&g0.tab=1&g0.stacked=0&g0.show_exemplars=0&g0.range_input=1h&g0.end_input=2022-08-18%2020%3A36%3A02&g0.moment_input=2022-08-18%2020%3A36%3A02

synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="1.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="5.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="10.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="20.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="30.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="40.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="60.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="80.0", service="synapse"}	7759
synapse_federation_backfill_processing_before_time_seconds_bucket{environment="live", identifier="matrix.org", index="2", instance="matrix.org", job="synapse_client_reader", le="+Inf", service="synapse"}	7759

So the percentile graph is just an average of all of the buckets below it, and we get straight lines: https://grafana.matrix.org/d/dYoRgTgVz/messages-timing?orgId=1&from=1660812148100&to=1660855348100&viewPanel=212


The "after" timing metric looks normal:

https://grafana.matrix.org/d/dYoRgTgVz/messages-timing?orgId=1&from=1660812186785&to=1660855386785&viewPanel=213

MadLittleMods (Contributor, Author), Aug 18, 2022:

> Did we get the ms to s conversion wrong?

I don't think so. It gives values like 2.196 when I log it locally.

And we do the same thing for existing metrics, which look fine, e.g.:

).observe((now - ts) / 1000)
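A quick standalone check of that conversion, with illustrative values:

# With millisecond timestamps, dividing the difference by 1000 yields seconds.
ts = 1_660_000_000_000      # e.g. a Clock.time_msec() reading
now = ts + 2_196            # 2.196 seconds later
print((now - ts) / 1000)    # 2.196, so the unit conversion itself is fine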

MadLittleMods (Contributor, Author), Aug 22, 2022:

The values in Prometheus are negative: https://prometheus.matrix.org/graph?g0.expr=synapse_federation_backfill_processing_before_time_seconds_sum&g0.tab=1&g0.stacked=0&g0.show_exemplars=0&g0.range_input=1h&g0.end_input=2022-08-22%2017%3A57%3A40&g0.moment_input=2022-08-22%2017%3A57%3A40

Spotted 🕵️‍♀️ I accidentally did start - end 🤦‍♂️

Corrected:

Suggested change:
-        processing_end_time = self.clock.time_msec()
-        backfill_processing_before_timer.observe(
-            (processing_start_time - processing_end_time) / 1000
-        )
+        processing_end_time = self.clock.time_msec()
+        backfill_processing_before_timer.observe(
+            (processing_end_time - processing_start_time) / 1000
+        )
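The negative values also explain the identical bucket counts above: a negative observation is less than or equal to every upper bound, so all of the cumulative buckets increment in lockstep. A standalone sketch of the buggy arithmetic (illustrative timestamps):

from prometheus_client import Histogram

sketch_negative_timer = Histogram(
    "sketch_negative_duration_seconds",
    "sec",
    buckets=(1.0, 5.0, 10.0, float("inf")),
)

start_ms = 1_660_000_000_000
end_ms = start_ms + 2_196                                  # 2.196 seconds later
sketch_negative_timer.observe((start_ms - end_ms) / 1000)  # start - end => -2.196

# Every bucket prints 1.0, from le="1.0" up to le="+Inf": the same
# "all buckets equal" shape as the 7759s seen on matrix.org.
for metric in sketch_negative_timer.collect():
    for sample in metric.samples:
        if sample.name.endswith("_bucket"):
            print(sample.labels["le"], sample.value)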

MadLittleMods (Contributor, Author):

Fixed in #13584.


success = await try_backfill(likely_domains)
if success:
return True
49 changes: 35 additions & 14 deletions synapse/handlers/federation_event.py
@@ -29,7 +29,7 @@
Tuple,
)

-from prometheus_client import Counter
+from prometheus_client import Counter, Histogram

from synapse import event_auth
from synapse.api.constants import (
@@ -92,6 +92,26 @@
"Events received over federation that we marked as soft_failed",
)

# Added to debug performance and track progress on optimizations
backfill_processing_after_timer = Histogram(
"synapse_federation_backfill_processing_after_time_seconds",
"sec",
[],
buckets=(
1.0,
5.0,
10.0,
20.0,
30.0,
40.0,
60.0,
80.0,
120.0,
180.0,
"+Inf",
),
)

MadLittleMods (Contributor, Author):

The after processing can take a while, depending on how slow /state_ids is. For example, _process_pulled_events taking 83s.


class FederationEventHandler:
"""Handles events that originated from federation.
@@ -597,20 +617,21 @@ async def backfill(
if not events:
return

-        # if there are any events in the wrong room, the remote server is buggy and
-        # should not be trusted.
-        for ev in events:
-            if ev.room_id != room_id:
-                raise InvalidResponseError(
-                    f"Remote server {dest} returned event {ev.event_id} which is in "
-                    f"room {ev.room_id}, when we were backfilling in {room_id}"
-                )
+        with backfill_processing_after_timer.time():
+            # if there are any events in the wrong room, the remote server is buggy and
+            # should not be trusted.
+            for ev in events:
+                if ev.room_id != room_id:
+                    raise InvalidResponseError(
+                        f"Remote server {dest} returned event {ev.event_id} which is in "
+                        f"room {ev.room_id}, when we were backfilling in {room_id}"
+                    )

-        await self._process_pulled_events(
-            dest,
-            events,
-            backfilled=True,
-        )
+            await self._process_pulled_events(
+                dest,
+                events,
+                backfilled=True,
+            )
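For reference, a minimal standalone sketch of the Histogram.time() pattern used here: as a context manager it observes the block's elapsed wall-clock seconds on exit, so there are no manual timestamps and no ms-to-s conversion to get wrong:

import time

from prometheus_client import Histogram

sketch_after_timer = Histogram(
    "sketch_backfill_processing_after_time_seconds",
    "sec",
    buckets=(1.0, 5.0, float("inf")),
)

with sketch_after_timer.time():  # the elapsed time is observed when the block exits
    time.sleep(0.1)              # stands in for _process_pulled_events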

@trace
async def _get_missing_events_for_pdu(