Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit e8f30a7

Browse files
David Robertson and MadLittleMods authored
Fix overflows in /messages backfill calculation (#13936)
* Reproduce bug
* Compute `least_function` first
* Substitute `least_function` with an f-string
* Bugfix: avoid overflow

Co-authored-by: Eric Eastwood <erice@element.io>
1 parent 1cc2ca8 commit e8f30a7

File tree

3 files changed

+103
-41
lines changed

3 files changed

+103
-41
lines changed

changelog.d/13936.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Exponentially backoff from backfilling the same event over and over.

synapse/storage/databases/main/event_federation.py

Lines changed: 53 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,30 @@
7373

7474
logger = logging.getLogger(__name__)
7575

76-
BACKFILL_EVENT_BACKOFF_UPPER_BOUND_SECONDS: int = int(
77-
datetime.timedelta(days=7).total_seconds()
78-
)
79-
BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_SECONDS: int = int(
80-
datetime.timedelta(hours=1).total_seconds()
76+
# Parameters controlling exponential backoff between backfill failures.
77+
# After the first failure to backfill, we wait 2 hours before trying again. If the
78+
# second attempt fails, we wait 4 hours before trying again. If the third attempt fails,
79+
# we wait 8 hours before trying again, ... and so on.
80+
#
81+
# Each successive backoff period is twice as long as the last. However we cap this
82+
# period at a maximum of 2^8 = 256 hours: a little over 10 days. (This is the smallest
83+
# power of 2 which yields a maximum backoff period of at least 7 days---which was the
84+
# original maximum backoff period.) Even when we hit this cap, we will continue to
85+
# make backfill attempts once every 10 days.
86+
BACKFILL_EVENT_EXPONENTIAL_BACKOFF_MAXIMUM_DOUBLING_STEPS = 8
87+
BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_MILLISECONDS = int(
88+
datetime.timedelta(hours=1).total_seconds() * 1000
8189
)
8290

91+
# We need a cap on the power of 2 or else the backoff period
92+
# 2^N * (milliseconds per hour)
93+
# will overflow when calculated within the database. We ensure overflow does not occur
94+
# by checking that the largest backoff period fits in a 32-bit signed integer.
95+
_LONGEST_BACKOFF_PERIOD_MILLISECONDS = (
96+
2**BACKFILL_EVENT_EXPONENTIAL_BACKOFF_MAXIMUM_DOUBLING_STEPS
97+
) * BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_MILLISECONDS
98+
assert 0 < _LONGEST_BACKOFF_PERIOD_MILLISECONDS <= ((2**31) - 1)
99+
83100

84101
# All the info we need while iterating the DAG while backfilling
85102
@attr.s(frozen=True, slots=True, auto_attribs=True)
@@ -767,7 +784,15 @@ def get_backfill_points_in_room_txn(
767784
# persisted in our database yet (meaning we don't know their depth
768785
# specifically). So we need to look for the approximate depth from
769786
# the events connected to the current backwards extremities.
770-
sql = """
787+
788+
if isinstance(self.database_engine, PostgresEngine):
789+
least_function = "LEAST"
790+
elif isinstance(self.database_engine, Sqlite3Engine):
791+
least_function = "MIN"
792+
else:
793+
raise RuntimeError("Unknown database engine")
794+
795+
sql = f"""
771796
SELECT backward_extrem.event_id, event.depth FROM events AS event
772797
/**
773798
* Get the edge connections from the event_edges table
@@ -825,7 +850,10 @@ def get_backfill_points_in_room_txn(
825850
*/
826851
AND (
827852
failed_backfill_attempt_info.event_id IS NULL
828-
OR ? /* current_time */ >= failed_backfill_attempt_info.last_attempt_ts + /*least*/%s((1 << failed_backfill_attempt_info.num_attempts) * ? /* step */, ? /* upper bound */)
853+
OR ? /* current_time */ >= failed_backfill_attempt_info.last_attempt_ts + (
854+
(1 << {least_function}(failed_backfill_attempt_info.num_attempts, ? /* max doubling steps */))
855+
* ? /* step */
856+
)
829857
)
830858
/**
831859
* Sort from highest (closest to the `current_depth`) to the lowest depth
@@ -837,22 +865,15 @@ def get_backfill_points_in_room_txn(
837865
LIMIT ?
838866
"""
839867

840-
if isinstance(self.database_engine, PostgresEngine):
841-
least_function = "least"
842-
elif isinstance(self.database_engine, Sqlite3Engine):
843-
least_function = "min"
844-
else:
845-
raise RuntimeError("Unknown database engine")
846-
847868
txn.execute(
848-
sql % (least_function,),
869+
sql,
849870
(
850871
room_id,
851872
False,
852873
current_depth,
853874
self._clock.time_msec(),
854-
1000 * BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_SECONDS,
855-
1000 * BACKFILL_EVENT_BACKOFF_UPPER_BOUND_SECONDS,
875+
BACKFILL_EVENT_EXPONENTIAL_BACKOFF_MAXIMUM_DOUBLING_STEPS,
876+
BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_MILLISECONDS,
856877
limit,
857878
),
858879
)
@@ -902,7 +923,14 @@ async def get_insertion_event_backward_extremities_in_room(
902923
def get_insertion_event_backward_extremities_in_room_txn(
903924
txn: LoggingTransaction, room_id: str
904925
) -> List[Tuple[str, int]]:
905-
sql = """
926+
if isinstance(self.database_engine, PostgresEngine):
927+
least_function = "LEAST"
928+
elif isinstance(self.database_engine, Sqlite3Engine):
929+
least_function = "MIN"
930+
else:
931+
raise RuntimeError("Unknown database engine")
932+
933+
sql = f"""
906934
SELECT
907935
insertion_event_extremity.event_id, event.depth
908936
/* We only want insertion events that are also marked as backwards extremities */
@@ -942,7 +970,10 @@ def get_insertion_event_backward_extremities_in_room_txn(
942970
*/
943971
AND (
944972
failed_backfill_attempt_info.event_id IS NULL
945-
OR ? /* current_time */ >= failed_backfill_attempt_info.last_attempt_ts + /*least*/%s((1 << failed_backfill_attempt_info.num_attempts) * ? /* step */, ? /* upper bound */)
973+
OR ? /* current_time */ >= failed_backfill_attempt_info.last_attempt_ts + (
974+
(1 << {least_function}(failed_backfill_attempt_info.num_attempts, ? /* max doubling steps */))
975+
* ? /* step */
976+
)
946977
)
947978
/**
948979
* Sort from highest (closest to the `current_depth`) to the lowest depth
@@ -954,21 +985,14 @@ def get_insertion_event_backward_extremities_in_room_txn(
954985
LIMIT ?
955986
"""
956987

957-
if isinstance(self.database_engine, PostgresEngine):
958-
least_function = "least"
959-
elif isinstance(self.database_engine, Sqlite3Engine):
960-
least_function = "min"
961-
else:
962-
raise RuntimeError("Unknown database engine")
963-
964988
txn.execute(
965-
sql % (least_function,),
989+
sql,
966990
(
967991
room_id,
968992
current_depth,
969993
self._clock.time_msec(),
970-
1000 * BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_SECONDS,
971-
1000 * BACKFILL_EVENT_BACKOFF_UPPER_BOUND_SECONDS,
994+
BACKFILL_EVENT_EXPONENTIAL_BACKOFF_MAXIMUM_DOUBLING_STEPS,
995+
BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_MILLISECONDS,
972996
limit,
973997
),
974998
)

tests/storage/test_event_federation.py

Lines changed: 49 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -766,9 +766,7 @@ def test_get_backfill_points_in_room(self):
766766
self.store.get_backfill_points_in_room(room_id, depth_map["B"], limit=100)
767767
)
768768
backfill_event_ids = [backfill_point[0] for backfill_point in backfill_points]
769-
self.assertListEqual(
770-
backfill_event_ids, ["b6", "b5", "b4", "2", "b3", "b2", "b1"]
771-
)
769+
self.assertEqual(backfill_event_ids, ["b6", "b5", "b4", "2", "b3", "b2", "b1"])
772770

773771
# Try at "A"
774772
backfill_points = self.get_success(
@@ -814,7 +812,7 @@ def test_get_backfill_points_in_room_excludes_events_we_have_attempted(
814812
)
815813
backfill_event_ids = [backfill_point[0] for backfill_point in backfill_points]
816814
# Only the backfill points that we didn't record earlier exist here.
817-
self.assertListEqual(backfill_event_ids, ["b6", "2", "b1"])
815+
self.assertEqual(backfill_event_ids, ["b6", "2", "b1"])
818816

819817
def test_get_backfill_points_in_room_attempted_event_retry_after_backoff_duration(
820818
self,
@@ -860,7 +858,7 @@ def test_get_backfill_points_in_room_attempted_event_retry_after_backoff_duratio
860858
self.store.get_backfill_points_in_room(room_id, depth_map["A"], limit=100)
861859
)
862860
backfill_event_ids = [backfill_point[0] for backfill_point in backfill_points]
863-
self.assertListEqual(backfill_event_ids, ["b3", "b2"])
861+
self.assertEqual(backfill_event_ids, ["b3", "b2"])
864862

865863
# Now advance time by 20 hours (above 2^4 because we made 4 attempts) and
866864
# see if we can now backfill it
@@ -871,7 +869,48 @@ def test_get_backfill_points_in_room_attempted_event_retry_after_backoff_duratio
871869
self.store.get_backfill_points_in_room(room_id, depth_map["A"], limit=100)
872870
)
873871
backfill_event_ids = [backfill_point[0] for backfill_point in backfill_points]
874-
self.assertListEqual(backfill_event_ids, ["b3", "b2", "b1"])
872+
self.assertEqual(backfill_event_ids, ["b3", "b2", "b1"])
873+
874+
def test_get_backfill_points_in_room_works_after_many_failed_pull_attempts_that_could_naively_overflow(
875+
self,
876+
) -> None:
877+
"""
878+
A test that reproduces #13929 (Postgres only).
879+
880+
Test to make sure we can still get backfill points after many failed pull
881+
attempts that cause us to backoff to the limit. Even if the backoff formula
882+
would tell us to wait for more milliseconds than can be expressed in a 32 bit
883+
signed int.
884+
"""
885+
setup_info = self._setup_room_for_backfill_tests()
886+
room_id = setup_info.room_id
887+
depth_map = setup_info.depth_map
888+
889+
# Pretend that we have tried and failed 10 times to backfill event b1.
890+
for _ in range(10):
891+
self.get_success(
892+
self.store.record_event_failed_pull_attempt(room_id, "b1", "fake cause")
893+
)
894+
895+
# If the backoff periods grow without limit:
896+
# After the first failed attempt, we would have backed off for 1 << 1 = 2 hours.
897+
# After the second failed attempt we would have backed off for 1 << 2 = 4 hours,
898+
# so after the 10th failed attempt we should backoff for 1 << 10 == 1024 hours.
899+
# Wait 1100 hours just so we have a nice round number.
900+
self.reactor.advance(datetime.timedelta(hours=1100).total_seconds())
901+
902+
# 1024 hours in milliseconds is 1024 * 3600000, which exceeds the largest 32 bit
903+
# signed integer. The bug we're reproducing is that this overflow causes an
904+
# error in postgres preventing us from fetching a set of backwards extremities
905+
# to retry fetching.
906+
backfill_points = self.get_success(
907+
self.store.get_backfill_points_in_room(room_id, depth_map["A"], limit=100)
908+
)
909+
910+
# We should aim to fetch all backfill points: b1's latest backoff period has
911+
# expired, and we haven't tried the rest.
912+
backfill_event_ids = [backfill_point[0] for backfill_point in backfill_points]
913+
self.assertEqual(backfill_event_ids, ["b3", "b2", "b1"])
875914

876915
def _setup_room_for_insertion_backfill_tests(self) -> _BackfillSetupInfo:
877916
"""
@@ -965,9 +1004,7 @@ def test_get_insertion_event_backward_extremities_in_room(self):
9651004
)
9661005
)
9671006
backfill_event_ids = [backfill_point[0] for backfill_point in backfill_points]
968-
self.assertListEqual(
969-
backfill_event_ids, ["insertion_eventB", "insertion_eventA"]
970-
)
1007+
self.assertEqual(backfill_event_ids, ["insertion_eventB", "insertion_eventA"])
9711008

9721009
# Try at "insertion_eventA"
9731010
backfill_points = self.get_success(
@@ -1011,7 +1048,7 @@ def test_get_insertion_event_backward_extremities_in_room_excludes_events_we_hav
10111048
)
10121049
backfill_event_ids = [backfill_point[0] for backfill_point in backfill_points]
10131050
# Only the backfill points that we didn't record earlier exist here.
1014-
self.assertListEqual(backfill_event_ids, ["insertion_eventB"])
1051+
self.assertEqual(backfill_event_ids, ["insertion_eventB"])
10151052

10161053
def test_get_insertion_event_backward_extremities_in_room_attempted_event_retry_after_backoff_duration(
10171054
self,
@@ -1069,7 +1106,7 @@ def test_get_insertion_event_backward_extremities_in_room_attempted_event_retry_
10691106
)
10701107
)
10711108
backfill_event_ids = [backfill_point[0] for backfill_point in backfill_points]
1072-
self.assertListEqual(backfill_event_ids, [])
1109+
self.assertEqual(backfill_event_ids, [])
10731110

10741111
# Now advance time by 20 hours (above 2^4 because we made 4 attempts) and
10751112
# see if we can now backfill it
@@ -1083,7 +1120,7 @@ def test_get_insertion_event_backward_extremities_in_room_attempted_event_retry_
10831120
)
10841121
)
10851122
backfill_event_ids = [backfill_point[0] for backfill_point in backfill_points]
1086-
self.assertListEqual(backfill_event_ids, ["insertion_eventA"])
1123+
self.assertEqual(backfill_event_ids, ["insertion_eventA"])
10871124

10881125

10891126
@attr.s

0 commit comments

Comments
 (0)