
Commit 34b3444

zephyr: Use exponential backoffs in retry loops.
This reduces the number of retries that might spam APIs. Some complexity here is deliberately left unmanaged -- for instance, maybe_restart_mirroring_script now makes a bounded number of restart attempts and then fails, but that failure is retried every 15s by the surrounding `process_loop`; previously, it would simply have looped forever inside maybe_restart_mirroring_script.

Three loops are intentionally left as infinite `while True` loops that merely cap their backoff at the default 90s. Their callers do not expect, and have no way to handle more gracefully, a failure of the expected-infinite loops in `process_loop` or `zulip_to_zephyr`, so those loops keep their previous behavior of retrying forever, albeit more slowly.
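For readers unfamiliar with the helper, the two retry shapes this commit uses look roughly like the sketch below. It assumes the `zulip` package's `RandomExponentialBackoff` class with the `keep_going()`, `succeed()`, and `fail()` methods used in the diff; the `action` callables are hypothetical stand-ins for the mirror's real work, not names from this repository.

# Minimal sketch of the two retry patterns in this commit, assuming the
# zulip package's RandomExponentialBackoff helper as used in the diff below.
# The `action` callables are hypothetical stand-ins for the real work.
from typing import Callable

from zulip import RandomExponentialBackoff


def restart_with_bounded_retries(action: Callable[[], None]) -> None:
    # Pattern from maybe_restart_mirroring_script: try a few times, then give
    # up and let the surrounding process_loop retry the whole thing ~15s later.
    backoff = RandomExponentialBackoff(maximum_retries=3)
    while backoff.keep_going():
        try:
            action()
            return
        except Exception:
            backoff.fail()  # sleep with an exponentially growing, jittered delay
    raise Exception("Failed too many times, aborting!")


def poll_forever(action: Callable[[], None]) -> None:
    # Pattern from process_loop / zulip_to_zephyr: never give up, but cap the
    # sleep between consecutive failures at the helper's default of 90s.
    backoff = RandomExponentialBackoff()
    while True:
        try:
            action()
            backoff.succeed()  # reset the delay once things work again
        except Exception:
            backoff.fail()

In the actual script, the bounded variant wraps `os.execvp`, while the unbounded variants wrap the zephyr receive loop and `zulip_client.call_on_each_message`.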
1 parent 396ef1d commit 34b3444

File tree

1 file changed (+17, -5 lines)


zulip/integrations/zephyr/zephyr_mirror_backend.py

Lines changed: 17 additions & 5 deletions
@@ -17,6 +17,8 @@
 import tempfile
 import select
 
+from zulip import RandomExponentialBackoff
+
 DEFAULT_SITE = "https://api.zulip.com"
 
 class States:
@@ -218,32 +220,41 @@ def maybe_restart_mirroring_script() -> None:
         except OSError:
             # We don't care whether we failed to cancel subs properly, but we should log it
             logger.exception("")
-        while True:
+        backoff = RandomExponentialBackoff(
+            maximum_retries=3,
+        )
+        while backoff.keep_going():
             try:
                 os.execvp(os.path.abspath(__file__), sys.argv)
+                # No need for backoff.succeed, since this can't be reached
             except Exception:
                 logger.exception("Error restarting mirroring script; trying again... Traceback:")
-                time.sleep(1)
+                backoff.fail()
+        raise Exception("Failed to reload too many times, aborting!")
 
 def process_loop(log: Optional[IO[Any]]) -> None:
     restart_check_count = 0
     last_check_time = time.time()
+    recieve_backoff = RandomExponentialBackoff()
     while True:
         select.select([zephyr._z.getFD()], [], [], 15)
         try:
+            process_backoff = RandomExponentialBackoff()
             # Fetch notices from the queue until its empty
             while True:
                 notice = zephyr.receive(block=False)
+                recieve_backoff.succeed()
                 if notice is None:
                     break
                 try:
                     process_notice(notice, log)
+                    process_backoff.succeed()
                 except Exception:
                     logger.exception("Error relaying zephyr:")
-                    time.sleep(2)
+                    process_backoff.fail()
         except Exception:
             logger.exception("Error checking for new zephyrs:")
-            time.sleep(1)
+            recieve_backoff.fail()
             continue
 
         if time.time() - last_check_time > 15:
@@ -759,12 +770,13 @@ def maybe_forward_to_zephyr(message: Dict[str, Any]) -> None:
 def zulip_to_zephyr(options: int) -> None:
     # Sync messages from zulip to zephyr
     logger.info("Starting syncing messages.")
+    backoff = RandomExponentialBackoff(timeout_success_equivalent=120)
     while True:
         try:
             zulip_client.call_on_each_message(maybe_forward_to_zephyr)
         except Exception:
             logger.exception("Error syncing messages:")
-            time.sleep(1)
+            backoff.fail()
 
 def subscribed_to_mail_messages() -> bool:
     # In case we have lost our AFS tokens and those won't be able to
