Add handling of 503 Service Unavailable retries #1713

Merged: 8 commits merged on Dec 21, 2023
Changes from 7 commits
100 changes: 61 additions & 39 deletions jira/resilientsession.py
@@ -5,6 +5,7 @@
import logging
import random
import time
from http import HTTPStatus
from typing import Any

from requests import Response, Session
@@ -278,7 +279,7 @@ def __recoverable(

Exponentially delays if recoverable.

At this moment it supports: 429
At this moment it supports: 429, 503

Args:
response (Optional[Union[ConnectionError, Response]]): The response or exception.
@@ -290,11 +291,14 @@
Returns:
bool: True if the request should be retried.
"""
is_recoverable = False # Controls return value AND whether we delay or not, Not-recoverable by default
suggested_delay = (
-1
) # Controls return value AND whether we delay or not, Not-recoverable by default
Comment on lines +294 to +296

Reviewer: right now this comment doesn't quite make sense; renaming this var to recovery_delay would make it much more readable.

Contributor Author: what would you change the comment to? (it makes sense to me but I'm ok to add more meaning to it) I kind of agree with recovery_delay, but then it won't eventually be the recovery delay because of the max limit.

Reviewer: then you go with -1 right? anyhow, this was just a minor suggestion, don't worry about it.

msg = str(response)

if isinstance(response, ConnectionError):
is_recoverable = True
suggested_delay = 10 * 2**counter

LOG.warning(
f"Got ConnectionError [{response}] errno:{response.errno} on {request_method} "
+ f"{url}\n" # type: ignore[str-bytes-safe]
@@ -304,45 +308,27 @@
"Response headers for ConnectionError are only printed for log level DEBUG."
)

if isinstance(response, Response):
if response.status_code in [429]:
is_recoverable = True
number_of_tokens_issued_per_interval = response.headers.get(
"X-RateLimit-FillRate"
)
token_issuing_rate_interval_seconds = response.headers.get(
"X-RateLimit-Interval-Seconds"
)
maximum_number_of_tokens = response.headers.get("X-RateLimit-Limit")
retry_after = response.headers.get("retry-after")
msg = f"{response.status_code} {response.reason}"
warning_msg = "Request rate limited by Jira."

warning_msg += (
f" Request should be retried after {retry_after} seconds.\n"
if retry_after is not None
else "\n"
)
if (
number_of_tokens_issued_per_interval is not None
and token_issuing_rate_interval_seconds is not None
):
warning_msg += f"{number_of_tokens_issued_per_interval} tokens are issued every {token_issuing_rate_interval_seconds} seconds.\n"
if maximum_number_of_tokens is not None:
warning_msg += (
f"You can accumulate up to {maximum_number_of_tokens} tokens.\n"
)
warning_msg = (
warning_msg
+ "Consider adding an exemption for the user as explained in: "
+ "https://confluence.atlassian.com/adminjiraserver/improving-instance-stability-with-rate-limiting-983794911.html"
)
elif isinstance(response, Response):
recoverable_error_codes = [
HTTPStatus.TOO_MANY_REQUESTS,
HTTPStatus.SERVICE_UNAVAILABLE,
]

LOG.warning(warning_msg)
if response.status_code in recoverable_error_codes:
retry_after = response.headers.get("Retry-After")
if retry_after:
Comment on lines +317 to +319

Reviewer: this looks like regression: 429 should be still always retried.

Contributor Author: 429s are retried, just with a different delay as per previous behaviour -- see line 322.
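Editorial illustration of the point above (not part of the diff, which resumes after this sketch; the helper name is hypothetical): the retry decision introduced here reduces to roughly the following.

from __future__ import annotations

from http import HTTPStatus

def suggested_retry_delay(status_code: int, retry_after: str | None, counter: int) -> int:
    """Return a positive delay in seconds when a retry is warranted, -1 otherwise."""
    recoverable = (HTTPStatus.TOO_MANY_REQUESTS, HTTPStatus.SERVICE_UNAVAILABLE)
    if status_code in recoverable and retry_after:
        return int(retry_after)  # do as the server says, for both 429 and 503
    if status_code == HTTPStatus.TOO_MANY_REQUESTS:
        return 10 * 2**counter  # 429 without Retry-After still backs off exponentially
    return -1  # 503 without Retry-After, and every other status: not retried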

suggested_delay = int(retry_after) # Do as told
elif response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
suggested_delay = 10 * 2**counter # Exponential backoff

if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
msg = f"{response.status_code} {response.reason}"
self.__log_http_429_response(response)

is_recoverable = suggested_delay > 0
if is_recoverable:
# Exponential backoff with full jitter.
delay = min(self.max_retry_delay, 10 * 2**counter) * random.random()
# Apply jitter to prevent thundering herd
delay = min(self.max_retry_delay, suggested_delay) * random.random()
LOG.warning(
f"Got recoverable error from {request_method} {url}, will retry [{counter}/{self.max_retries}] in {delay}s. Err: {msg}" # type: ignore[str-bytes-safe]
)
@@ -355,3 +341,39 @@ def __recoverable(
time.sleep(delay)

return is_recoverable

def __log_http_429_response(self, response: Response):
retry_after = response.headers.get("Retry-After")
number_of_tokens_issued_per_interval = response.headers.get(
"X-RateLimit-FillRate"
)
token_issuing_rate_interval_seconds = response.headers.get(
"X-RateLimit-Interval-Seconds"
)
maximum_number_of_tokens = response.headers.get("X-RateLimit-Limit")

warning_msg = "Request rate limited by Jira."
warning_msg += (
f" Request should be retried after {retry_after} seconds.\n"
if retry_after is not None
else "\n"
)

if (
number_of_tokens_issued_per_interval is not None
and token_issuing_rate_interval_seconds is not None
):
warning_msg += f"{number_of_tokens_issued_per_interval} tokens are issued every {token_issuing_rate_interval_seconds} seconds.\n"

if maximum_number_of_tokens is not None:
warning_msg += (
f"You can accumulate up to {maximum_number_of_tokens} tokens.\n"
)

warning_msg = (
warning_msg
+ "Consider adding an exemption for the user as explained in: "
+ "https://confluence.atlassian.com/adminjiraserver/improving-instance-stability-with-rate-limiting-983794911.html"
)

LOG.warning(warning_msg)
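A short editorial sketch of the delay arithmetic that follows from this change (the values are illustrative; max_retry_delay is the existing session attribute): the suggested delay, whether taken from Retry-After or from exponential backoff, is first capped and then fully jittered.

import random

max_retry_delay = 60  # illustrative value for the session's max_retry_delay attribute
counter = 3           # current retry attempt
retry_after = "120"   # Retry-After header value, if the server sent one

suggested_delay = int(retry_after) if retry_after else 10 * 2**counter
# Full jitter: sleep a uniform random time in [0, min(max_retry_delay, suggested_delay)).
delay = min(max_retry_delay, suggested_delay) * random.random()

Capping before jittering means a large Retry-After can never push the wait past max_retry_delay, which is why, as noted in the earlier review exchange, the variable holds only a suggested delay rather than the final one.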
94 changes: 54 additions & 40 deletions tests/test_resilientsession.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import logging
from http import HTTPStatus
from unittest.mock import Mock, patch

import pytest
@@ -63,68 +64,81 @@ def tearDown(self):
del self.loggingHandler


# Retry test data tuples: (status_code, with_rate_limit_header, with_retry_after_header, retry_expected)
with_rate_limit = with_retry_after = True
without_rate_limit = without_retry_after = False
status_codes_retries_test_data = [
(429, 4, 3),
(401, 1, 0),
(403, 1, 0),
(404, 1, 0),
(502, 1, 0),
(503, 1, 0),
(504, 1, 0),
# Always retry 429 responses
(HTTPStatus.TOO_MANY_REQUESTS, with_rate_limit, with_retry_after, True),
(HTTPStatus.TOO_MANY_REQUESTS, with_rate_limit, without_retry_after, True),
(HTTPStatus.TOO_MANY_REQUESTS, without_rate_limit, with_retry_after, True),
(HTTPStatus.TOO_MANY_REQUESTS, without_rate_limit, without_retry_after, True),
# Retry 503 responses only when 'Retry-After' in headers
(HTTPStatus.SERVICE_UNAVAILABLE, with_rate_limit, with_retry_after, True),
(HTTPStatus.SERVICE_UNAVAILABLE, with_rate_limit, without_retry_after, False),
(HTTPStatus.SERVICE_UNAVAILABLE, without_rate_limit, with_retry_after, True),
(HTTPStatus.SERVICE_UNAVAILABLE, without_rate_limit, without_retry_after, False),
# Never retry other responses
(HTTPStatus.UNAUTHORIZED, with_rate_limit, with_retry_after, False),
(HTTPStatus.UNAUTHORIZED, without_rate_limit, without_retry_after, False),
(HTTPStatus.FORBIDDEN, with_rate_limit, with_retry_after, False),
(HTTPStatus.FORBIDDEN, without_rate_limit, without_retry_after, False),
(HTTPStatus.NOT_FOUND, with_rate_limit, with_retry_after, False),
(HTTPStatus.NOT_FOUND, without_rate_limit, without_retry_after, False),
(HTTPStatus.BAD_GATEWAY, with_rate_limit, with_retry_after, False),
(HTTPStatus.BAD_GATEWAY, without_rate_limit, without_retry_after, False),
(HTTPStatus.GATEWAY_TIMEOUT, with_rate_limit, with_retry_after, False),
(HTTPStatus.GATEWAY_TIMEOUT, without_rate_limit, without_retry_after, False),
]


@patch("requests.Session.request")
@patch(f"{jira.resilientsession.__name__}.time.sleep")
@pytest.mark.parametrize(
"status_code,expected_number_of_retries,expected_number_of_sleep_invocations",
"status_code,with_rate_limit_header,with_retry_after_header,retry_expected",
status_codes_retries_test_data,
)
def test_status_codes_retries(
mocked_sleep_method: Mock,
mocked_request_method: Mock,
status_code: int,
expected_number_of_retries: int,
expected_number_of_sleep_invocations: int,
with_rate_limit_header: bool,
with_retry_after_header: bool,
retry_expected: bool,
):
mocked_response: Response = Response()
mocked_response.status_code = status_code
mocked_response.headers["X-RateLimit-FillRate"] = "1"
mocked_response.headers["X-RateLimit-Interval-Seconds"] = "1"
mocked_response.headers["retry-after"] = "1"
mocked_response.headers["X-RateLimit-Limit"] = "1"
mocked_request_method.return_value = mocked_response
session: jira.resilientsession.ResilientSession = (
jira.resilientsession.ResilientSession()
)
with pytest.raises(JIRAError):
session.get("mocked_url")
assert mocked_request_method.call_count == expected_number_of_retries
assert mocked_sleep_method.call_count == expected_number_of_sleep_invocations

RETRY_AFTER_HEADER = {"Retry-After": "1"}
RATE_LIMIT_HEADERS = {
"X-RateLimit-FillRate": "1",
"X-RateLimit-Interval-Seconds": "1",
"X-RateLimit-Limit": "1",
}

max_retries = 2

if retry_expected:
expected_number_of_requests = 1 + max_retries
expected_number_of_sleep_invocations = max_retries
else:
expected_number_of_requests = 1
expected_number_of_sleep_invocations = 0

@patch("requests.Session.request")
@patch(f"{jira.resilientsession.__name__}.time.sleep")
@pytest.mark.parametrize(
"status_code,expected_number_of_retries,expected_number_of_sleep_invocations",
status_codes_retries_test_data,
)
def test_status_codes_retries_no_headers(
mocked_sleep_method: Mock,
mocked_request_method: Mock,
status_code: int,
expected_number_of_retries: int,
expected_number_of_sleep_invocations: int,
):
mocked_response: Response = Response()
mocked_response.status_code = status_code
if with_retry_after_header:
mocked_response.headers.update(RETRY_AFTER_HEADER)
if with_rate_limit_header:
mocked_response.headers.update(RATE_LIMIT_HEADERS)

mocked_request_method.return_value = mocked_response

session: jira.resilientsession.ResilientSession = (
jira.resilientsession.ResilientSession()
jira.resilientsession.ResilientSession(max_retries=max_retries)
)

with pytest.raises(JIRAError):
session.get("mocked_url")
assert mocked_request_method.call_count == expected_number_of_retries

assert mocked_request_method.call_count == expected_number_of_requests
assert mocked_sleep_method.call_count == expected_number_of_sleep_invocations
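As an editorial complement (not part of the PR), a recovery scenario can be mocked in the same style as the test above: the first response is a 503 carrying Retry-After, the second succeeds, so the call returns instead of raising JIRAError. The test name and the literal body are hypothetical, and the sketch assumes the success path hands the response back unchanged; it reuses the module's existing imports.

@patch("requests.Session.request")
@patch(f"{jira.resilientsession.__name__}.time.sleep")
def test_503_recovers_after_retry(mocked_sleep_method: Mock, mocked_request_method: Mock):
    failing = Response()
    failing.status_code = HTTPStatus.SERVICE_UNAVAILABLE
    failing.headers["Retry-After"] = "1"

    succeeding = Response()
    succeeding.status_code = HTTPStatus.OK
    succeeding._content = b"{}"  # give the mock a body so any content checks pass

    # First request fails with 503, the retried request succeeds.
    mocked_request_method.side_effect = [failing, succeeding]

    session = jira.resilientsession.ResilientSession(max_retries=2)
    response = session.get("mocked_url")

    assert response.status_code == HTTPStatus.OK
    assert mocked_request_method.call_count == 2
    assert mocked_sleep_method.call_count == 1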

