Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 56 additions & 8 deletions cachecontrol/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from requests.structures import CaseInsensitiveDict

from cachecontrol.heuristics import HEURISTICALLY_CACHEABLE_STATUSES
from cachecontrol.cache import DictCache, SeparateBodyBaseCache
from cachecontrol.serialize import Serializer

Expand All @@ -33,6 +34,17 @@

URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")

NEVER_CACHE_STATUSES = (
# Per https://www.rfc-editor.org/rfc/rfc9111.html#section-3 the status
# code must be final
100,
101,
# From httplib2: Don't cache 206's since we aren't going to
# handle byte range requests
206,
)


PERMANENT_REDIRECT_STATUSES = (301, 308)


Expand Down Expand Up @@ -60,7 +72,20 @@ def __init__(
self.cache = DictCache() if cache is None else cache
self.cache_etags = cache_etags
self.serializer = serializer or Serializer()
self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308)
# Per https://www.rfc-editor.org/rfc/rfc9111.html#section-3-2.7.1 all
# all final response codes are potentially cacheable, subject to the
# other conditions. CacheController conservatively only caches common
# ones codes. For example, even with a max-age set, a 500 Internal
# Server Error will not be cached
self.cacheable_status_codes = status_codes or (
200,
203,
300,
301,
302,
307,
308,
)

@classmethod
def _urlnorm(cls, uri: str) -> str:
Expand Down Expand Up @@ -128,12 +153,12 @@ def parse_cache_control(self, headers: Mapping[str, str]) -> dict[str, int | Non
except IndexError:
if required:
logger.debug(
"Missing value for cache-control " "directive: %s",
"Missing value for cache-control directive: %s",
directive,
)
except ValueError:
logger.debug(
"Invalid value for cache-control directive " "%s, must be %s",
"Invalid value for cache-control directive %s, must be %s",
directive,
typ.__name__,
)
Expand Down Expand Up @@ -343,12 +368,11 @@ def cache_response(
else:
response = response_or_ref

# From httplib2: Don't cache 206's since we aren't going to
# handle byte range requests
cacheable_status_codes = status_codes or self.cacheable_status_codes
if response.status not in cacheable_status_codes:
if response.status in NEVER_CACHE_STATUSES:
logger.debug(
"Status code %s not in %s", response.status, cacheable_status_codes
"Status code %s in never cache set %s",
response.status,
NEVER_CACHE_STATUSES,
)
return

Expand Down Expand Up @@ -378,6 +402,30 @@ def cache_response(
cc_req = self.parse_cache_control(request.headers)
cc = self.parse_cache_control(response_headers)

# "at least one of the following" from
# https://www.rfc-editor.org/rfc/rfc9111.html#section-3-2.7.1
has_explicit_freshness = (
"public" in cc or "expires" in response_headers or "max-age" in cc
# NOTE: s-maxage is also listed in the RFC section, but
# cache_response() doesn't currently express the concept of shared
# vs private caching
)
cacheable_status_codes = status_codes or self.cacheable_status_codes
if response.status not in cacheable_status_codes:
logger.debug(
"Status code %s not in %s", response.status, cacheable_status_codes
)
return
if (
response.status not in HEURISTICALLY_CACHEABLE_STATUSES
and not has_explicit_freshness
):
logger.debug(
"Status code %s is not heuristically cacheable and no explicit caching headers are set",
response.status,
)
return

assert request.url is not None
cache_url = self.cache_url(request.url)
logger.debug('Updating cache with response from "%s"', cache_url)
Expand Down
31 changes: 16 additions & 15 deletions cachecontrol/heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,21 @@

TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT"

# Concise list from https://www.rfc-editor.org/rfc/rfc9110#section-15.1
HEURISTICALLY_CACHEABLE_STATUSES = {
200,
203,
204,
206,
300,
301,
404,
405,
410,
414,
501,
}


def expire_after(delta: timedelta, date: datetime | None = None) -> datetime:
date = date or datetime.now(timezone.utc)
Expand Down Expand Up @@ -107,20 +122,6 @@ class LastModified(BaseHeuristic):
Unlike mozilla we limit this to 24-hr.
"""

cacheable_by_default_statuses = {
200,
203,
204,
206,
300,
301,
404,
405,
410,
414,
501,
}

def update_headers(self, resp: HTTPResponse) -> dict[str, str]:
headers: Mapping[str, str] = resp.headers

Expand All @@ -130,7 +131,7 @@ def update_headers(self, resp: HTTPResponse) -> dict[str, str]:
if "cache-control" in headers and headers["cache-control"] != "public":
return {}

if resp.status not in self.cacheable_by_default_statuses:
if resp.status not in HEURISTICALLY_CACHEABLE_STATUSES:
return {}

if "date" not in headers or "last-modified" not in headers:
Expand Down
34 changes: 33 additions & 1 deletion tests/test_cache_control.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,14 @@ def cc(self):
# Cache controller fixture
return CacheController(Mock(), serializer=Mock())

@pytest.mark.parametrize("status_code", [100, 101])
def test_no_cache_without_final(self, cc, status_code):
now = time.strftime(TIME_FMT, time.gmtime())
resp = self.resp({"cache-control": "max-age=3600", "date": now})
resp.status = status_code
cc.cache_response(Mock(), resp)
assert not cc.cache.set.called

def test_no_cache_non_20x_response(self, cc):
# No caching without some extra headers, so we add them
now = time.strftime(TIME_FMT, time.gmtime())
Expand All @@ -47,7 +55,7 @@ def test_no_cache_non_20x_response(self, cc):
no_cache_codes = [201, 400, 500]
for code in no_cache_codes:
resp.status = code
cc.cache_response(Mock(), resp)
cc.cache_response(self.req(), resp)
assert not cc.cache.set.called

# this should work b/c the resp is 20x
Expand All @@ -71,6 +79,17 @@ def test_no_cache_with_wrong_sized_body(self, cc):

assert not cc.cache.set.called

@pytest.mark.parametrize("status_code", [302, 307])
def test_no_cache_non_heuristic_status_without_explicit_freshness(
self, cc, status_code
):
now = time.strftime(TIME_FMT, time.gmtime())
# NOTE: Only 'date' header, NO "hard" cache-control/expires/public headers
resp = self.resp({"date": now})
resp.status = status_code
cc.cache_response(self.req(), resp)
assert not cc.cache.set.called

def test_cache_response_no_cache_control(self, cc):
resp = self.resp()
cc.cache_response(self.req(), resp)
Expand All @@ -85,6 +104,19 @@ def test_cache_response_cache_max_age(self, cc):
cc.serializer.dumps.assert_called_with(req, resp, None)
cc.cache.set.assert_called_with(self.url, ANY, expires=3600)

def test_cache_response_explicit_non_heuristic(self, cc):
# meets the requirements of
# https://www.rfc-editor.org/rfc/rfc9111.html#section-3-2.7.1
# *without* being heuristically cacheable
now = time.strftime(TIME_FMT, time.gmtime())
resp = self.resp({"cache-control": "max-age=3600", "date": now})
resp.status = 302

req = self.req()
cc.cache_response(req, resp)
cc.serializer.dumps.assert_called_with(req, resp, None)
cc.cache.set.assert_called_with(self.url, ANY, expires=3600)

def test_cache_response_cache_max_age_with_invalid_value_not_cached(self, cc):
now = time.strftime(TIME_FMT, time.gmtime())
# Not a valid header; this would be from a misconfigured server
Expand Down