Skip to content

Commit

Permalink
Merge pull request #457 from python-jsonschema/cache-refs
Browse files Browse the repository at this point in the history
Cache remote refs when downloading, refactor cachedownloader
  • Loading branch information
sirosen committed Jul 8, 2024
2 parents 761a2b2 + 037c2b2 commit e31b55f
Show file tree
Hide file tree
Showing 11 changed files with 574 additions and 231 deletions.
43 changes: 43 additions & 0 deletions docs/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,46 @@ To resolve, quote the boolean:
steps:
- bash: echo "{{ parameters.myBoolean}}"
Caching
-------

What data gets cached?
~~~~~~~~~~~~~~~~~~~~~~

``check-jsonschema`` will cache all downloaded schemas by default.
The schemas are stored in the ``downloads/`` directory in your cache dir, and any
downloaded refs are stored in the ``refs/`` directory.

Where is the cache dir?
~~~~~~~~~~~~~~~~~~~~~~~

``check-jsonschema`` detects an appropriate cache directory based on your
platform and environment variables.

On Windows, the cache dir is ``%LOCALAPPDATA%/check_jsonschema/`` and falls back
to ``%APPDATA%/check_jsonschema/`` if ``LOCALAPPDATA`` is unset.

On macOS, the cache dir is ``~/Library/Caches/check_jsonschema/``.

On Linux, the cache dir is ``$XDG_CACHE_HOME/check_jsonschema/`` and falls back
to ``~/.cache/check_jsonschema/`` if ``XDG_CACHE_HOME`` is unset.

How does check-jsonschema decide what is a cache hit vs miss?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

``check-jsonschema`` checks for cache hits by comparing the local file's
modification time to the ``Last-Modified`` header returned by an HTTP GET
request. If the local last-modified time is older than the remote one, the
rest of the request will be streamed and written to replace the file.

How do I clear the cache?
~~~~~~~~~~~~~~~~~~~~~~~~~

There is no special command for clearing the cache. Simply find the cache
directory based on the information above and remove it or any of its contents.

Can I disable caching?
~~~~~~~~~~~~~~~~~~~~~~

Yes! Just use the ``--no-cache`` CLI option.
265 changes: 156 additions & 109 deletions src/check_jsonschema/cachedownloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,139 +11,186 @@

import requests

_LASTMOD_FMT = "%a, %d %b %Y %H:%M:%S %Z"


def _base_cache_dir() -> str | None:
sysname = platform.system()

# on windows, try to get the appdata env var
# this *could* result in cache_dir=None, which is fine, just skip caching in
# that case
if sysname == "Windows":
cache_dir = os.getenv("LOCALAPPDATA", os.getenv("APPDATA"))
# macOS -> app support dir
elif sysname == "Darwin":
cache_dir = os.path.expanduser("~/Library/Caches")
# default for unknown platforms, namely linux behavior
# use XDG env var and default to ~/.cache/
else:
cache_dir = os.getenv("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))

return cache_dir


def _resolve_cache_dir(dirname: str = "downloads") -> str | None:
    """Return the full cache path ``<base>/check_jsonschema/<dirname>``.

    A falsy base directory (no usable cache location on this system) is
    passed through unchanged so callers can skip caching.
    """
    base = _base_cache_dir()
    if not base:
        return base
    return os.path.join(base, "check_jsonschema", dirname)


def _lastmod_from_response(response: requests.Response) -> float:
    """Parse the Last-Modified header into a local-time epoch timestamp.

    Returns 0.0 -- i.e. "infinitely old" for cache comparisons -- when the
    header is absent, malformed, or outside the platform's time range.
    """
    try:
        header_value = response.headers["last-modified"]
        return time.mktime(time.strptime(header_value, _LASTMOD_FMT))
    except (OverflowError, ValueError, LookupError):
        # OverflowError: time outside of platform-specific bounds
        # ValueError: malformed/unparseable
        # LookupError: no such header
        return 0.0


def _get_request(
    file_url: str, *, response_ok: t.Callable[[requests.Response], bool]
) -> requests.Response:
    """GET ``file_url`` (streaming), retrying up to two additional times.

    Returns the first response that is HTTP-ok and accepted by
    ``response_ok``. Raises FailedDownloadError when every attempt either
    raised a network error or produced an unacceptable response.
    """
    total_attempts = 3  # one initial try plus two retries
    last_response: requests.Response | None = None
    for remaining in reversed(range(total_attempts)):
        try:
            last_response = requests.get(file_url, stream=True)
        except requests.RequestException as err:
            # only give up on network errors once no attempts remain
            if remaining == 0:
                raise FailedDownloadError("encountered error during download") from err
            continue
        if last_response.ok and response_ok(last_response):
            return last_response
    # every attempt produced a (rejected) response, so this cannot be None
    assert last_response is not None
    raise FailedDownloadError(
        f"got response with status={last_response.status_code}, retries exhausted"
    )


def _atomic_write(dest: str, content: bytes) -> None:
# download to a temp file and then move to the dest
# this makes the download safe if run in parallel (parallel runs
# won't create a new empty file for writing and cause failures)
fp = tempfile.NamedTemporaryFile(mode="wb", delete=False)
fp.write(content)
fp.close()
shutil.copy(fp.name, dest)
os.remove(fp.name)


def _cache_hit(cachefile: str, response: requests.Response) -> bool:
    """Report whether the local cached file is at least as new as the remote.

    A missing local file is always a miss; otherwise the file's mtime is
    compared against the response's Last-Modified timestamp.
    """
    if not os.path.exists(cachefile):
        return False
    return os.path.getmtime(cachefile) >= _lastmod_from_response(response)


class FailedDownloadError(Exception):
    """Raised when a download fails after all retry attempts are exhausted."""

    pass


class CacheDownloader:
_LASTMOD_FMT = "%a, %d %b %Y %H:%M:%S %Z"

# changed in v0.5.0
# original cache dir was "jsonschema_validate"
# this will let us do any other caching we might need in the future in the same
# cache dir (adjacent to "downloads")
_CACHEDIR_NAME = os.path.join("check_jsonschema", "downloads")
def __init__(self, cache_dir: str | None = None, disable_cache: bool = False):
if cache_dir is None:
self._cache_dir = _resolve_cache_dir()
else:
self._cache_dir = _resolve_cache_dir(cache_dir)
self._disable_cache = disable_cache

def __init__(
def _download(
self,
file_url: str,
filename: str | None = None,
cache_dir: str | None = None,
disable_cache: bool = False,
validation_callback: t.Callable[[bytes], t.Any] | None = None,
):
self._file_url = file_url
self._filename = filename or file_url.split("/")[-1]
self._cache_dir = cache_dir or self._compute_default_cache_dir()
self._disable_cache = disable_cache
self._validation_callback = validation_callback

    def _compute_default_cache_dir(self) -> str | None:
        """Pick a platform-appropriate cache directory; None disables caching."""
        sysname = platform.system()

        # on windows, try to get the appdata env var
        # this *could* result in cache_dir=None, which is fine, just skip caching in
        # that case
        if sysname == "Windows":
            cache_dir = os.getenv("LOCALAPPDATA", os.getenv("APPDATA"))
        # macOS -> user caches dir (~/Library/Caches)
        elif sysname == "Darwin":
            cache_dir = os.path.expanduser("~/Library/Caches")
        # default for unknown platforms, namely linux behavior
        # use XDG env var and default to ~/.cache/
        else:
            cache_dir = os.getenv("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))

        if cache_dir:
            # append the app-specific subpath, e.g. check_jsonschema/downloads
            cache_dir = os.path.join(cache_dir, self._CACHEDIR_NAME)

        return cache_dir

    def _get_request(
        self, *, response_ok: t.Callable[[requests.Response], bool]
    ) -> requests.Response:
        """GET self._file_url (streaming), retrying rejected responses.

        NOTE(review): a requests.RequestException raised by any attempt is
        converted to FailedDownloadError immediately -- network errors are
        not retried here, only unacceptable responses are.
        """
        try:
            r: requests.Response | None = None
            for _attempt in range(3):
                r = requests.get(self._file_url, stream=True)
                # accept the first response that is HTTP-ok and passes the check
                if r.ok and response_ok(r):
                    return r
            assert r is not None
            raise FailedDownloadError(
                f"got response with status={r.status_code}, retries exhausted"
            )
        except requests.RequestException as e:
            raise FailedDownloadError("encountered error during download") from e

    def _lastmod_from_response(self, response: requests.Response) -> float:
        """Parse the Last-Modified header to an epoch timestamp; 0.0 on failure."""
        try:
            return time.mktime(
                time.strptime(response.headers["last-modified"], self._LASTMOD_FMT)
            )
        # OverflowError: time outside of platform-specific bounds
        # ValueError: malformed/unparseable
        # LookupError: no such header
        except (OverflowError, ValueError, LookupError):
            return 0.0

    def _cache_hit(self, cachefile: str, response: requests.Response) -> bool:
        """Return True if the cached file is at least as new as the remote copy."""
        # no file? miss
        if not os.path.exists(cachefile):
            return False

        # compare mtime on any cached file against the remote last-modified time
        # it is considered a hit if the local file is at least as new as the remote file
        local_mtime = os.path.getmtime(cachefile)
        remote_mtime = self._lastmod_from_response(response)
        return local_mtime >= remote_mtime

    def _write(self, dest: str, response: requests.Response) -> None:
        """Save the full response body to ``dest`` via a temporary file."""
        # download to a temp file and then move to the dest
        # this makes the download safe if run in parallel (parallel runs
        # won't create a new empty file for writing and cause failures)
        # NOTE(review): shutil.copy is not atomic -- a concurrent reader can
        # observe a partially copied dest; os.replace on a same-directory
        # temp file would be -- confirm whether that matters for callers
        fp = tempfile.NamedTemporaryFile(mode="wb", delete=False)
        fp.write(response.content)
        fp.close()
        shutil.copy(fp.name, dest)
        os.remove(fp.name)

    def _validate(self, response: requests.Response) -> bool:
        """Check the response body with the validation callback, if any.

        Returns True when no callback is configured or the callback accepts
        the content; a ValueError raised by the callback is reported as False.
        """
        if not self._validation_callback:
            return True

        try:
            self._validation_callback(response.content)
            return True
        except ValueError:
            return False

def _download(self) -> str:
assert self._cache_dir
filename: str,
response_ok: t.Callable[[requests.Response], bool],
) -> str:
assert self._cache_dir is not None
os.makedirs(self._cache_dir, exist_ok=True)
dest = os.path.join(self._cache_dir, self._filename)
dest = os.path.join(self._cache_dir, filename)

def check_response_for_download(r: requests.Response) -> bool:
# if the response indicates a cache hit, treat it as valid
# this ensures that we short-circuit any further evaluation immediately on
# a hit
if self._cache_hit(dest, r):
if _cache_hit(dest, r):
return True
# we now know it's not a hit, so validate the content (forces download)
return self._validate(r)
return response_ok(r)

response = self._get_request(response_ok=check_response_for_download)
response = _get_request(file_url, response_ok=check_response_for_download)
# check to see if we have a file which matches the connection
# only download if we do not (cache miss, vs hit)
if not self._cache_hit(dest, response):
self._write(dest, response)
if not _cache_hit(dest, response):
_atomic_write(dest, response.content)

return dest

@contextlib.contextmanager
def open(self) -> t.Iterator[t.IO[bytes]]:
def open(
self,
file_url: str,
filename: str,
validate_response: t.Callable[[requests.Response], bool],
) -> t.Iterator[t.IO[bytes]]:
if (not self._cache_dir) or self._disable_cache:
yield io.BytesIO(self._get_request(response_ok=self._validate).content)
yield io.BytesIO(
_get_request(file_url, response_ok=validate_response).content
)
else:
with open(self._download(), "rb") as fp:
with open(
self._download(file_url, filename, response_ok=validate_response), "rb"
) as fp:
yield fp

    def bind(
        self,
        file_url: str,
        filename: str | None = None,
        validation_callback: t.Callable[[bytes], t.Any] | None = None,
    ) -> BoundCacheDownloader:
        """Create a BoundCacheDownloader tying this downloader to one URL.

        filename may be None, in which case BoundCacheDownloader derives it
        from the URL; validation_callback, if given, is used to check the
        downloaded bytes before they are accepted.
        """
        return BoundCacheDownloader(
            file_url, filename, self, validation_callback=validation_callback
        )


class BoundCacheDownloader:
    """A CacheDownloader pre-bound to a single URL and local filename.

    Bundles together the URL, the cache filename (defaulting to the URL's
    final path segment), and an optional content-validation callback, so
    callers can simply use open() without re-supplying them.
    """

    def __init__(
        self,
        file_url: str,
        filename: str | None,
        downloader: CacheDownloader,
        *,
        validation_callback: t.Callable[[bytes], t.Any] | None = None,
    ):
        self._downloader = downloader
        self._validation_callback = validation_callback
        self._file_url = file_url
        # fall back to the final path segment of the URL as the filename
        self._filename = filename if filename else file_url.split("/")[-1]

    @contextlib.contextmanager
    def open(self) -> t.Iterator[t.IO[bytes]]:
        """Yield a binary stream of the (possibly cached) document."""
        with self._downloader.open(
            self._file_url,
            self._filename,
            validate_response=self._validate_response,
        ) as fp:
            yield fp

    def _validate_response(self, response: requests.Response) -> bool:
        """Run the validation callback against the response body.

        Returns True when no callback is configured or the callback accepts
        the bytes; a ValueError from the callback means invalid content.
        """
        callback = self._validation_callback
        if not callback:
            return True
        try:
            callback(response.content)
        except ValueError:
            return False
        return True
4 changes: 2 additions & 2 deletions src/check_jsonschema/cli/main_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,8 @@ def build_schema_loader(args: ParseResult) -> SchemaLoaderBase:
assert args.schema_path is not None
return SchemaLoader(
args.schema_path,
args.cache_filename,
args.disable_cache,
cache_filename=args.cache_filename,
disable_cache=args.disable_cache,
base_uri=args.base_uri,
validator_class=args.validator_class,
)
Expand Down
10 changes: 6 additions & 4 deletions src/check_jsonschema/schema_loader/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,16 @@ def get_validator(

class SchemaLoader(SchemaLoaderBase):
validator_class: type[jsonschema.protocols.Validator] | None = None
disable_cache: bool = True

def __init__(
self,
schemafile: str,
*,
cache_filename: str | None = None,
disable_cache: bool = False,
base_uri: str | None = None,
validator_class: type[jsonschema.protocols.Validator] | None = None,
disable_cache: bool = True,
) -> None:
# record input parameters (these are not to be modified)
self.schemafile = schemafile
Expand Down Expand Up @@ -140,7 +142,7 @@ def get_validator(
# reference resolution
# with support for YAML, TOML, and other formats from the parsers
reference_registry = make_reference_registry(
self._parsers, retrieval_uri, schema
self._parsers, retrieval_uri, schema, self.disable_cache
)

if self.validator_class is None:
Expand Down Expand Up @@ -171,7 +173,7 @@ def get_validator(


class BuiltinSchemaLoader(SchemaLoader):
def __init__(self, schema_name: str, base_uri: str | None = None) -> None:
def __init__(self, schema_name: str, *, base_uri: str | None = None) -> None:
self.schema_name = schema_name
self.base_uri = base_uri
self._parsers = ParserSet()
Expand All @@ -187,7 +189,7 @@ def get_schema(self) -> dict[str, t.Any]:


class MetaSchemaLoader(SchemaLoaderBase):
def __init__(self, base_uri: str | None = None) -> None:
def __init__(self, *, base_uri: str | None = None) -> None:
if base_uri is not None:
raise NotImplementedError(
"'--base-uri' was used with '--metaschema'. "
Expand Down
Loading

0 comments on commit e31b55f

Please sign in to comment.