Skip to content

Commit 46de172

Browse files
committed
Caching for canonicalised JSON
1 parent 181e7c7 commit 46de172

File tree

1 file changed

+57
-4
lines changed

1 file changed

+57
-4
lines changed

src/hypothesis_jsonschema/_canonicalise.py

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,14 @@
1212
most things by construction instead of by filtering. That's the difference
1313
between "I'd like it to be faster" and "doesn't finish at all".
1414
"""
15-
15+
import functools
1616
import itertools
1717
import json
1818
import math
1919
import re
2020
from copy import deepcopy
2121
from json.encoder import _make_iterencode, encode_basestring_ascii # type: ignore
22-
from typing import Any, Dict, List, NoReturn, Optional, Tuple, Union
22+
from typing import Any, Callable, Dict, List, NoReturn, Optional, Tuple, Type, Union
2323

2424
import jsonschema
2525
from hypothesis.errors import InvalidArgument
@@ -108,9 +108,62 @@ class HypothesisRefResolutionError(jsonschema.exceptions.RefResolutionError):
108108
pass
109109

110110

111-
def encode_canonical_json(value: JSONType) -> str:
111+
def _make_cache_key(
112+
value: JSONType,
113+
) -> Tuple[Type, Union[Tuple, None, bool, float, str]]:
114+
"""Make a hashable object from any JSON value.
115+
116+
The idea is to recursively convert all mutable values to immutable and adding values types as a discriminant.
117+
"""
118+
if isinstance(value, dict):
119+
return (dict, tuple((k, _make_cache_key(v)) for k, v in value.items()))
120+
if isinstance(value, list):
121+
return (list, tuple(map(_make_cache_key, value)))
122+
# Primitive types are hashable
123+
# `type` is needed to distinguish false-ish values - 0, "", False have the same hash (0)
124+
return (type(value), value)
125+
126+
127+
class HashedJSON:
    """A proxy that holds a JSON value so it can be used as a cache key.

    JSON containers (lists, dicts) are not hashable, so the value is wrapped
    together with an immutable key derived from it and the hash of that key.
    Loosely based on `functools._HashedSeq`.

    Attributes (see `__slots__`):
        value: the original JSON value, untouched.
        key: the hashable structure returned by `_make_cache_key(value)`.
        hashedvalue: `hash(key)`, computed once at construction time.
    """

    __slots__ = ("value", "key", "hashedvalue")

    def __init__(self, value: JSONType):
        self.value = value
        # Kept (rather than only its hash) so that equality can be decided
        # exactly when two different values happen to share a hash.
        self.key = _make_cache_key(value)
        # `hash` is called multiple times on cache miss, therefore it is evaluated only once
        self.hashedvalue = hash(self.key)

    def __hash__(self) -> int:
        return self.hashedvalue

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, HashedJSON):
            return NotImplemented
        # Equal hashes do not imply equal values (in CPython, for example,
        # `hash(-1) == hash(-2)`), so comparing hashes alone would let a cache
        # return the result stored for a different value.  The hash check is
        # only a cheap early exit; the key comparison is what decides.
        return self.hashedvalue == other.hashedvalue and self.key == other.key
147+
148+
149+
def cached_json(func: Callable[[HashedJSON], str]) -> Callable[[JSONType], str]:
    """Memoise a serialiser of `HashedJSON` and expose it for plain JSON values.

    `func` is wrapped in an LRU cache holding up to 1024 entries.  The
    returned callable takes an ordinary JSON value, wraps it in `HashedJSON`
    so it can act as a cache key, and delegates to the cached function.

    The same schemas are encoded multiple times during canonicalisation and
    caching gives visible performance impact.
    """
    memoised = functools.lru_cache(maxsize=1024)(func)

    def wrapped(value: JSONType) -> str:
        return memoised(HashedJSON(value))

    # Copy the name, docstring and other metadata over from the cached function.
    return functools.update_wrapper(wrapped, memoised)
161+
162+
163+
@cached_json
def encode_canonical_json(value: HashedJSON) -> str:
    """Serialise the wrapped JSON value to its canonical form, for uniqueness testing."""
    payload = value.value
    return json.dumps(payload, cls=CanonicalisingJsonEncoder, sort_keys=True)
114167

115168

116169
def sort_key(value: JSONType) -> Tuple[int, float, Union[float, str]]:

0 commit comments

Comments
 (0)