-
Notifications
You must be signed in to change notification settings - Fork 74
Open
Description
Hi! In my projects I needed to implement a handler for “pseudo-arrays”.
Essentially, these are dictionaries whose keys are all numeric strings (e.g. "0", "1", "3").
This format appears in a couple of public endpoints, and I ran into it while reverse-engineering the API of one store.
I can prepare a full MR if you're interested in having this functionality enabled by default.
Attaching an example of my implementation :)
class PseudoArrayConverter(SchemaBuilder):
    """
    SchemaBuilder that rewrites "pseudo-arrays" as ``patternProperties``.

    A pseudo-array is a JSON object whose keys are all strings of ASCII
    digits, e.g. ``{"0": ..., "1": ..., "3": ...}``. The keys do not have to
    be consecutive. Wherever every value observed at a schema location is
    such an object, the generated ``properties`` schema is replaced by a
    single ``patternProperties`` entry describing all of the values.
    """

    # Regex emitted into the schema for pseudo-array keys.
    _NUMERIC_KEY_PATTERN = "^[0-9]+$"
    # Path segment meaning "every element" of an array that is described by
    # a single ``items`` schema.
    _ANY_ITEM = "*"

    def __init__(
        self,
        schema_uri: str = "https://json-schema.org/draft/2020-12/schema",
    ) -> None:
        """
        Create an empty converter.

        Args:
            schema_uri: Value forwarded to ``SchemaBuilder`` as its
                ``schema_uri`` argument.
        """
        # Forward the URI; otherwise the parameter is silently ignored.
        super().__init__(schema_uri=schema_uri)
        # Every object passed to add_object(), kept whole so that raw values
        # can be looked up by path when the schema is post-processed.
        self._all_objects: List[Any] = []

    # Raw data collection
    # ------------------------------------------------------------------ #
    def add_object(self, obj: Any) -> None:
        """Add ``obj`` to the schema and remember it for pseudo-array analysis."""
        # Let the base class go first: if it rejects the object, the object
        # must not take part in the later raw-value analysis either.
        super().add_object(obj)
        self._all_objects.append(obj)

    # Schema generation with pseudo-array conversion
    # ------------------------------------------------------------------ #
    def to_schema(self) -> Dict[str, Any]:
        """Return the generated schema with pseudo-arrays converted."""
        schema = super().to_schema()
        self._convert_pseudo_arrays(schema, "#")
        return schema

    # Pseudo-array conversion logic
    # ------------------------------------------------------------------ #
    def _convert_pseudo_arrays(self, node: Dict[str, Any], path: str) -> None:
        """
        Recursively convert pseudo-array object schemas, in place.

        ``node`` is converted only when it is an object schema with
        ``properties`` and every non-null raw value found at ``path`` (across
        all added objects) is a dict whose keys are all numeric strings.
        Otherwise the traversal continues into the node's sub-schemas.

        Args:
            node: Schema fragment to inspect; mutated in place.
            path: Location of ``node`` in the data, as built by ``_recurse``.
        """
        if node.get("type") == "object" and "properties" in node:
            values = self._get_all_values_at_path(path)
            if values and all(
                isinstance(value, dict)
                and all(self._is_numeric_key(key) for key in value)
                for value in values
            ):
                all_keys = set()
                for value in values:
                    all_keys.update(value)
                if all_keys and self._should_convert_to_pseudo_array(all_keys):
                    # Flatten the entries of every pseudo-array, in numeric
                    # key order, so one item schema covers all of them.
                    nested_objects = [
                        value[key]
                        for value in values
                        for key in sorted(value, key=int)
                    ]
                    # NOTE(review): the sample output shows the nested schema
                    # carrying its own "$schema" key inside patternProperties
                    # - confirm whether it should be stripped here.
                    item_schema = self._create_schema_for_objects(nested_objects)
                    node.clear()
                    node.update(
                        {
                            "type": "object",
                            "propertyNames": {"pattern": self._NUMERIC_KEY_PATTERN},
                            "patternProperties": {
                                self._NUMERIC_KEY_PATTERN: item_schema
                            },
                            "additionalProperties": False,
                        }
                    )
                    return
        self._recurse(node, self._convert_pseudo_arrays, path)

    def _get_all_values_at_path(self, path: str) -> List[Any]:
        """Return every non-null raw value found at ``path`` in the added objects."""
        values: List[Any] = []
        for obj in self._all_objects:
            values.extend(
                value
                for value in self._extract_values_from_path(obj, path)
                if value is not None
            )
        return values

    def _extract_values_from_path(self, obj: Any, path: str) -> List[Any]:
        """
        Resolve ``path`` inside ``obj`` and return all matching values.

        Dict levels are resolved by key. List/tuple levels are resolved by
        index, or fan out over every element when the segment is the
        ``_ANY_ITEM`` wildcard. An empty list means nothing matched.
        """
        if path == "#":
            return [obj]
        # Skip the initial "#/".
        raw = path[2:] if path.startswith("#/") else path
        current: List[Any] = [obj]
        for segment in raw.split("/"):
            key = self._unescape_segment(segment)
            matched: List[Any] = []
            for value in current:
                if isinstance(value, dict):
                    if key in value:
                        matched.append(value[key])
                elif isinstance(value, (list, tuple)):
                    if segment == self._ANY_ITEM:
                        matched.extend(value)
                    elif self._is_numeric_key(segment):
                        idx = int(segment)
                        if idx < len(value):
                            matched.append(value[idx])
            if not matched:
                return []
            current = matched
        return current

    @staticmethod
    def _is_numeric_key(key: Any) -> bool:
        """Return True if ``key`` is a non-empty string of ASCII digits."""
        # str.isdigit() alone also accepts characters such as "²", which
        # int() rejects and which the emitted "^[0-9]+$" pattern never matches.
        return isinstance(key, str) and key.isascii() and key.isdigit()

    def _should_convert_to_pseudo_array(self, keys: set) -> bool:
        """Return True if an object with these keys should be converted."""
        return all(self._is_numeric_key(key) for key in keys)

    def _create_schema_for_objects(self, objects: List[Any]) -> Dict[str, Any]:
        """Build one schema covering all ``objects`` by merging their types."""
        if not objects:
            return {"type": "object"}
        # Use genson to merge types.
        from .to_schema_converter import JsonToSchemaConverter

        builder = JsonToSchemaConverter()
        for obj in objects:
            builder.add_object(obj)
        return builder.to_schema()

    # Helpers
    # ------------------------------------------------------------------ #
    @staticmethod
    def _escape_segment(key: Any) -> str:
        """Escape "~" and "/" in a property name (JSON Pointer style)."""
        return str(key).replace("~", "~0").replace("/", "~1")

    @staticmethod
    def _unescape_segment(segment: str) -> str:
        """Reverse ``_escape_segment``."""
        return segment.replace("~1", "/").replace("~0", "~")

    def _recurse(self, node: Dict[str, Any], func: callable, path: str) -> None:
        """
        Call ``func(sub_schema, sub_path)`` for each direct sub-schema of ``node``.

        Visits object ``properties``, array ``items`` (a single schema or a
        list of per-position schemas) and ``anyOf`` alternatives.
        """
        if node.get("type") == "object" and "properties" in node:
            for key, sub in node["properties"].items():
                func(sub, f"{path}/{self._escape_segment(key)}")
        if node.get("type") == "array":
            items = node.get("items", {})
            if isinstance(items, dict):
                # One schema describes every element, so look at all of them
                # rather than only index 0.
                func(items, f"{path}/{self._ANY_ITEM}")
            elif isinstance(items, list):
                for i, sub in enumerate(items):
                    func(sub, f"{path}/{i}")
        if "anyOf" in node:
            # Alternatives describe the same location, so the path is unchanged.
            for sub in node["anyOf"]:
                func(sub, path)


# Output:
"orders": {
"0": {"order_id": 1001, "amount": 299.99, "status": "completed", "t": "john.doe@example.com"},
"1": {"order_id": 1002, "amount": 149.50, "status": "pending", "t": [{"da": [1]}]},
"3": {"order_id": 1004, "amount": 89.00, "status": "completed", "t": {"da": [1]}},
"4": {"order_id": 1004, "amount": 89.00, "status": "completed", "t": {}},
}
"orders": {
"type": "object",
"propertyNames": {
"pattern": "^[0-9]+$"
},
"patternProperties": {
"^[0-9]+$": {
"$schema": "http://json-schema.org/schema#",
"type": "object",
"properties": {
"order_id": {
"type": "integer"
},
"amount": {
"type": "number"
},
"status": {
"type": "string"
},
"t": {
"anyOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "object",
"properties": {
"da": {
"type": "array",
"items": {
"type": "integer"
}
}
},
"required": [
"da"
]
}
},
{
"type": "object",
"properties": {
"da": {
"type": "array",
"items": {
"type": "integer"
}
}
}
}
]
}
},
"required": [
"amount",
"order_id",
"status",
"t"
]
}
},
"additionalProperties": false
},
Metadata
Metadata
Assignees
Labels
No labels