Skip to content

Support for processing pseudo-arrays #89

@Miskler

Description

@Miskler

Hi! In my projects I needed to implement a handler for “pseudo-arrays”.
Essentially, these are dictionaries whose keys are all numeric strings (for example "0", "1", "3").
This format appears in a couple of public endpoints, and I ran into it while reverse-engineering the API of one store.

I can prepare a full MR if you're interested in having this functionality enabled by default.
Attaching an example of my implementation :)

class PseudoArrayConverter(SchemaBuilder):
    """
    SchemaBuilder that describes pseudo-arrays with ``patternProperties``.

    A pseudo-array is a dictionary whose keys are all strings of ASCII
    digits (``{"0": ..., "1": ..., "3": ...}``).  Instead of one
    ``properties`` entry per index, such an object is described by a single
    item schema stored under the ``^[0-9]+$`` pattern.

    Every object passed to :meth:`add_object` is retained, and the decision
    for a schema location is based on all samples seen at that location,
    including every element of an array.

    NOTE(review): ``SchemaBuilder`` is presumably genson's builder; the
    ``schema_uri`` keyword and the ``add_object``/``to_schema`` methods used
    here follow that API -- confirm.
    """

    # Path segment meaning "every element of the array at this position".
    # Escaped property names never contain "~" followed by anything other
    # than "0" or "1", so this marker cannot collide with a real key.
    _ANY_ITEM = "~*"

    def __init__(
        self,
        schema_uri: str = "https://json-schema.org/draft/2020-12/schema",
    ) -> None:
        """
        Create an empty converter.

        Args:
            schema_uri: Forwarded to the base builder as its ``schema_uri``
                argument.
        """
        super().__init__(schema_uri=schema_uri)
        # Every sample added so far, kept in insertion order.
        self._all_objects: List[Any] = []

    # Raw data collection
    # ------------------------------------------------------------------ #
    def add_object(self, obj: Any) -> None:
        """Add a sample object to the schema and keep it for key analysis."""
        # Let the base builder see the object first so that a sample it
        # rejects is not remembered here either.
        super().add_object(obj)
        self._all_objects.append(obj)

    # Schema generation with pseudo-array conversion
    # ------------------------------------------------------------------ #
    def to_schema(self) -> Dict[str, Any]:
        """Return the base builder's schema with pseudo-arrays converted."""
        schema = super().to_schema()
        self._convert_pseudo_arrays(schema, "#")
        return schema

    # Pseudo-array conversion logic
    # ------------------------------------------------------------------ #
    def _convert_pseudo_arrays(self, node: Dict[str, Any], path: str) -> None:
        """
        Convert ``node`` in place if it describes a pseudo-array.

        A node is converted when it is an object schema with ``properties``,
        at least one non-null sample exists at ``path``, every such sample is
        a dictionary with only ASCII-digit string keys, and at least one key
        was seen.  Otherwise the sub-schemas of ``node`` are visited.

        Args:
            node: Schema fragment to inspect; mutated in place.
            path: Location of ``node`` in the sample data, as built by
                :meth:`_recurse` (``"#"`` is the root).
        """
        if not isinstance(node, dict):
            # Boolean schemas carry nothing that could be rewritten.
            return

        if node.get("type") == "object" and "properties" in node:
            samples = self._get_all_values_at_path(path)
            if samples and all(self._is_pseudo_array(s) for s in samples):
                seen_keys: set = set()
                for sample in samples:
                    seen_keys.update(sample)
                if seen_keys and self._should_convert_to_pseudo_array(seen_keys):
                    self._rewrite_as_pattern_properties(node, samples)
                    return

        self._recurse(node, self._convert_pseudo_arrays, path)

    def _rewrite_as_pattern_properties(
        self, node: Dict[str, Any], samples: List[Dict[str, Any]]
    ) -> None:
        """Replace ``node`` with a ``patternProperties`` object schema."""
        members = [
            sample[key]
            for sample in samples
            for key in sorted(sample, key=int)
        ]
        item_schema = self._create_schema_for_objects(members)
        if isinstance(item_schema, dict):
            # The item schema is embedded as a sub-schema, so it must not
            # carry a "$schema" keyword of its own.
            item_schema = {
                keyword: value
                for keyword, value in item_schema.items()
                if keyword != "$schema"
            }

        converted: Dict[str, Any] = {}
        if "$schema" in node:
            # Only the root node has "$schema"; keep it when the root
            # itself is a pseudo-array.
            converted["$schema"] = node["$schema"]
        converted.update(
            {
                "type": "object",
                "propertyNames": {"pattern": "^[0-9]+$"},
                "patternProperties": {"^[0-9]+$": item_schema},
                "additionalProperties": False,
            }
        )
        node.clear()
        node.update(converted)

    def _get_all_values_at_path(self, path: str) -> List[Any]:
        """
        Return every non-null value found at ``path`` in the added objects.

        An array-item segment matches all elements of the array, so one
        object may contribute several values.
        """
        parts = self._split_path(path)
        found: List[Any] = []
        for obj in self._all_objects:
            found.extend(
                value for value in self._resolve(obj, parts) if value is not None
            )
        return found

    def _extract_value_from_path(self, obj: Any, path: str) -> Any:
        """
        Return the first value at ``path`` inside ``obj``.

        Returns ``None`` when the path does not exist in ``obj``.
        """
        matches = self._resolve(obj, self._split_path(path))
        return matches[0] if matches else None

    def _resolve(self, obj: Any, parts: List[str]) -> List[Any]:
        """Return all values reached from ``obj`` by following ``parts``."""
        frontier = [obj]
        for part in parts:
            reached: List[Any] = []
            for current in frontier:
                if part == self._ANY_ITEM:
                    if isinstance(current, (list, tuple)):
                        reached.extend(current)
                elif isinstance(current, dict):
                    key = self._unescape_segment(part)
                    if key in current:
                        reached.append(current[key])
                elif isinstance(current, (list, tuple)) and self._is_index_key(part):
                    index = int(part)
                    if index < len(current):
                        reached.append(current[index])
            if not reached:
                return []
            frontier = reached
        return frontier

    def _should_convert_to_pseudo_array(self, keys: set) -> bool:
        """Return True when every key in ``keys`` is an ASCII-digit string."""
        return all(self._is_index_key(key) for key in keys)

    def _is_pseudo_array(self, value: Any) -> bool:
        """Return True for a dict whose keys are all ASCII-digit strings."""
        return isinstance(value, dict) and all(
            self._is_index_key(key) for key in value
        )

    @staticmethod
    def _is_index_key(key: Any) -> bool:
        """
        Return True when ``key`` is a non-empty string of ASCII digits.

        ``str.isdigit`` alone also accepts characters such as "²" that
        ``int()`` rejects and that the ``^[0-9]+$`` pattern does not match.
        """
        return isinstance(key, str) and key.isascii() and key.isdigit()

    def _create_schema_for_objects(self, objects: List[Any]) -> Dict[str, Any]:
        """
        Build one schema covering all ``objects``.

        Returns ``{"type": "object"}`` when ``objects`` is empty; otherwise
        every object is added to a fresh ``JsonToSchemaConverter`` and its
        ``to_schema()`` result is returned.
        """
        if not objects:
            return {"type": "object"}

        # Imported inside the method rather than at module level.
        # NOTE(review): presumably to avoid a circular import with
        # to_schema_converter -- confirm.
        from .to_schema_converter import JsonToSchemaConverter

        builder = JsonToSchemaConverter()
        for obj in objects:
            builder.add_object(obj)

        return builder.to_schema()

    # Helpers
    # ------------------------------------------------------------------ #
    @staticmethod
    def _split_path(path: str) -> List[str]:
        """Split a path into segments; the root path ``"#"`` has none."""
        if path == "#":
            return []
        trimmed = path[2:] if path.startswith("#/") else path
        return trimmed.split("/")

    @staticmethod
    def _escape_segment(name: Any) -> str:
        """Escape ``~`` and ``/`` (as ``~0``/``~1``) so a key stays one segment."""
        return str(name).replace("~", "~0").replace("/", "~1")

    @staticmethod
    def _unescape_segment(segment: str) -> str:
        """Reverse :meth:`_escape_segment`."""
        return segment.replace("~1", "/").replace("~0", "~")

    def _recurse(self, node: Dict[str, Any], func: callable, path: str) -> None:
        """
        Call ``func(sub_schema, sub_path)`` for each direct sub-schema.

        Visits ``properties`` of object schemas, ``items`` of array schemas
        and the alternatives of ``anyOf``.  A single ``items`` schema is
        given the any-item segment because it describes every element; a
        list of ``items`` schemas is given one index segment per position.
        ``anyOf`` alternatives keep the path of ``node``.
        """
        if node.get("type") == "object" and "properties" in node:
            for name, sub in node["properties"].items():
                func(sub, f"{path}/{self._escape_segment(name)}")

        if node.get("type") == "array":
            items = node.get("items", {})
            if isinstance(items, dict):
                func(items, f"{path}/{self._ANY_ITEM}")
            elif isinstance(items, list):
                for i, sub in enumerate(items):
                    func(sub, f"{path}/{i}")

        if "anyOf" in node:
            for sub in node["anyOf"]:
                func(sub, path)

Example input, followed by the schema generated for it:

    "orders": {
        "0": {"order_id": 1001, "amount": 299.99, "status": "completed", "t": "john.doe@example.com"},
        "1": {"order_id": 1002, "amount": 149.50, "status": "pending", "t": [{"da": [1]}]},
        "3": {"order_id": 1004, "amount": 89.00,  "status": "completed", "t": {"da": [1]}},
        "4": {"order_id": 1004, "amount": 89.00,  "status": "completed", "t": {}},
    }
    "orders": {
      "type": "object",
      "propertyNames": {
        "pattern": "^[0-9]+$"
      },
      "patternProperties": {
        "^[0-9]+$": {
          "$schema": "http://json-schema.org/schema#",
          "type": "object",
          "properties": {
            "order_id": {
              "type": "integer"
            },
            "amount": {
              "type": "number"
            },
            "status": {
              "type": "string"
            },
            "t": {
              "anyOf": [
                {
                  "type": "string"
                },
                {
                  "type": "array",
                  "items": {
                    "type": "object",
                    "properties": {
                      "da": {
                        "type": "array",
                        "items": {
                          "type": "integer"
                        }
                      }
                    },
                    "required": [
                      "da"
                    ]
                  }
                },
                {
                  "type": "object",
                  "properties": {
                    "da": {
                      "type": "array",
                      "items": {
                        "type": "integer"
                      }
                    }
                  }
                }
              ]
            }
          },
          "required": [
            "amount",
            "order_id",
            "status",
            "t"
          ]
        }
      },
      "additionalProperties": false
    },

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions