feat: Implement dunder repr for collection, schema, column and rule (#63)

MoritzPotthoffQC · web-flow · commit d82dcf070c54 · 2025-06-18T17:15:37.000+02:00
diff --git a/dataframely/_base_collection.py b/dataframely/_base_collection.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import textwrap
 import typing
 from abc import ABCMeta
 from collections.abc import Iterable
@@ -245,6 +246,32 @@ def _derive_member_info(
             # Some other unknown annotation
             raise AnnotationImplementationError(attr, type_annotation)
 
+    def __repr__(cls) -> str:
+        parts = [f'[Collection "{cls.__class__.__name__}"]']
+        parts.append(textwrap.indent("Members:", prefix=" " * 2))
+        for name, member in cls.members().items():  # type: ignore
+            parts.append(
+                textwrap.indent(
+                    f'- "{name}": {member.schema.__name__}'
+                    f"(optional={member.is_optional}, "
+                    f"ignored_in_filters={member.ignored_in_filters}, "
+                    f"inline_for_sampling={member.inline_for_sampling})",
+                    prefix=" " * 4,
+                )
+            )
+        if filters := cls._filters():  # type: ignore
+            parts.append(textwrap.indent("Filters:", prefix=" " * 2))
+            for name, member in filters.items():
+                parts.append(textwrap.indent(f'- "{name}":', prefix=" " * 4))
+                parts.append(
+                    textwrap.indent(
+                        f"{member.logic(cls.create_empty()).explain()}",  # type: ignore
+                        prefix=" " * 8,
+                    )
+                )
+        parts.append("")  # Add line break at the end
+        return "\n".join(parts)
+
 
 class BaseCollection(metaclass=CollectionMeta):
     """Internal utility abstraction to reference collections without introducing
diff --git a/dataframely/_base_schema.py b/dataframely/_base_schema.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import textwrap
 from abc import ABCMeta
 from copy import copy
 from dataclasses import dataclass, field
@@ -162,6 +163,18 @@ def _get_metadata(source: dict[str, Any]) -> Metadata:
                 result.rules[attr] = value
         return result
 
+    def __repr__(cls) -> str:
+        parts = [f'[Schema "{cls.__name__}"]']
+        parts.append(textwrap.indent("Columns:", prefix=" " * 2))
+        for name, col in cls.columns().items():  # type: ignore
+            parts.append(textwrap.indent(f'- "{name}": {col!r}', prefix=" " * 4))
+        if validation_rules := cls._schema_validation_rules():  # type: ignore
+            parts.append(textwrap.indent("Rules:", prefix=" " * 2))
+            for name, rule in validation_rules.items():
+                parts.append(textwrap.indent(f'- "{name}": {rule!r}', prefix=" " * 4))
+        parts.append("")  # Add line break at the end
+        return "\n".join(parts)
+
 
 class BaseSchema(metaclass=SchemaMeta):
     """Internal utility abstraction to reference schemas without introducing cyclical
diff --git a/dataframely/_rule.py b/dataframely/_rule.py
@@ -42,6 +42,9 @@ def from_dict(cls, data: dict[str, Any]) -> Self:
         """
         return cls(data["expr"])
 
+    def __repr__(self) -> str:
+        return str(self.expr)
+
 
 class GroupRule(Rule):
     """Rule that is evaluated on a group of columns."""
@@ -62,6 +65,9 @@ def as_dict(self) -> dict[str, Any]:
     def from_dict(cls, data: dict[str, Any]) -> Self:
         return cls(data["expr"], group_columns=data["group_columns"])
 
+    def __repr__(self) -> str:
+        return f"{super().__repr__()} grouped by {self.group_columns}"
+
 
 def rule(*, group_by: list[str] | None = None) -> Callable[[ValidationFunction], Rule]:
     """Mark a function as a rule to evaluate during validation.
diff --git a/dataframely/columns/_base.py b/dataframely/columns/_base.py
@@ -372,6 +372,21 @@ def _attributes_match(
 
     # -------------------------------- DUNDER METHODS -------------------------------- #
 
+    def __repr__(self) -> str:
+        parts = [
+            f"{attribute}={repr(getattr(self, attribute))}"
+            for attribute, param_details in inspect.signature(
+                self.__class__.__init__
+            ).parameters.items()
+            if attribute
+            not in ["self", "alias"]  # alias is always equal to the column name here
+            and not (
+                # Do not include attributes that are set to their default value
+                getattr(self, attribute) == param_details.default
+            )
+        ]
+        return f"{self.__class__.__name__}({', '.join(parts)})"
+
     def __str__(self) -> str:
         return self.__class__.__name__.lower()
 
diff --git a/tests/collection/test_repr.py b/tests/collection/test_repr.py
@@ -0,0 +1,38 @@
+# Copyright (c) QuantCo 2025-2025
+# SPDX-License-Identifier: BSD-3-Clause
+
+import textwrap
+
+import polars as pl
+
+import dataframely as dy
+
+
+class MySchema(dy.Schema):
+    a = dy.Integer(primary_key=True)
+
+
+class MyCollection(dy.Collection):
+    member_a: dy.LazyFrame[MySchema]
+    member_b: dy.LazyFrame[MySchema]
+
+    @dy.filter()
+    def member_a_member_b_one_to_one(self) -> pl.LazyFrame:
+        return self.member_a.join(self.member_b, on="a", how="inner")
+
+
+def test_repr_collection() -> None:
+    assert repr(MyCollection) == textwrap.dedent("""\
+        [Collection "CollectionMeta"]
+          Members:
+            - "member_a": MySchema(optional=False, ignored_in_filters=False, inline_for_sampling=False)
+            - "member_b": MySchema(optional=False, ignored_in_filters=False, inline_for_sampling=False)
+          Filters:
+            - "member_a_member_b_one_to_one":
+                INNER JOIN:
+                LEFT PLAN ON: [col("a")]
+                  DF ["a"]; PROJECT */1 COLUMNS
+                RIGHT PLAN ON: [col("a")]
+                  DF ["a"]; PROJECT */1 COLUMNS
+                END INNER JOIN
+        """)
diff --git a/tests/schema/test_repr.py b/tests/schema/test_repr.py
@@ -0,0 +1,54 @@
+# Copyright (c) QuantCo 2025-2025
+# SPDX-License-Identifier: BSD-3-Clause
+import textwrap
+
+import polars as pl
+
+import dataframely as dy
+
+
+def test_repr_no_rules() -> None:  # no rules defined: expected text has no "Rules:" part
+    class SchemaNoRules(dy.Schema):
+        a = dy.Integer()

+    assert repr(SchemaNoRules) == textwrap.dedent("""\
+        [Schema "SchemaNoRules"]
+          Columns:
+            - "a": Integer(nullable=True)
+        """)
+
+
+def test_repr_only_column_rules() -> None:  # min=10 shows in the column repr only
+    class SchemaColumnRules(dy.Schema):
+        a = dy.Integer(min=10)

+    assert repr(SchemaColumnRules) == textwrap.dedent("""\
+        [Schema "SchemaColumnRules"]
+          Columns:
+            - "a": Integer(nullable=True, min=10)
+        """)
+
+
+class SchemaWithRules(dy.Schema):  # two columns, one plain rule, one grouped rule
+    a = dy.Integer(min=10)
+    b = dy.String(primary_key=True, regex=r"^[A-Z]{3}$", alias="b2")

+    @dy.rule()
+    def my_rule() -> pl.Expr:
+        return pl.col("a") < 100

+    @dy.rule(group_by=["a"])
+    def my_group_rule() -> pl.Expr:
+        return pl.col("a").sum() > 50
+
+
+def test_repr_with_rules() -> None:  # column "b" is listed under its alias "b2"
+    assert repr(SchemaWithRules) == textwrap.dedent("""\
+        [Schema "SchemaWithRules"]
+          Columns:
+            - "a": Integer(nullable=True, min=10)
+            - "b2": String(nullable=False, primary_key=True, regex='^[A-Z]{3}$')
+          Rules:
+            - "my_rule": [(col("a")) < (dyn int: 100)]
+            - "my_group_rule": [(col("a").sum()) > (dyn int: 50)] grouped by ['a']
+        """)