DoubleML · SvenKlaassen · Apr 25, 2025 · Feb 13, 2025 · Feb 13, 2025 · Feb 13, 2025
diff --git a/doubleml/__init__.py b/doubleml/__init__.py
@@ -1,8 +1,8 @@
 import importlib.metadata
 
+from .data import DoubleMLClusterData, DoubleMLData
 from .did.did import DoubleMLDID
 from .did.did_cs import DoubleMLDIDCS
-from .double_ml_data import DoubleMLClusterData, DoubleMLData
 from .double_ml_framework import DoubleMLFramework, concat
 from .irm.apo import DoubleMLAPO
 from .irm.apos import DoubleMLAPOS

diff --git a/doubleml/data/__init__.py b/doubleml/data/__init__.py
@@ -0,0 +1,13 @@
+"""
+The :mod:`doubleml.data` module implements data classes for double machine learning.
+"""
+
+from .base_data import DoubleMLData
+from .cluster_data import DoubleMLClusterData
+from .panel_data import DoubleMLPanelData
+
+__all__ = [
+    "DoubleMLData",
+    "DoubleMLClusterData",
+    "DoubleMLPanelData",
+]
diff --git a/doubleml/double_ml_data.py → doubleml/data/base_data.py b/doubleml/double_ml_data.py → doubleml/data/base_data.py
diff --git a/doubleml/data/cluster_data.py b/doubleml/data/cluster_data.py
@@ -0,0 +1,289 @@
+import io
+
+import numpy as np
+import pandas as pd
+from sklearn.utils import assert_all_finite
+from sklearn.utils.validation import check_array
+
+from doubleml.data.base_data import DoubleMLBaseData, DoubleMLData
+from doubleml.utils._estimation import _assure_2d_array
+
+
+class DoubleMLClusterData(DoubleMLData):
+    """Double machine learning data-backend for data with cluster variables.
+
+    :class:`DoubleMLClusterData` objects can be initialized from
+    :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s.
+
+    Parameters
+    ----------
+    data : :class:`pandas.DataFrame`
+        The data.
+
+    y_col : str
+        The outcome variable.
+
+    d_cols : str or list
+        The treatment variable(s).
+
+    cluster_cols : str or list
+        The cluster variable(s).
+
+    x_cols : None, str or list
+        The covariates.
+        If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor
+        treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates.
+        Default is ``None``.
+
+    z_cols : None, str or list
+        The instrumental variable(s).
+        Default is ``None``.
+
+    t_col : None or str
+        The time variable (only relevant/used for DiD Estimators).
+        Default is ``None``.
+
+    s_col : None or str
+        The score or selection variable (only relevant/used for RDD and SSM Estimatiors).
+        Default is ``None``.
+
+    use_other_treat_as_covariate : bool
+        Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
+        Default is ``True``.
+
+    force_all_x_finite : bool or str
+        Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
+        Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
+        allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
+        Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
+        for the nuisance functions are capable to provide valid predictions with missings and / or infinite values
+        in the covariates ``x``.
+        Default is ``True``.
+
+    Examples
+    --------
+    >>> from doubleml import DoubleMLClusterData
+    >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
+    >>> # initialization from pandas.DataFrame
+    >>> df = make_pliv_multiway_cluster_CKMS2021(return_type='DataFrame')
+    >>> obj_dml_data_from_df = DoubleMLClusterData(df, 'Y', 'D', ['cluster_var_i', 'cluster_var_j'], z_cols='Z')
+    >>> # initialization from np.ndarray
+    >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array')
+    >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z)
+    """
+
+    def __init__(
+        self,
+        data,
+        y_col,
+        d_cols,
+        cluster_cols,
+        x_cols=None,
+        z_cols=None,
+        t_col=None,
+        s_col=None,
+        use_other_treat_as_covariate=True,
+        force_all_x_finite=True,
+    ):
+        DoubleMLBaseData.__init__(self, data)
+
+        # we need to set cluster_cols (needs _data) before call to the super __init__ because of the x_cols setter
+        self.cluster_cols = cluster_cols
+        self._set_cluster_vars()
+        DoubleMLData.__init__(
+            self, data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite
+        )
+        self._check_disjoint_sets_cluster_cols()
+
+    def __str__(self):
+        data_summary = self._data_summary_str()
+        buf = io.StringIO()
+        self.data.info(verbose=False, buf=buf)
+        df_info = buf.getvalue()
+        res = (
+            "================== DoubleMLClusterData Object ==================\n"
+            + "\n------------------ Data summary      ------------------\n"
+            + data_summary
+            + "\n------------------ DataFrame info    ------------------\n"
+            + df_info
+        )
+        return res
+
+    def _data_summary_str(self):
+        data_summary = (
+            f"Outcome variable: {self.y_col}\n"
+            f"Treatment variable(s): {self.d_cols}\n"
+            f"Cluster variable(s): {self.cluster_cols}\n"
+            f"Covariates: {self.x_cols}\n"
+            f"Instrument variable(s): {self.z_cols}\n"
+        )
+        if self.t_col is not None:
+            data_summary += f"Time variable: {self.t_col}\n"
+        if self.s_col is not None:
+            data_summary += f"Score/Selection variable: {self.s_col}\n"
+
+        data_summary += f"No. Observations: {self.n_obs}\n"
+        return data_summary
+
+    @classmethod
+    def from_arrays(
+        cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_treat_as_covariate=True, force_all_x_finite=True
+    ):
+        """
+        Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s.
+
+        Parameters
+        ----------
+        x : :class:`numpy.ndarray`
+            Array of covariates.
+
+        y : :class:`numpy.ndarray`
+            Array of the outcome variable.
+
+        d : :class:`numpy.ndarray`
+            Array of treatment variables.
+
+        cluster_vars : :class:`numpy.ndarray`
+            Array of cluster variables.
+
+        z : None or :class:`numpy.ndarray`
+            Array of instrumental variables.
+            Default is ``None``.
+
+        t : :class:`numpy.ndarray`
+            Array of the time variable (only relevant/used for DiD models).
+            Default is ``None``.
+
+        s : :class:`numpy.ndarray`
+            Array of the score or selection variable (only relevant/used for RDD or SSM models).
+            Default is ``None``.
+
+        use_other_treat_as_covariate : bool
+            Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
+            Default is ``True``.
+
+        force_all_x_finite : bool or str
+            Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
+            Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
+            allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
+            Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
+            for the nuisance functions are capable to provide valid predictions with missings and / or infinite values
+            in the covariates ``x``.
+            Default is ``True``.
+
+        Examples
+        --------
+        >>> from doubleml import DoubleMLClusterData
+        >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
+        >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array')
+        >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z)
+        """
+        dml_data = DoubleMLData.from_arrays(x, y, d, z, t, s, use_other_treat_as_covariate, force_all_x_finite)
+        cluster_vars = check_array(cluster_vars, ensure_2d=False, allow_nd=False)
+        cluster_vars = _assure_2d_array(cluster_vars)
+        if cluster_vars.shape[1] == 1:
+            cluster_cols = ["cluster_var"]
+        else:
+            cluster_cols = [f"cluster_var{i + 1}" for i in np.arange(cluster_vars.shape[1])]
+
+        data = pd.concat((pd.DataFrame(cluster_vars, columns=cluster_cols), dml_data.data), axis=1)
+
+        return cls(
+            data,
+            dml_data.y_col,
+            dml_data.d_cols,
+            cluster_cols,
+            dml_data.x_cols,
+            dml_data.z_cols,
+            dml_data.t_col,
+            dml_data.s_col,
+            dml_data.use_other_treat_as_covariate,
+            dml_data.force_all_x_finite,
+        )
+
+    @property
+    def cluster_cols(self):
+        """
+        The cluster variable(s).
+        """
+        return self._cluster_cols
+
+    @cluster_cols.setter
+    def cluster_cols(self, value):
+        reset_value = hasattr(self, "_cluster_cols")
+        if isinstance(value, str):
+            value = [value]
+        if not isinstance(value, list):
+            raise TypeError(
+                "The cluster variable(s) cluster_cols must be of str or list type. "
+                f"{str(value)} of type {str(type(value))} was passed."
+            )
+        if not len(set(value)) == len(value):
+            raise ValueError("Invalid cluster variable(s) cluster_cols: Contains duplicate values.")
+        if not set(value).issubset(set(self.all_variables)):
+            raise ValueError("Invalid cluster variable(s) cluster_cols. At least one cluster variable is no data column.")
+        self._cluster_cols = value
+        if reset_value:
+            self._check_disjoint_sets()
+            self._set_cluster_vars()
+
+    @property
+    def n_cluster_vars(self):
+        """
+        The number of cluster variables.
+        """
+        return len(self.cluster_cols)
+
+    @property
+    def cluster_vars(self):
+        """
+        Array of cluster variable(s).
+        """
+        return self._cluster_vars.values
+
+    def _get_optional_col_sets(self):
+        base_optional_col_sets = super()._get_optional_col_sets()
+        cluster_cols_set = set(self.cluster_cols)
+        return [cluster_cols_set] + base_optional_col_sets
+
+    def _check_disjoint_sets(self):
+        # apply the standard checks from the DoubleMLData class
+        super(DoubleMLClusterData, self)._check_disjoint_sets()
+        self._check_disjoint_sets_cluster_cols()
+
+    def _check_disjoint_sets_cluster_cols(self):
+        # apply the standard checks from the DoubleMLData class
+        super(DoubleMLClusterData, self)._check_disjoint_sets()
+
+        # special checks for the additional cluster variables
+        cluster_cols_set = set(self.cluster_cols)
+        y_col_set = {self.y_col}
+        x_cols_set = set(self.x_cols)
+        d_cols_set = set(self.d_cols)
+
+        z_cols_set = set(self.z_cols or [])
+        t_col_set = {self.t_col} if self.t_col else set()
+        s_col_set = {self.s_col} if self.s_col else set()
+
+        # TODO: X can not be used as cluster variable
+        cluster_checks_args = [
+            (y_col_set, "outcome variable", "``y_col``"),
+            (d_cols_set, "treatment variable", "``d_cols``"),
+            (x_cols_set, "covariate", "``x_cols``"),
+            (z_cols_set, "instrumental variable", "``z_cols``"),
+            (t_col_set, "time variable", "``t_col``"),
+            (s_col_set, "score or selection variable", "``s_col``"),
+        ]
+        for set1, name, argument in cluster_checks_args:
+            self._check_disjoint(
+                set1=set1,
+                name1=name,
+                arg1=argument,
+                set2=cluster_cols_set,
+                name2="cluster variable(s)",
+                arg2="``cluster_cols``",
+            )
+
+    def _set_cluster_vars(self):
+        assert_all_finite(self.data.loc[:, self.cluster_cols])
+        self._cluster_vars = self.data.loc[:, self.cluster_cols]