Skip to content

Commit 5821ca7

Browse files
authored
Merge pull request #292 from DoubleML/did-extension
Extend DiD Classes
2 parents ae15abf + 78463ec commit 5821ca7

File tree

90 files changed

+9741
-909
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

90 files changed

+9741
-909
lines changed

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,4 @@ MANIFEST
3030
*.idea
3131
*.vscode
3232
.flake8
33-
.coverage
33+
.coverage

doubleml/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import importlib.metadata
22

3+
from .data import DoubleMLClusterData, DoubleMLData
34
from .did.did import DoubleMLDID
45
from .did.did_cs import DoubleMLDIDCS
5-
from .double_ml_data import DoubleMLClusterData, DoubleMLData
66
from .double_ml_framework import DoubleMLFramework, concat
77
from .irm.apo import DoubleMLAPO
88
from .irm.apos import DoubleMLAPOS

doubleml/data/__init__.py

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
"""
2+
The :mod:`doubleml.data` module implements data classes for double machine learning.
3+
"""
4+
5+
from .base_data import DoubleMLData
6+
from .cluster_data import DoubleMLClusterData
7+
from .panel_data import DoubleMLPanelData
8+
9+
__all__ = [
10+
"DoubleMLData",
11+
"DoubleMLClusterData",
12+
"DoubleMLPanelData",
13+
]

doubleml/double_ml_data.py renamed to doubleml/data/base_data.py

+143-424
Large diffs are not rendered by default.

doubleml/data/cluster_data.py

+289
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,289 @@
1+
import io
2+
3+
import numpy as np
4+
import pandas as pd
5+
from sklearn.utils import assert_all_finite
6+
from sklearn.utils.validation import check_array
7+
8+
from doubleml.data.base_data import DoubleMLBaseData, DoubleMLData
9+
from doubleml.utils._estimation import _assure_2d_array
10+
11+
12+
class DoubleMLClusterData(DoubleMLData):
    """Double machine learning data-backend for data with cluster variables.

    :class:`DoubleMLClusterData` objects can be initialized from
    :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s.

    Parameters
    ----------
    data : :class:`pandas.DataFrame`
        The data.

    y_col : str
        The outcome variable.

    d_cols : str or list
        The treatment variable(s).

    cluster_cols : str or list
        The cluster variable(s).

    x_cols : None, str or list
        The covariates.
        If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor
        treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates.
        Default is ``None``.

    z_cols : None, str or list
        The instrumental variable(s).
        Default is ``None``.

    t_col : None or str
        The time variable (only relevant/used for DiD Estimators).
        Default is ``None``.

    s_col : None or str
        The score or selection variable (only relevant/used for RDD and SSM Estimators).
        Default is ``None``.

    use_other_treat_as_covariate : bool
        Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
        Default is ``True``.

    force_all_x_finite : bool or str
        Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
        Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
        allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
        Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
        for the nuisance functions are capable to provide valid predictions with missings and / or infinite values
        in the covariates ``x``.
        Default is ``True``.

    Examples
    --------
    >>> from doubleml import DoubleMLClusterData
    >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
    >>> # initialization from pandas.DataFrame
    >>> df = make_pliv_multiway_cluster_CKMS2021(return_type='DataFrame')
    >>> obj_dml_data_from_df = DoubleMLClusterData(df, 'Y', 'D', ['cluster_var_i', 'cluster_var_j'], z_cols='Z')
    >>> # initialization from np.ndarray
    >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array')
    >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z)
    """

    def __init__(
        self,
        data,
        y_col,
        d_cols,
        cluster_cols,
        x_cols=None,
        z_cols=None,
        t_col=None,
        s_col=None,
        use_other_treat_as_covariate=True,
        force_all_x_finite=True,
    ):
        DoubleMLBaseData.__init__(self, data)

        # cluster_cols (which needs _data) has to be set before the DoubleMLData initializer runs
        # because of the x_cols setter
        self.cluster_cols = cluster_cols
        self._set_cluster_vars()
        DoubleMLData.__init__(
            self, data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite
        )
        self._check_disjoint_sets_cluster_cols()

    def __str__(self):
        """Return a textual report: header, data summary and the ``DataFrame.info`` output."""
        data_summary = self._data_summary_str()
        buf = io.StringIO()
        self.data.info(verbose=False, buf=buf)
        df_info = buf.getvalue()
        return (
            "================== DoubleMLClusterData Object ==================\n"
            "\n------------------ Data summary ------------------\n"
            + data_summary
            + "\n------------------ DataFrame info ------------------\n"
            + df_info
        )

    def _data_summary_str(self):
        """Build the summary of variable roles; time and score/selection lines appear only when set."""
        data_summary = (
            f"Outcome variable: {self.y_col}\n"
            f"Treatment variable(s): {self.d_cols}\n"
            f"Cluster variable(s): {self.cluster_cols}\n"
            f"Covariates: {self.x_cols}\n"
            f"Instrument variable(s): {self.z_cols}\n"
        )
        if self.t_col is not None:
            data_summary += f"Time variable: {self.t_col}\n"
        if self.s_col is not None:
            data_summary += f"Score/Selection variable: {self.s_col}\n"

        data_summary += f"No. Observations: {self.n_obs}\n"
        return data_summary

    @classmethod
    def from_arrays(
        cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_treat_as_covariate=True, force_all_x_finite=True
    ):
        """
        Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s.

        The cluster columns are named ``cluster_var`` if there is a single cluster variable and
        ``cluster_var1``, ``cluster_var2``, ... otherwise. All other column names are taken over from
        :meth:`DoubleMLData.from_arrays`.

        Parameters
        ----------
        x : :class:`numpy.ndarray`
            Array of covariates.

        y : :class:`numpy.ndarray`
            Array of the outcome variable.

        d : :class:`numpy.ndarray`
            Array of treatment variables.

        cluster_vars : :class:`numpy.ndarray`
            Array of cluster variables.

        z : None or :class:`numpy.ndarray`
            Array of instrumental variables.
            Default is ``None``.

        t : None or :class:`numpy.ndarray`
            Array of the time variable (only relevant/used for DiD models).
            Default is ``None``.

        s : None or :class:`numpy.ndarray`
            Array of the score or selection variable (only relevant/used for RDD or SSM models).
            Default is ``None``.

        use_other_treat_as_covariate : bool
            Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
            Default is ``True``.

        force_all_x_finite : bool or str
            Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
            Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
            allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
            Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
            for the nuisance functions are capable to provide valid predictions with missings and / or infinite values
            in the covariates ``x``.
            Default is ``True``.

        Examples
        --------
        >>> from doubleml import DoubleMLClusterData
        >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
        >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array')
        >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z)
        """
        # let the non-cluster backend validate and name all remaining columns
        dml_data = DoubleMLData.from_arrays(x, y, d, z, t, s, use_other_treat_as_covariate, force_all_x_finite)
        cluster_vars = check_array(cluster_vars, ensure_2d=False, allow_nd=False)
        cluster_vars = _assure_2d_array(cluster_vars)
        n_cluster_vars = cluster_vars.shape[1]
        if n_cluster_vars == 1:
            cluster_cols = ["cluster_var"]
        else:
            cluster_cols = [f"cluster_var{i + 1}" for i in range(n_cluster_vars)]

        # cluster columns are placed in front of the columns generated by DoubleMLData.from_arrays
        data = pd.concat((pd.DataFrame(cluster_vars, columns=cluster_cols), dml_data.data), axis=1)

        return cls(
            data,
            dml_data.y_col,
            dml_data.d_cols,
            cluster_cols,
            dml_data.x_cols,
            dml_data.z_cols,
            dml_data.t_col,
            dml_data.s_col,
            dml_data.use_other_treat_as_covariate,
            dml_data.force_all_x_finite,
        )

    @property
    def cluster_cols(self):
        """
        The cluster variable(s).
        """
        return self._cluster_cols

    @cluster_cols.setter
    def cluster_cols(self, value):
        # on the very first assignment (from __init__) the other column roles are not set yet,
        # so the disjointness checks are only triggered when an existing value is replaced
        reset_value = hasattr(self, "_cluster_cols")
        if isinstance(value, str):
            value = [value]
        if not isinstance(value, list):
            raise TypeError(
                "The cluster variable(s) cluster_cols must be of str or list type. "
                f"{str(value)} of type {str(type(value))} was passed."
            )
        if len(set(value)) != len(value):
            raise ValueError("Invalid cluster variable(s) cluster_cols: Contains duplicate values.")
        if not set(value).issubset(set(self.all_variables)):
            raise ValueError("Invalid cluster variable(s) cluster_cols. At least one cluster variable is no data column.")
        self._cluster_cols = value
        if reset_value:
            self._check_disjoint_sets()
            self._set_cluster_vars()

    @property
    def n_cluster_vars(self):
        """
        The number of cluster variables.
        """
        return len(self.cluster_cols)

    @property
    def cluster_vars(self):
        """
        Array of cluster variable(s).
        """
        return self._cluster_vars.values

    def _get_optional_col_sets(self):
        """Return the parent's optional column sets with the cluster column set prepended."""
        base_optional_col_sets = super()._get_optional_col_sets()
        cluster_cols_set = set(self.cluster_cols)
        return [cluster_cols_set] + base_optional_col_sets

    def _check_disjoint_sets(self):
        """Run the parent-class disjointness checks followed by the cluster-specific ones."""
        # _check_disjoint_sets_cluster_cols already starts with the parent-class checks,
        # so they must not be triggered a second time here
        self._check_disjoint_sets_cluster_cols()

    def _check_disjoint_sets_cluster_cols(self):
        """Check that the cluster columns do not overlap with any other column role."""
        # apply the standard checks from the DoubleMLData class
        super()._check_disjoint_sets()

        # special checks for the additional cluster variables
        cluster_cols_set = set(self.cluster_cols)
        y_col_set = {self.y_col}
        x_cols_set = set(self.x_cols)
        d_cols_set = set(self.d_cols)

        z_cols_set = set(self.z_cols or [])
        t_col_set = {self.t_col} if self.t_col else set()
        s_col_set = {self.s_col} if self.s_col else set()

        # TODO: X can not be used as cluster variable
        cluster_checks_args = [
            (y_col_set, "outcome variable", "``y_col``"),
            (d_cols_set, "treatment variable", "``d_cols``"),
            (x_cols_set, "covariate", "``x_cols``"),
            (z_cols_set, "instrumental variable", "``z_cols``"),
            (t_col_set, "time variable", "``t_col``"),
            (s_col_set, "score or selection variable", "``s_col``"),
        ]
        for set1, name, argument in cluster_checks_args:
            self._check_disjoint(
                set1=set1,
                name1=name,
                arg1=argument,
                set2=cluster_cols_set,
                name2="cluster variable(s)",
                arg2="``cluster_cols``",
            )

    def _set_cluster_vars(self):
        """Validate the cluster columns with ``assert_all_finite`` and cache them as ``_cluster_vars``."""
        cluster_vars = self.data.loc[:, self.cluster_cols]
        assert_all_finite(cluster_vars)
        self._cluster_vars = cluster_vars

0 commit comments

Comments
 (0)