Add Class for Repeated Cross-Sectional Data #330

Status: Draft. Wants to merge 24 commits into base: main.

Commits (24)
ac858cd  add a cross-sectional dgp  (SvenKlaassen, Jun 2, 2025)
10e532e  add simple test cases for cross sectional dgp  (SvenKlaassen, Jun 2, 2025)
c96605d  reset index for in panel data  (SvenKlaassen, Jun 3, 2025)
61dbf11  add basic did_cs_binary version with simple tests  (SvenKlaassen, Jun 3, 2025)
ceebc6e  add internal atribute _score_dim to DoubleML class  (SvenKlaassen, Jun 3, 2025)
ade3b9a  check prediction size based on internal n_obs  (SvenKlaassen, Jun 3, 2025)
f113e61  update score dimensions init in the cs object  (SvenKlaassen, Jun 3, 2025)
9ef4e53  update lambda and p calculation in did_cs  (SvenKlaassen, Jun 5, 2025)
e90441b  add _score_dim property to doubleml class  (SvenKlaassen, Jun 5, 2025)
9f6f5d4  add _n_obs_sample_splitting property to doubleml class  (SvenKlaassen, Jun 5, 2025)
eb951c4  update check_resampling input  (SvenKlaassen, Jun 5, 2025)
a6c6507  update did binary classes with n_obs_subset and n_obs_sample_splitting  (SvenKlaassen, Jun 5, 2025)
d54b272  update tune without folds to n_obs of doubleml obj  (SvenKlaassen, Jun 6, 2025)
693e109  change n_obs for panel data  (SvenKlaassen, Jun 6, 2025)
7d6ef35  fix order test  (SvenKlaassen, Jun 6, 2025)
18c3844  add sensitivity estimation to did_cs_binary  (SvenKlaassen, Jun 6, 2025)
5d2232b  fix id positions and scaling for sensitivity  (SvenKlaassen, Jun 6, 2025)
7f01b6b  add placebo test for did_cs_binary  (SvenKlaassen, Jun 6, 2025)
3fafccc  extend ext prediction tests for did_cs_binary  (SvenKlaassen, Jun 6, 2025)
9e37851  add control group test for did_cs_binary  (SvenKlaassen, Jun 6, 2025)
810eade  add tune to did_cs_binary  (SvenKlaassen, Jun 6, 2025)
6b6116c  update did_cs_binary sdout test  (SvenKlaassen, Jun 6, 2025)
de324cf  add exceptions and tests  (SvenKlaassen, Jun 6, 2025)
8d0c52c  simplify did_cs_binary nuisance estimation  (SvenKlaassen, Jun 6, 2025)
doubleml/data/panel_data.py (4 additions, 2 deletions)

@@ -106,6 +106,8 @@ def __init__(
             force_all_x_finite=force_all_x_finite,
             force_all_d_finite=False,
         )
+        # reset index to ensure a simple RangeIndex
+        self.data.reset_index(drop=True, inplace=True)
         if self.n_treat != 1:
             raise ValueError("Only one treatment column is allowed for panel data.")

@@ -213,9 +215,9 @@ def id_var_unique(self):
         return self._id_var_unique

     @property
-    def n_obs(self):
+    def n_ids(self):
         """
-        The number of observations. For panel data, the number of unique values for id_col.
+        The number of unique values for id_col.
         """
         return len(self._id_var_unique)
doubleml/data/tests/test_panel_data.py (3 additions, 2 deletions)

@@ -56,7 +56,7 @@ def test_id_col_setter():
     dml_data.id_col = "id_new"
     assert np.array_equal(dml_data.id_var, id_comp)
     assert dml_data._id_var_unique == np.unique(id_comp)
-    assert dml_data.n_obs == 1
+    assert dml_data.n_ids == 1

     msg = "Invalid id variable id_col. a13 is no data column."
     with pytest.raises(ValueError, match=msg):

@@ -169,7 +169,8 @@ def test_panel_data_properties():

     assert np.array_equal(dml_data.id_var, df["id"].values)
     assert np.array_equal(dml_data.id_var_unique, np.unique(df["id"].values))
-    assert dml_data.n_obs == len(np.unique(df["id"].values))
+    assert dml_data.n_obs == df.shape[0]
+    assert dml_data.n_ids == len(np.unique(df["id"].values))
     assert dml_data.g_col == "d"
     assert np.array_equal(dml_data.g_values, np.sort(np.unique(df["d"].values)))
     assert dml_data.n_groups == len(np.unique(df["d"].values))
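The data-side change is summarized by the asserts above: after this PR, `n_obs` counts rows of the panel while the new `n_ids` counts unique units. A minimal sketch of the distinction (the constructor arguments follow the DoubleML documentation and are my assumptions here, not part of the diff):

```python
import pandas as pd
from doubleml.data import DoubleMLPanelData

# two units observed in two periods each -> 4 rows, 2 ids
df = pd.DataFrame({
    "id": [1, 1, 2, 2],
    "t": [0.0, 1.0, 0.0, 1.0],
    "y": [1.0, 2.0, 3.0, 4.0],
    "d": [0.0, 0.0, 1.0, 1.0],  # first-treatment period per unit
    "x1": [0.1, 0.2, 0.3, 0.4],
})
dml_data = DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["x1"])
print(dml_data.n_obs, dml_data.n_ids)  # 4, 2 with this PR
```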
doubleml/did/__init__.py (2 additions)

@@ -6,12 +6,14 @@
 from .did_aggregation import DoubleMLDIDAggregation
 from .did_binary import DoubleMLDIDBinary
 from .did_cs import DoubleMLDIDCS
+from .did_cs_binary import DoubleMLDIDCSBinary
 from .did_multi import DoubleMLDIDMulti

 __all__ = [
     "DoubleMLDIDAggregation",
     "DoubleMLDID",
     "DoubleMLDIDCS",
     "DoubleMLDIDBinary",
+    "DoubleMLDIDCSBinary",
     "DoubleMLDIDMulti",
 ]
doubleml/did/datasets/__init__.py (2 additions)

@@ -3,9 +3,11 @@
 """

 from .dgp_did_CS2021 import make_did_CS2021
+from .dgp_did_cs_CS2021 import make_did_cs_CS2021
 from .dgp_did_SZ2020 import make_did_SZ2020

 __all__ = [
     "make_did_SZ2020",
     "make_did_CS2021",
+    "make_did_cs_CS2021",
 ]
doubleml/did/datasets/dgp_did_cs_CS2021.py (new file, 190 additions)
import numpy as np

from doubleml.did.datasets.dgp_did_CS2021 import make_did_CS2021

# Based on https://doi.org/10.1016/j.jeconom.2020.12.001 (see Appendix SC)
# and https://d2cml-ai.github.io/csdid/examples/csdid_basic.html#Examples-with-simulated-data
# Cross-sectional version of the data generating process (DGP) for Callaway and Sant'Anna (2021)


def make_did_cs_CS2021(n_obs=1000, dgp_type=1, include_never_treated=True, lambda_t=0.5, time_type="datetime", **kwargs):
"""
Generate synthetic repeated cross-sectional data for difference-in-differences analysis based on
Callaway and Sant'Anna (2021).

This function creates repeated cross-sectional data with heterogeneous treatment effects across time periods and groups.
The data includes pre-treatment periods, multiple treatment groups that receive treatment at different times,
and optionally a never-treated group that serves as a control. The true average treatment effect on the
treated (ATT) has a heterogeneous structure dependent on covariates and exposure time.

The data generating process offers six variations (``dgp_type`` 1-6) that differ in how the regression features
and propensity score features are derived:

- DGP 1: Outcome and propensity score are linear (in Z)
- DGP 2: Outcome is linear, propensity score is nonlinear
- DGP 3: Outcome is nonlinear, propensity score is linear
- DGP 4: Outcome and propensity score are nonlinear
- DGP 5: Outcome is linear, propensity score is constant (experimental setting)
- DGP 6: Outcome is nonlinear, propensity score is constant (experimental setting)

Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries
:math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix.

Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`,
where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`,
:math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`.

For a feature vector :math:`W=(W_1, W_2, W_3, W_4)^T` (either X or Z based on ``dgp_type``), the core functions are:

1. Time-varying outcome regression function for each time period :math:`t`:

.. math::

f_{reg,t}(W) = 210 + \\frac{t}{T} \\cdot (27.4 \\cdot W_1 + 13.7 \\cdot W_2 + 13.7 \\cdot W_3 + 13.7 \\cdot W_4)

2. Group-specific propensity function for each treatment group :math:`g`:

.. math::

f_{ps,g}(W) = \\xi \\cdot \\left(1-\\frac{g}{G}\\right) \\cdot
(-W_1 + 0.5 \\cdot W_2 - 0.25 \\cdot W_3 - 0.2\\cdot W_4)

where :math:`T` is the number of time periods, :math:`G` is the number of treatment groups, and :math:`\\xi` is a
scale parameter (default: 0.9).

The panel data model is defined with the following components:

1. Time effects: :math:`\\delta_t = t` for time period :math:`t`

2. Individual effects: :math:`\\eta_i \\sim \\mathcal{N}(g_i, 1)` where :math:`g_i` is unit :math:`i`'s treatment group

3. Treatment effects: For a unit in treatment group :math:`g`, the effect in period :math:`t` is:

.. math::

\\theta_{i,t,g} = \\max(t - t_g + 1, 0) + 0.1 \\cdot X_{i,1} \\cdot \\max(t - t_g + 1, 0)

where :math:`t_g` is the first treatment period for group :math:`g`, :math:`X_{i,1}` is the first covariate for unit
:math:`i`, and :math:`\\max(t - t_g + 1, 0)` represents the exposure time (0 for pre-treatment periods).

4. Potential outcomes for unit :math:`i` in period :math:`t`:

.. math::

Y_{i,t}(0) &= f_{reg,t}(W_{reg}) + \\delta_t + \\eta_i + \\varepsilon_{i,0,t}

Y_{i,t}(1) &= Y_{i,t}(0) + \\theta_{i,t,g} + (\\varepsilon_{i,1,t} - \\varepsilon_{i,0,t})

where :math:`\\varepsilon_{i,0,t}, \\varepsilon_{i,1,t} \\sim \\mathcal{N}(0, 1)`.

5. Observed outcomes:

.. math::

Y_{i,t} = Y_{i,t}(1) \\cdot 1\\{t \\geq t_g\\} + Y_{i,t}(0) \\cdot 1\\{t < t_g\\}

6. Treatment assignment:

For non-experimental settings (DGP 1-4), the probability of being in treatment group :math:`g` is:

.. math::

P(G_i = g) = \\frac{\\exp(f_{ps,g}(W_{ps}))}{\\sum_{g'} \\exp(f_{ps,g'}(W_{ps}))}

For experimental settings (DGP 5-6), each treatment group (including never-treated) has equal probability:

.. math::

P(G_i = g) = \\frac{1}{G} \\text{ for all } g

7. Steps 1-6 generate panel data. To obtain repeated cross-sectional data, the number of generated individuals
is increased to `n_obs/lambda_t`, where `lambda_t` denotes the probability of observing a unit in a given
time period (constant over time). For each time period, units are then observed independently with
probability `lambda_t`.


The variables :math:`W_{reg}` and :math:`W_{ps}` are selected based on the DGP type:

.. math::

DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z

DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X

DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z

DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X

DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0

DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0

where settings 5-6 correspond to experimental designs with equal probability across treatment groups.


Parameters
----------
n_obs : int, default=1000
The number of observations to simulate (approximately the expected number of observed units per time period).

dgp_type : int, default=1
The data generating process to be used (1-6).

include_never_treated : bool, default=True
Whether to include units that are never treated.

lambda_t : float, default=0.5
Probability of observing a unit at each time period.

time_type : str, default="datetime"
Type of time variable. Either "datetime" or "float".

**kwargs
Additional keyword arguments. Accepts the following parameters:

`c` (float, default=0.0):
Parameter for correlation structure in X.

`dim_x` (int, default=4):
Dimension of feature vectors.

`xi` (float, default=0.9):
Scale parameter for the propensity score function.

`n_periods` (int, default=5):
Number of time periods.

`anticipation_periods` (int, default=0):
Number of periods before treatment where anticipation effects occur.

`n_pre_treat_periods` (int, default=2):
Number of pre-treatment periods.

`start_date` (str, default="2025-01"):
Start date for datetime time variables.

Returns
-------
pandas.DataFrame
DataFrame containing the simulated repeated cross-sectional data.

References
----------
Callaway, B. and Sant’Anna, P. H. (2021),
Difference-in-Differences with multiple time periods. Journal of Econometrics, 225(2), 200-230.
doi:`10.1016/j.jeconom.2020.12.001 <https://doi.org/10.1016/j.jeconom.2020.12.001>`_.
"""

n_obs_panel = int(np.ceil(n_obs / lambda_t))
df_panel = make_did_CS2021(
n_obs=n_obs_panel,
dgp_type=dgp_type,
include_never_treated=include_never_treated,
time_type=time_type,
**kwargs,
)

# for each time period, randomly select units to observe
observed_units = np.random.binomial(1, lambda_t, size=(len(df_panel.index)))
df_repeated_cs = df_panel[observed_units == 1].copy()

return df_repeated_cs
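A minimal usage sketch for the new helper (the column names `id` and `t` follow `make_did_CS2021` and are assumptions on my part; the helper draws from the global NumPy RNG, so the counts below are random):

```python
import numpy as np

from doubleml.did.datasets import make_did_cs_CS2021

np.random.seed(42)
df = make_did_cs_CS2021(n_obs=1000, dgp_type=1, lambda_t=0.5, time_type="float")

# repeated cross-section: units are no longer observed in every period,
# and each period retains roughly n_obs units on average
print(df.groupby("t").size())
print(df.groupby("id").size().value_counts())
```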
doubleml/did/did_binary.py (14 additions, 8 deletions)

@@ -124,6 +124,12 @@ def __init__(
         super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting=False)

         self._check_data(self._dml_data)
+        # for did panel data the scores are based on the number of unique ids
+        self._n_obs = obj_dml_data.n_ids
+        self._score_dim = (self._n_obs, self.n_rep, self._dml_data.n_treat)
+        # reinitialize arrays
+        self._initialize_arrays()
+
         g_values = self._dml_data.g_values
         t_values = self._dml_data.t_values

@@ -171,8 +177,7 @@

         # Numeric values for positions of the entries in id_panel_data inside id_original
         # np.nonzero(np.isin(id_original, id_panel_data))
-        self._n_subset = self._panel_data_wide.shape[0]
-        self._n_obs = self._n_subset  # Effective sample size used for resampling
+        self._n_obs_subset = self._panel_data_wide.shape[0]  # Effective sample size used for resampling
         self._n_treated_subset = self._panel_data_wide["G_indicator"].sum()

         # Save x and y for later ML estimation

@@ -192,6 +197,7 @@

         # set stratification for resampling
         self._strata = self._panel_data_wide["G_indicator"]
+        self._n_obs_sample_splitting = self.n_obs_subset
         if draw_sample_splitting:
             self.draw_sample_splitting()

@@ -244,7 +250,7 @@ def __str__(self):
             f"Evaluation period: {str(self.t_value_eval)}\n"
             f"Control group: {str(self.control_group)}\n"
             f"Anticipation periods: {str(self.anticipation_periods)}\n"
-            f"Effective sample size: {str(self.n_obs)}\n"
+            f"Effective sample size: {str(self.n_obs_subset)}\n"
         )
         learner_info = ""
         for key, value in self.learner.items():

@@ -371,11 +377,11 @@ def trimming_threshold(self):
         return self._trimming_threshold

     @property
-    def n_obs(self):
+    def n_obs_subset(self):
         """
         The number of observations used for estimation.
         """
-        return self._n_subset
+        return self._n_obs_subset

     def _initialize_ml_nuisance_params(self):
         if self.score == "observational":

@@ -542,7 +548,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False):
         psi_a, psi_b = self._score_elements(y, d, g_hat0["preds"], g_hat1["preds"], m_hat["preds"], p_hat)

         extend_kwargs = {
-            "n_obs": self._dml_data.n_obs,
+            "n_obs": self._dml_data.n_ids,
             "id_positions": self.id_positions,
         }
         psi_elements = {

@@ -707,13 +713,13 @@ def _sensitivity_element_est(self, preds):
         psi_nu2 = nu2_score_element - nu2

         extend_kwargs = {
-            "n_obs": self._dml_data.n_obs,
+            "n_obs": self._dml_data.n_ids,
             "id_positions": self.id_positions,
             "fill_value": 0.0,
         }

         # add scaling to make variance estimation consistent (sample size difference)
-        scaling = self._dml_data.n_obs / self._n_subset
+        scaling = self._dml_data.n_ids / self._n_obs_subset
         element_dict = {
             "sigma2": sigma2,
             "nu2": nu2,
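For intuition on the scaling line, a small sketch (my reading of the diff, not PR code): the sensitivity elements are computed on the `n_obs_subset` rows actually used and then extended to the full id vector of length `n_ids` with `fill_value=0.0`; averaging the zero-padded vector shrinks means by `n_obs_subset / n_ids`, and multiplying by `n_ids / n_obs_subset` undoes exactly that.

```python
import numpy as np

rng = np.random.default_rng(0)
n_ids, n_obs_subset = 100, 60

psi_subset = rng.normal(size=n_obs_subset)  # score elements on the used subset
id_positions = rng.choice(n_ids, size=n_obs_subset, replace=False)

# extend to the full id vector, padding unused ids with 0.0
psi_full = np.zeros(n_ids)
psi_full[id_positions] = psi_subset

scaling = n_ids / n_obs_subset
# the zero padding shrinks the mean by n_obs_subset / n_ids; scaling restores it
np.testing.assert_allclose(scaling * psi_full.mean(), psi_subset.mean())
```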
doubleml/did/did_cs.py (2 additions, 6 deletions)

@@ -219,14 +219,10 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False):

         # THIS DIFFERS FROM THE PAPER due to stratified splitting this should be the same for each fold
         # nuisance estimates of the uncond. treatment prob.
-        p_hat = np.full_like(d, np.nan, dtype="float64")
-        for train_index, test_index in smpls:
-            p_hat[test_index] = np.mean(d[train_index])
+        p_hat = np.full_like(d, d.mean(), dtype="float64")

         # nuisance estimates of the uncond. time prob.
-        lambda_hat = np.full_like(t, np.nan, dtype="float64")
-        for train_index, test_index in smpls:
-            lambda_hat[test_index] = np.mean(t[train_index])
+        lambda_hat = np.full_like(t, t.mean(), dtype="float64")

         # nuisance g
         smpls_d0_t0, smpls_d0_t1, smpls_d1_t0, smpls_d1_t1 = _get_cond_smpls_2d(smpls, d, t)
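A sanity-check sketch of the simplification (my illustration under the assumption of folds stratified on the treatment indicator, not PR code): every training split then contains almost the same share of treated units, so the per-fold train means computed by the deleted loop agree with the single global mean up to rounding of fold sizes.

```python
import numpy as np
from sklearn.model_selection import StratifiedKFold

rng = np.random.default_rng(0)
d = rng.binomial(1, 0.4, size=1000).astype("float64")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
smpls = list(cv.split(d.reshape(-1, 1), d))

# old behavior: per-fold means of the training split
p_hat_old = np.full_like(d, np.nan, dtype="float64")
for train_index, test_index in smpls:
    p_hat_old[test_index] = np.mean(d[train_index])

# new behavior: one global mean for all observations
p_hat_new = np.full_like(d, d.mean(), dtype="float64")

print(np.abs(p_hat_old - p_hat_new).max())  # tiny, only fold-size rounding
```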