Add Class for Repeated Cross-Sectional Data #330

Status: Draft. Wants to merge 24 commits into base: main.

Commits (24)
ac858cd  add a cross-sectional dgp  (SvenKlaassen, Jun 2, 2025)
10e532e  add simple test cases for cross sectional dgp  (SvenKlaassen, Jun 2, 2025)
c96605d  reset index for in panel data  (SvenKlaassen, Jun 3, 2025)
61dbf11  add basic did_cs_binary version with simple tests  (SvenKlaassen, Jun 3, 2025)
ceebc6e  add internal atribute _score_dim to DoubleML class  (SvenKlaassen, Jun 3, 2025)
ade3b9a  check prediction size based on internal n_obs  (SvenKlaassen, Jun 3, 2025)
f113e61  update score dimensions init in the cs object  (SvenKlaassen, Jun 3, 2025)
9ef4e53  update lambda and p calculation in did_cs  (SvenKlaassen, Jun 5, 2025)
e90441b  add _score_dim property to doubleml class  (SvenKlaassen, Jun 5, 2025)
9f6f5d4  add _n_obs_sample_splitting property to doubleml class  (SvenKlaassen, Jun 5, 2025)
eb951c4  update check_resampling input  (SvenKlaassen, Jun 5, 2025)
a6c6507  update did binary classes with n_obs_subset and n_obs_sample_splitting  (SvenKlaassen, Jun 5, 2025)
d54b272  update tune without folds to n_obs of doubleml obj  (SvenKlaassen, Jun 6, 2025)
693e109  change n_obs for panel data  (SvenKlaassen, Jun 6, 2025)
7d6ef35  fix order test  (SvenKlaassen, Jun 6, 2025)
18c3844  add sensitivity estimation to did_cs_binary  (SvenKlaassen, Jun 6, 2025)
5d2232b  fix id positions and scaling for sensitivity  (SvenKlaassen, Jun 6, 2025)
7f01b6b  add placebo test for did_cs_binary  (SvenKlaassen, Jun 6, 2025)
3fafccc  extend ext prediction tests for did_cs_binary  (SvenKlaassen, Jun 6, 2025)
9e37851  add control group test for did_cs_binary  (SvenKlaassen, Jun 6, 2025)
810eade  add tune to did_cs_binary  (SvenKlaassen, Jun 6, 2025)
6b6116c  update did_cs_binary sdout test  (SvenKlaassen, Jun 6, 2025)
de324cf  add exceptions and tests  (SvenKlaassen, Jun 6, 2025)
8d0c52c  simplify did_cs_binary nuisance estimation  (SvenKlaassen, Jun 6, 2025)
doubleml/data/panel_data.py (4 additions, 2 deletions)

@@ -106,6 +106,8 @@ def __init__(
             force_all_x_finite=force_all_x_finite,
             force_all_d_finite=False,
         )
+        # reset index to ensure a simple RangeIndex
+        self.data.reset_index(drop=True, inplace=True)
         if self.n_treat != 1:
             raise ValueError("Only one treatment column is allowed for panel data.")

@@ -213,9 +215,9 @@ def id_var_unique(self):
         return self._id_var_unique

     @property
-    def n_obs(self):
+    def n_ids(self):
         """
-        The number of observations. For panel data, the number of unique values for id_col.
+        The number of unique values for id_col.
         """
         return len(self._id_var_unique)
doubleml/data/tests/test_panel_data.py (3 additions, 2 deletions)

@@ -56,7 +56,7 @@ def test_id_col_setter():
     dml_data.id_col = "id_new"
     assert np.array_equal(dml_data.id_var, id_comp)
     assert dml_data._id_var_unique == np.unique(id_comp)
-    assert dml_data.n_obs == 1
+    assert dml_data.n_ids == 1

     msg = "Invalid id variable id_col. a13 is no data column."
     with pytest.raises(ValueError, match=msg):

@@ -169,7 +169,8 @@ def test_panel_data_properties():

     assert np.array_equal(dml_data.id_var, df["id"].values)
     assert np.array_equal(dml_data.id_var_unique, np.unique(df["id"].values))
-    assert dml_data.n_obs == len(np.unique(df["id"].values))
+    assert dml_data.n_obs == df.shape[0]
+    assert dml_data.n_ids == len(np.unique(df["id"].values))
     assert dml_data.g_col == "d"
     assert np.array_equal(dml_data.g_values, np.sort(np.unique(df["d"].values)))
     assert dml_data.n_groups == len(np.unique(df["d"].values))
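The data-side change is summarized by the asserts above: after this PR, `n_obs` counts rows of the panel while the new `n_ids` counts unique units. A minimal sketch of the distinction (the constructor arguments follow the DoubleML documentation and are my assumptions here, not part of the diff):

```python
import pandas as pd
from doubleml.data import DoubleMLPanelData

# two units observed in two periods each -> 4 rows, 2 ids
df = pd.DataFrame({
    "id": [1, 1, 2, 2],
    "t": [0.0, 1.0, 0.0, 1.0],
    "y": [1.0, 2.0, 3.0, 4.0],
    "d": [0.0, 0.0, 1.0, 1.0],  # first-treatment period per unit
    "x1": [0.1, 0.2, 0.3, 0.4],
})
dml_data = DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["x1"])
print(dml_data.n_obs, dml_data.n_ids)  # 4, 2 with this PR
```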
doubleml/did/__init__.py (2 additions)

@@ -6,12 +6,14 @@
 from .did_aggregation import DoubleMLDIDAggregation
 from .did_binary import DoubleMLDIDBinary
 from .did_cs import DoubleMLDIDCS
+from .did_cs_binary import DoubleMLDIDCSBinary
 from .did_multi import DoubleMLDIDMulti

 __all__ = [
     "DoubleMLDIDAggregation",
     "DoubleMLDID",
     "DoubleMLDIDCS",
     "DoubleMLDIDBinary",
+    "DoubleMLDIDCSBinary",
     "DoubleMLDIDMulti",
 ]
doubleml/did/datasets/__init__.py (2 additions)

@@ -3,9 +3,11 @@
 """

 from .dgp_did_CS2021 import make_did_CS2021
+from .dgp_did_cs_CS2021 import make_did_cs_CS2021
 from .dgp_did_SZ2020 import make_did_SZ2020

 __all__ = [
     "make_did_SZ2020",
     "make_did_CS2021",
+    "make_did_cs_CS2021",
 ]
doubleml/did/datasets/dgp_did_cs_CS2021.py (new file, 190 additions)
import numpy as np

from doubleml.did.datasets.dgp_did_CS2021 import make_did_CS2021

# Based on https://doi.org/10.1016/j.jeconom.2020.12.001 (see Appendix SC)
# and https://d2cml-ai.github.io/csdid/examples/csdid_basic.html#Examples-with-simulated-data
# Cross-sectional version of the data generating process (DGP) for Callaway and Sant'Anna (2021)


def make_did_cs_CS2021(n_obs=1000, dgp_type=1, include_never_treated=True, lambda_t=0.5, time_type="datetime", **kwargs):
"""
Generate synthetic repeated cross-sectional data for difference-in-differences analysis based on
Callaway and Sant'Anna (2021).

This function creates repeated cross-sectional data with heterogeneous treatment effects across time periods and groups.
The data includes pre-treatment periods, multiple treatment groups that receive treatment at different times,
and optionally a never-treated group that serves as a control. The true average treatment effect on the
treated (ATT) has a heterogeneous structure dependent on covariates and exposure time.

The data generating process offers six variations (``dgp_type`` 1-6) that differ in how the regression features
and propensity score features are derived:

- DGP 1: Outcome and propensity score are linear (in Z)
- DGP 2: Outcome is linear, propensity score is nonlinear
- DGP 3: Outcome is nonlinear, propensity score is linear
- DGP 4: Outcome and propensity score are nonlinear
- DGP 5: Outcome is linear, propensity score is constant (experimental setting)
- DGP 6: Outcome is nonlinear, propensity score is constant (experimental setting)

Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries
:math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix.

Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`,
where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`,
:math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`.

For a feature vector :math:`W=(W_1, W_2, W_3, W_4)^T` (either X or Z based on ``dgp_type``), the core functions are:

1. Time-varying outcome regression function for each time period :math:`t`:

.. math::

f_{reg,t}(W) = 210 + \\frac{t}{T} \\cdot (27.4 \\cdot W_1 + 13.7 \\cdot W_2 + 13.7 \\cdot W_3 + 13.7 \\cdot W_4)

2. Group-specific propensity function for each treatment group :math:`g`:

.. math::

f_{ps,g}(W) = \\xi \\cdot \\left(1-\\frac{g}{G}\\right) \\cdot
(-W_1 + 0.5 \\cdot W_2 - 0.25 \\cdot W_3 - 0.2\\cdot W_4)

where :math:`T` is the number of time periods, :math:`G` is the number of treatment groups, and :math:`\\xi` is a
scale parameter (default: 0.9).

The panel data model is defined with the following components:

1. Time effects: :math:`\\delta_t = t` for time period :math:`t`

2. Individual effects: :math:`\\eta_i \\sim \\mathcal{N}(g_i, 1)` where :math:`g_i` is unit :math:`i`'s treatment group

3. Treatment effects: For a unit in treatment group :math:`g`, the effect in period :math:`t` is:

.. math::

\\theta_{i,t,g} = \\max(t - t_g + 1, 0) + 0.1 \\cdot X_{i,1} \\cdot \\max(t - t_g + 1, 0)

where :math:`t_g` is the first treatment period for group :math:`g`, :math:`X_{i,1}` is the first covariate for unit
:math:`i`, and :math:`\\max(t - t_g + 1, 0)` represents the exposure time (0 for pre-treatment periods).

4. Potential outcomes for unit :math:`i` in period :math:`t`:

.. math::

Y_{i,t}(0) &= f_{reg,t}(W_{reg}) + \\delta_t + \\eta_i + \\varepsilon_{i,0,t}

Y_{i,t}(1) &= Y_{i,t}(0) + \\theta_{i,t,g} + (\\varepsilon_{i,1,t} - \\varepsilon_{i,0,t})

where :math:`\\varepsilon_{i,0,t}, \\varepsilon_{i,1,t} \\sim \\mathcal{N}(0, 1)`.

5. Observed outcomes:

.. math::

Y_{i,t} = Y_{i,t}(1) \\cdot 1\\{t \\geq t_g\\} + Y_{i,t}(0) \\cdot 1\\{t < t_g\\}

6. Treatment assignment:

For non-experimental settings (DGP 1-4), the probability of being in treatment group :math:`g` is:

.. math::

P(G_i = g) = \\frac{\\exp(f_{ps,g}(W_{ps}))}{\\sum_{g'} \\exp(f_{ps,g'}(W_{ps}))}

For experimental settings (DGP 5-6), each treatment group (including never-treated) has equal probability:

.. math::

P(G_i = g) = \\frac{1}{G} \\text{ for all } g

7. Steps 1-6 generate panel data. To obtain repeated cross-sectional data, the number of generated individuals
is increased to `n_obs/lambda_t`, where `lambda_t` denotes the probability of observing a unit in a given
time period (constant over time). For each time period, units are then observed independently with
probability `lambda_t`.


The variables :math:`W_{reg}` and :math:`W_{ps}` are selected based on the DGP type:

.. math::

DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z

DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X

DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z

DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X

DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0

DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0

where settings 5-6 correspond to experimental designs with equal probability across treatment groups.


Parameters
----------
n_obs : int, default=1000
The number of observations to simulate (approximately the expected number of observed units per time period).

dgp_type : int, default=1
The data generating process to be used (1-6).

include_never_treated : bool, default=True
Whether to include units that are never treated.

lambda_t : float, default=0.5
Probability of observing a unit at each time period.

time_type : str, default="datetime"
Type of time variable. Either "datetime" or "float".

**kwargs
Additional keyword arguments. Accepts the following parameters:

`c` (float, default=0.0):
Parameter for correlation structure in X.

`dim_x` (int, default=4):
Dimension of feature vectors.

`xi` (float, default=0.9):
Scale parameter for the propensity score function.

`n_periods` (int, default=5):
Number of time periods.

`anticipation_periods` (int, default=0):
Number of periods before treatment where anticipation effects occur.

`n_pre_treat_periods` (int, default=2):
Number of pre-treatment periods.

`start_date` (str, default="2025-01"):
Start date for datetime time variables.

Returns
-------
pandas.DataFrame
DataFrame containing the simulated repeated cross-sectional data.

References
----------
Callaway, B. and Sant’Anna, P. H. (2021),
Difference-in-Differences with multiple time periods. Journal of Econometrics, 225(2), 200-230.
doi:`10.1016/j.jeconom.2020.12.001 <https://doi.org/10.1016/j.jeconom.2020.12.001>`_.
"""

n_obs_panel = int(np.ceil(n_obs / lambda_t))
df_panel = make_did_CS2021(
n_obs=n_obs_panel,
dgp_type=dgp_type,
include_never_treated=include_never_treated,
time_type=time_type,
**kwargs,
)

# for each time period, randomly select units to observe
observed_units = np.random.binomial(1, lambda_t, size=(len(df_panel.index)))
df_repeated_cs = df_panel[observed_units == 1].copy()

return df_repeated_cs
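A minimal usage sketch for the new helper (the column names `id` and `t` follow `make_did_CS2021` and are assumptions on my part; the helper draws from the global NumPy RNG, so the counts below are random):

```python
import numpy as np

from doubleml.did.datasets import make_did_cs_CS2021

np.random.seed(42)
df = make_did_cs_CS2021(n_obs=1000, dgp_type=1, lambda_t=0.5, time_type="float")

# repeated cross-section: units are no longer observed in every period,
# and each period retains roughly n_obs units on average
print(df.groupby("t").size())
print(df.groupby("id").size().value_counts())
```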
doubleml/did/did_binary.py (14 additions, 8 deletions)

@@ -124,6 +124,12 @@ def __init__(
         super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting=False)

         self._check_data(self._dml_data)
+        # for did panel data the scores are based on the number of unique ids
+        self._n_obs = obj_dml_data.n_ids
+        self._score_dim = (self._n_obs, self.n_rep, self._dml_data.n_treat)
+        # reinitialize arrays
+        self._initialize_arrays()
+
         g_values = self._dml_data.g_values
         t_values = self._dml_data.t_values

@@ -171,8 +177,7 @@

         # Numeric values for positions of the entries in id_panel_data inside id_original
         # np.nonzero(np.isin(id_original, id_panel_data))
-        self._n_subset = self._panel_data_wide.shape[0]
-        self._n_obs = self._n_subset  # Effective sample size used for resampling
+        self._n_obs_subset = self._panel_data_wide.shape[0]  # Effective sample size used for resampling
         self._n_treated_subset = self._panel_data_wide["G_indicator"].sum()

         # Save x and y for later ML estimation

@@ -192,6 +197,7 @@

         # set stratification for resampling
         self._strata = self._panel_data_wide["G_indicator"]
+        self._n_obs_sample_splitting = self.n_obs_subset
         if draw_sample_splitting:
             self.draw_sample_splitting()

@@ -244,7 +250,7 @@ def __str__(self):
             f"Evaluation period: {str(self.t_value_eval)}\n"
             f"Control group: {str(self.control_group)}\n"
             f"Anticipation periods: {str(self.anticipation_periods)}\n"
-            f"Effective sample size: {str(self.n_obs)}\n"
+            f"Effective sample size: {str(self.n_obs_subset)}\n"
         )
         learner_info = ""
         for key, value in self.learner.items():

@@ -371,11 +377,11 @@ def trimming_threshold(self):
         return self._trimming_threshold

     @property
-    def n_obs(self):
+    def n_obs_subset(self):
         """
         The number of observations used for estimation.
         """
-        return self._n_subset
+        return self._n_obs_subset

     def _initialize_ml_nuisance_params(self):
         if self.score == "observational":

@@ -542,7 +548,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False):
         psi_a, psi_b = self._score_elements(y, d, g_hat0["preds"], g_hat1["preds"], m_hat["preds"], p_hat)

         extend_kwargs = {
-            "n_obs": self._dml_data.n_obs,
+            "n_obs": self._dml_data.n_ids,
             "id_positions": self.id_positions,
         }
         psi_elements = {

@@ -707,13 +713,13 @@ def _sensitivity_element_est(self, preds):
         psi_nu2 = nu2_score_element - nu2

         extend_kwargs = {
-            "n_obs": self._dml_data.n_obs,
+            "n_obs": self._dml_data.n_ids,
             "id_positions": self.id_positions,
             "fill_value": 0.0,
         }

         # add scaling to make variance estimation consistent (sample size difference)
-        scaling = self._dml_data.n_obs / self._n_subset
+        scaling = self._dml_data.n_ids / self._n_obs_subset
         element_dict = {
             "sigma2": sigma2,
             "nu2": nu2,
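For intuition on the scaling line, a small sketch (my reading of the diff, not PR code): the sensitivity elements are computed on the `n_obs_subset` rows actually used and then extended to the full id vector of length `n_ids` with `fill_value=0.0`; averaging the zero-padded vector shrinks means by `n_obs_subset / n_ids`, and multiplying by `n_ids / n_obs_subset` undoes exactly that.

```python
import numpy as np

rng = np.random.default_rng(0)
n_ids, n_obs_subset = 100, 60

psi_subset = rng.normal(size=n_obs_subset)  # score elements on the used subset
id_positions = rng.choice(n_ids, size=n_obs_subset, replace=False)

# extend to the full id vector, padding unused ids with 0.0
psi_full = np.zeros(n_ids)
psi_full[id_positions] = psi_subset

scaling = n_ids / n_obs_subset
# the zero padding shrinks the mean by n_obs_subset / n_ids; scaling restores it
np.testing.assert_allclose(scaling * psi_full.mean(), psi_subset.mean())
```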
doubleml/did/did_cs.py (2 additions, 6 deletions)

@@ -219,14 +219,10 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False):

         # THIS DIFFERS FROM THE PAPER due to stratified splitting this should be the same for each fold
         # nuisance estimates of the uncond. treatment prob.
-        p_hat = np.full_like(d, np.nan, dtype="float64")
-        for train_index, test_index in smpls:
-            p_hat[test_index] = np.mean(d[train_index])
+        p_hat = np.full_like(d, d.mean(), dtype="float64")

         # nuisance estimates of the uncond. time prob.
-        lambda_hat = np.full_like(t, np.nan, dtype="float64")
-        for train_index, test_index in smpls:
-            lambda_hat[test_index] = np.mean(t[train_index])
+        lambda_hat = np.full_like(t, t.mean(), dtype="float64")

         # nuisance g
         smpls_d0_t0, smpls_d0_t1, smpls_d1_t0, smpls_d1_t1 = _get_cond_smpls_2d(smpls, d, t)
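A sanity-check sketch of the simplification (my illustration under the assumption of folds stratified on the treatment indicator, not PR code): every training split then contains almost the same share of treated units, so the per-fold train means computed by the deleted loop agree with the single global mean up to rounding of fold sizes.

```python
import numpy as np
from sklearn.model_selection import StratifiedKFold

rng = np.random.default_rng(0)
d = rng.binomial(1, 0.4, size=1000).astype("float64")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
smpls = list(cv.split(d.reshape(-1, 1), d))

# old behavior: per-fold means of the training split
p_hat_old = np.full_like(d, np.nan, dtype="float64")
for train_index, test_index in smpls:
    p_hat_old[test_index] = np.mean(d[train_index])

# new behavior: one global mean for all observations
p_hat_new = np.full_like(d, d.mean(), dtype="float64")

print(np.abs(p_hat_old - p_hat_new).max())  # tiny, only fold-size rounding
```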