Merge branch 'blrt' into dev

Labo-Lacourse · Dec 22, 2023 · ec9824d · ec9824d
2 parents a65fec2 + 1b5e814
commit ec9824d
Show file tree

Hide file tree

Showing 5 changed files with 324 additions and 37 deletions.
diff --git a/stepmix/bootstrap.py b/stepmix/bootstrap.py
@@ -2,6 +2,7 @@
 import itertools
 import pandas as pd
 import warnings
+import copy
 
 import numpy as np
 import tqdm
@@ -43,13 +44,23 @@ def find_best_permutation(reference, target, criterion=mse):
 
 
 def bootstrap(
-    estimator, X, Y=None, n_repetitions=1000, sample_weight=None, progress_bar=True
+    estimator,
+    X,
+    Y=None,
+    n_repetitions=1000,
+    sample_weight=None,
+    parametric=False,
+    sampler=None,
+    identify_classes=True,
+    progress_bar=True,
+    random_state=None,
 ):
-    """Non-parametric boostrap of StepMix estimator.
+    """Parametric or Non-parametric boostrap of a StepMix estimator.
 
-    Fit the estimator on X,Y then fit n_repetitions on resampled datasets.
+    Fit n_repetitions clones of the estimator on resampled datasets.
 
-    Repetition parameters are aligned with the class order of the main estimator.
+    If identify_classes=True, repeated parameter estimates are aligned with the class order of the main estimator using
+    a permutation search.
 
     Parameters
     ----------
@@ -60,17 +71,40 @@ def bootstrap(
     n_repetitions: int
         Number of repetitions to fit.
     sample_weight : array-like of shape(n_samples,), default=None
+        Array of weights that are assigned to individual samples.
+        If not provided, then each sample is given unit weight. Ignored if parametric=True.
+    parametric: bool, default=False
+        Use parametric bootstrap instead of non-parametric. Data will be generated by sampling the estimator.
+    sampler: bool, default=None
+        Another fitted estimator to use for sampling instead of the main estimator. Only used for parametric
+        bootstrapping.
+    identify_classes: bool, default=True
+        Run a permutation test to align the classes of the repetitions to the classes of the main estimator. This is
+        required if inference on the model parameters is needed, but can be turned off if only the likelihood needs
+        to be bootstrapped to save computations.
     progress_bar : bool, default=True
         Display a tqdm progress bar for repetitions.
+    random_state : int, default=None
     Returns
     ----------
     samples: DataFrame
         DataFrame of all repetitions. Follows the convention of StepMix.get_parameters_df() with an additional
         'rep' column.
     rep_stats: DataFrame
         Likelihood statistics of each repetition.
+        'rep' column. None if identy_classes=False.
+    stats: DataFrame
+        Various statistics of bootstrapped estimators.
     """
     check_is_fitted(estimator)
+    estimator = copy.deepcopy(estimator)
+    estimator.set_params(random_state=random_state)
+
+    if sampler is not None:
+        check_is_fitted(sampler)
+        sampler = copy.deepcopy(sampler)
+        sampler.set_params(random_state=random_state)
+
     n_samples = X.shape[0]
     x_names = estimator.x_names_
     y_names = estimator.y_names_ if hasattr(estimator, "y_names_") else None
@@ -83,9 +117,9 @@ def bootstrap(
     ref_class_probabilities = estimator.predict_proba(X, Y)
 
     # Raise warning if trying to permute too many columns
-    if estimator.n_components > 6:
+    if identify_classes and estimator.n_components > 6:
         warnings.warn(
-            "Bootstrapping requires permuting latent classes. Permuting latent classes with n_components > 6 may be slow."
+            "Bootstrapping with identfy_classes=True requires permuting latent classes. Permuting latent classes with n_components > 6 may be slow."
         )
 
     # Now fit n_repetitions estimator with resampling and save parameters
@@ -102,27 +136,37 @@ def bootstrap(
     )
     for rep in tqdm_rep:
         # Resample data
-        rep_samples = rng.choice(n_samples, size=(n_samples,), replace=True)
-        X_rep = X[rep_samples]
-        Y_rep = Y[rep_samples] if Y is not None else None
-        sample_weight_rep = (
-            sample_weight[rep_samples] if sample_weight is not None else None
-        )
+        if parametric and sampler is not None:
+            X_rep, Y_rep, _ = sampler.sample(n_samples)
+            sample_weight_rep = None
+        elif parametric:
+            X_rep, Y_rep, _ = estimator.sample(n_samples)
+            sample_weight_rep = None
+        else:
+            # Sample with replacement
+            rep_samples = rng.choice(n_samples, size=(n_samples,), replace=True)
+            X_rep = X[rep_samples]
+            Y_rep = Y[rep_samples] if Y is not None else None
+            sample_weight_rep = (
+                sample_weight[rep_samples] if sample_weight is not None else None
+            )
 
         # Fit estimator on resample data
         estimator_rep = clone(estimator)
 
         # Disable printing for repeated estimators and fit
-        estimator_rep.verbose = 0
-        estimator_rep.progress_bar = 0
+        estimator_rep.set_params(verbose=0, progress_bar=0, random_state=random_state)
         estimator_rep.fit(X_rep, Y_rep, sample_weight=sample_weight_rep)
 
         # Class ordering may be different. Reorder based on best permutation of class probabilites
-        rep_class_probabilities = estimator_rep.predict_proba(
-            X, Y
-        )  # Inference on original samples
-        perm = find_best_permutation(ref_class_probabilities, rep_class_probabilities)
-        estimator_rep.permute_classes(perm)
+        if identify_classes:
+            rep_class_probabilities = estimator_rep.predict_proba(
+                X, Y
+            )  # Inference on original samples
+            perm = find_best_permutation(
+                ref_class_probabilities, rep_class_probabilities
+            )
+            estimator_rep.permute_classes(perm)
 
         # Save parameters
         df_i = estimator_rep.get_parameters_df(x_names, y_names)
@@ -148,10 +192,129 @@ def bootstrap(
             max_LL=np.max(ll_buffer),
         )
 
-    return_df = pd.concat(parameters)
-    return_df.sort_index(inplace=True)
+    if identify_classes:
+        return_df = pd.concat(parameters)
+        return_df.sort_index(inplace=True)
+    else:
+        # Do not return parameters if classes are not identified
+        return_df = None
 
     # Add likelihoods statistics
     stats = {"LL": np.array(ll_buffer), "avg_LL": np.array(avg_ll_buffer)}
 
     return return_df, pd.DataFrame.from_dict(stats)
+
+
+def blrt(null_model, alternative_model, X, Y=None, n_repetitions=30, random_state=42):
+    """BLRT Test
+
+    References
+    ----------
+    Dziak, John J., Stephanie T. Lanza, and Xianming Tan. "Effect size, statistical power, and sample size requirements for the bootstrap likelihood ratio test in latent class analysis." Structural equation modeling: a multidisciplinary journal 21.4 (2014): 534-552.
+    Nylund, Karen L., Tihomir Asparouhov, and Bengt O. Muthén. "Deciding on the number of classes in latent class analysis and growth mixture modeling: A Monte Carlo simulation study." Structural equation modeling: A multidisciplinary Journal 14.4 (2007): 535-569.
+
+    Parameters
+    ----------
+    null_model : StepMix instance
+        A StepMix model with k classes.
+    alternative_model : StepMix instance
+        A StepMix model with k + 1 classes.
+    X : array-like of shape (n_samples, n_features)
+    Y : array-like of shape (n_samples, n_features_structural), default=None
+    n_repetitions: int
+        Number of repetitions to fit.
+    random_state : int, default=None
+
+    Returns
+    ----------
+    p-value: float
+        Bootstrap p-value of the BLRT test. A significant test indicates the alternative k + 1 model provides a
+        significantly better fit of the data.
+    """
+    n_samples = X.shape[0]
+
+    # Fit both models on real data
+    null_model.fit(X, Y)
+    alternative_model.fit(X, Y)
+    real_stat = 2 * (alternative_model.score(X, Y) - null_model.score(X, Y)) * n_samples
+
+    # Bootstrap null model
+    print("Bootstrapping null model...")
+    _, stats_null = bootstrap(
+        null_model,
+        X,
+        Y,
+        n_repetitions=n_repetitions,
+        identify_classes=False,
+        sampler=null_model,
+        random_state=random_state,
+        parametric=True,
+    )
+    print("\nBootstrapping alternative model...")
+    _, stats_alternative = bootstrap(
+        alternative_model,
+        X,
+        Y,
+        n_repetitions=n_repetitions,
+        identify_classes=False,
+        sampler=null_model,
+        random_state=random_state,
+        parametric=True,
+    )
+    gen_stats = 2 * (stats_alternative["LL"] - stats_null["LL"])
+    b = np.sum(gen_stats > real_stat)
+
+    return b/n_repetitions
+
+
+def blrt_sweep(model, X, Y=None, low=1, high=5, n_repetitions=30, random_state=42, verbose=True):
+    """Sweep BLRT Test
+
+    Run BLRT test for a range of number of classes. For example, if you set low=1 and high=4, the function
+    will return the result of 3 tests [1 vs 2, 2 vs 3, 3 vs 4].
+
+    Parameters
+    ----------
+    model : StepMix instance
+        A StepMix model.
+    X : array-like of shape (n_samples, n_features)
+    Y : array-like of shape (n_samples, n_features_structural), default=None
+    low: int, default=1
+        Minimum number of classes to test.
+    high: int, default=5
+        Maximum number of classes to test.
+    n_repetitions: int
+        Number of repetitions to fit.
+    random_state : int, default=None
+    verbose : bool, default=True
+
+    Returns
+    ----------
+    p-values: List of length high - low
+        Bootstrap p-values of the BLRT test for each number of classes.
+    """
+    test_string = list()
+    p_values = list()
+    for k in range(low, high):
+        print(f"Testing {k} vs. {k + 1} classes...")
+        null_model = clone(model)
+        null_model.set_params(n_components=k)
+        alternative_model = clone(model)
+        alternative_model.set_params(n_components=k + 1)
+        p_values.append(
+            blrt(
+                null_model,
+                alternative_model,
+                X,
+                Y=Y,
+                n_repetitions=n_repetitions,
+                random_state=random_state,
+            )
+        )
+        test_string.append(f"{k} vs. {k + 1} classes")
+
+    if verbose:
+        df = pd.DataFrame({"Test": test_string, "p-value": p_values}).set_index('Test')
+        print('\nBLRT Sweep Results')
+        print(df.round(4))
+    return p_values
diff --git a/stepmix/emission/categorical.py b/stepmix/emission/categorical.py
@@ -181,11 +181,8 @@ def sample(self, class_no, n_samples):
                 for k in range(n_features)
             ]
         )
-        X = np.reshape(
-            np.swapaxes(X, 0, 1),
-            (n_samples, n_features * self.parameters["max_n_outcomes"]),
-        )
-        return X
+        X = np.argmax(X, axis=2) # Convert to integers
+        return X.T
 
     @property
     def n_parameters(self):

diff --git a/stepmix/emission/covariate.py b/stepmix/emission/covariate.py
@@ -126,7 +126,7 @@ def predict(self, X):
         return prob.argmax(axis=1)
 
     def sample(self, class_no, n_samples):
-        raise NotImplementedError
+        raise NotImplementedError("Covariate models are not generative. Sampling not supported.")
 
     @property
     def n_parameters(self):

diff --git a/stepmix/stepmix.py b/stepmix/stepmix.py
@@ -327,11 +327,11 @@ def _check_initial_parameters(self, X):
         self.param_buffer_ = list()
 
         # Covariate models have special constraints. Check them.
-        is_covariate = utils.check_covariate(self.measurement, self.structural)
+        self._is_covariate = utils.check_covariate(self.measurement, self.structural)
 
         # Covariate models use a different conditional likelihood (See Bakk and Kuha, 2018), which should
         # not include the marginal likelihood over the latent classes in the E-step
-        self._conditional_likelihood = is_covariate
+        self._conditional_likelihood = self._is_covariate
 
     def _initialize_parameters(self, X, random_state):
         """Initialize the weights and measurement model parameters.
@@ -1082,23 +1082,46 @@ def m_step_structural(self, resp, Y, sample_weight=None):
         self._sm.m_step(Y, resp * sample_weight[:, np.newaxis])
 
     def bootstrap(
-        self, X, Y=None, n_repetitions=1000, sample_weight=None, progress_bar=True
+        self,
+        X,
+        Y=None,
+        n_repetitions=1000,
+        sample_weight=None,
+        parametric=False,
+        sampler=None,
+        identify_classes=True,
+        progress_bar=True,
+        random_state=None,
     ):
-        """Non-parametric boostrap of StepMix estimator.
+        """Parametric or Non-parametric boostrap of this estimator.
 
-        Fit the estimator on X,Y then fit n_repetitions on resampled datasets.
+        Fit n_repetitions clones of the estimator on resampled datasets.
 
-        Repetition parameters are aligned with the class order of the main estimator.
+        If identify_classes=True, repeated parameter estimates are aligned with the class order of the main estimator using
+        a permutation search.
 
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
         Y : array-like of shape (n_samples, n_features_structural), default=None
         sample_weight : array-like of shape(n_samples,), default=None
+            Array of weights that are assigned to individual samples.
+            If not provided, then each sample is given unit weight. Ignored if parametric=True.
         n_repetitions: int
             Number of repetitions to fit.
-        progress_bar : bool, default=True
+        parametric: bool, default=False
+            Use parametric bootstrap instead of non-parametric. Data will be generated by sampling the estimator.
+        sampler: bool, default=None
+            Another fitted estimator to use for sampling instead of the main estimator. Only used for parametric
+            bootstrapping.
+        identify_classes: bool, default=True
+            Run a permutation test to align the classes of the repetitions to the classes of the main estimator. This is
+            required if inference on the model parameters is needed, but can be turned off if only the likelihood needs
+            to be bootstrapped to save computations.
+            progress_bar : bool, default=True
             Display a tqdm progress bar for repetitions.
+        random_state : int, default=None
+            If none, use self.random_state.
         Returns
         ----------
         samples: DataFrame
@@ -1108,8 +1131,21 @@ def bootstrap(
             Likelihood statistics of each repetition.
         """
         check_is_fitted(self)
-
-        return bootstrap(self, X, Y, n_repetitions, sample_weight, progress_bar)
+        if random_state is None:
+            random_state = self.random_state
+
+        return bootstrap(
+            self,
+            X,
+            Y,
+            n_repetitions=n_repetitions,
+            sample_weight=sample_weight,
+            parametric=parametric,
+            sampler=sampler,
+            identify_classes=identify_classes,
+            progress_bar=progress_bar,
+            random_state=random_state,
+        )
 
     def bootstrap_stats(
         self, X, Y=None, n_repetitions=1000, sample_weight=None, progress_bar=True