Skip to content

Commit

Permalink
Merge branch 'blrt' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
sachaMorin committed Dec 22, 2023
2 parents a65fec2 + 1b5e814 commit ec9824d
Show file tree
Hide file tree
Showing 5 changed files with 324 additions and 37 deletions.
205 changes: 184 additions & 21 deletions stepmix/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import itertools
import pandas as pd
import warnings
import copy

import numpy as np
import tqdm
Expand Down Expand Up @@ -43,13 +44,23 @@ def find_best_permutation(reference, target, criterion=mse):


def bootstrap(
estimator, X, Y=None, n_repetitions=1000, sample_weight=None, progress_bar=True
estimator,
X,
Y=None,
n_repetitions=1000,
sample_weight=None,
parametric=False,
sampler=None,
identify_classes=True,
progress_bar=True,
random_state=None,
):
"""Non-parametric boostrap of StepMix estimator.
"""Parametric or Non-parametric boostrap of a StepMix estimator.
Fit the estimator on X,Y then fit n_repetitions on resampled datasets.
Fit n_repetitions clones of the estimator on resampled datasets.
Repetition parameters are aligned with the class order of the main estimator.
If identify_classes=True, repeated parameter estimates are aligned with the class order of the main estimator using
a permutation search.
Parameters
----------
Expand All @@ -60,17 +71,40 @@ def bootstrap(
n_repetitions: int
Number of repetitions to fit.
sample_weight : array-like of shape(n_samples,), default=None
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight. Ignored if parametric=True.
parametric: bool, default=False
Use parametric bootstrap instead of non-parametric. Data will be generated by sampling the estimator.
sampler: bool, default=None
Another fitted estimator to use for sampling instead of the main estimator. Only used for parametric
bootstrapping.
identify_classes: bool, default=True
Run a permutation test to align the classes of the repetitions to the classes of the main estimator. This is
required if inference on the model parameters is needed, but can be turned off if only the likelihood needs
to be bootstrapped to save computations.
progress_bar : bool, default=True
Display a tqdm progress bar for repetitions.
random_state : int, default=None
Returns
----------
samples: DataFrame
DataFrame of all repetitions. Follows the convention of StepMix.get_parameters_df() with an additional
'rep' column.
rep_stats: DataFrame
Likelihood statistics of each repetition.
'rep' column. None if identy_classes=False.
stats: DataFrame
Various statistics of bootstrapped estimators.
"""
check_is_fitted(estimator)
estimator = copy.deepcopy(estimator)
estimator.set_params(random_state=random_state)

if sampler is not None:
check_is_fitted(sampler)
sampler = copy.deepcopy(sampler)
sampler.set_params(random_state=random_state)

n_samples = X.shape[0]
x_names = estimator.x_names_
y_names = estimator.y_names_ if hasattr(estimator, "y_names_") else None
Expand All @@ -83,9 +117,9 @@ def bootstrap(
ref_class_probabilities = estimator.predict_proba(X, Y)

# Raise warning if trying to permute too many columns
if estimator.n_components > 6:
if identify_classes and estimator.n_components > 6:
warnings.warn(
"Bootstrapping requires permuting latent classes. Permuting latent classes with n_components > 6 may be slow."
"Bootstrapping with identfy_classes=True requires permuting latent classes. Permuting latent classes with n_components > 6 may be slow."
)

# Now fit n_repetitions estimator with resampling and save parameters
Expand All @@ -102,27 +136,37 @@ def bootstrap(
)
for rep in tqdm_rep:
# Resample data
rep_samples = rng.choice(n_samples, size=(n_samples,), replace=True)
X_rep = X[rep_samples]
Y_rep = Y[rep_samples] if Y is not None else None
sample_weight_rep = (
sample_weight[rep_samples] if sample_weight is not None else None
)
if parametric and sampler is not None:
X_rep, Y_rep, _ = sampler.sample(n_samples)
sample_weight_rep = None
elif parametric:
X_rep, Y_rep, _ = estimator.sample(n_samples)
sample_weight_rep = None
else:
# Sample with replacement
rep_samples = rng.choice(n_samples, size=(n_samples,), replace=True)
X_rep = X[rep_samples]
Y_rep = Y[rep_samples] if Y is not None else None
sample_weight_rep = (
sample_weight[rep_samples] if sample_weight is not None else None
)

# Fit estimator on resample data
estimator_rep = clone(estimator)

# Disable printing for repeated estimators and fit
estimator_rep.verbose = 0
estimator_rep.progress_bar = 0
estimator_rep.set_params(verbose=0, progress_bar=0, random_state=random_state)
estimator_rep.fit(X_rep, Y_rep, sample_weight=sample_weight_rep)

# Class ordering may be different. Reorder based on best permutation of class probabilites
rep_class_probabilities = estimator_rep.predict_proba(
X, Y
) # Inference on original samples
perm = find_best_permutation(ref_class_probabilities, rep_class_probabilities)
estimator_rep.permute_classes(perm)
if identify_classes:
rep_class_probabilities = estimator_rep.predict_proba(
X, Y
) # Inference on original samples
perm = find_best_permutation(
ref_class_probabilities, rep_class_probabilities
)
estimator_rep.permute_classes(perm)

# Save parameters
df_i = estimator_rep.get_parameters_df(x_names, y_names)
Expand All @@ -148,10 +192,129 @@ def bootstrap(
max_LL=np.max(ll_buffer),
)

return_df = pd.concat(parameters)
return_df.sort_index(inplace=True)
if identify_classes:
return_df = pd.concat(parameters)
return_df.sort_index(inplace=True)
else:
# Do not return parameters if classes are not identified
return_df = None

# Add likelihoods statistics
stats = {"LL": np.array(ll_buffer), "avg_LL": np.array(avg_ll_buffer)}

return return_df, pd.DataFrame.from_dict(stats)


def blrt(null_model, alternative_model, X, Y=None, n_repetitions=30, random_state=42):
"""BLRT Test
References
----------
Dziak, John J., Stephanie T. Lanza, and Xianming Tan. "Effect size, statistical power, and sample size requirements for the bootstrap likelihood ratio test in latent class analysis." Structural equation modeling: a multidisciplinary journal 21.4 (2014): 534-552.
Nylund, Karen L., Tihomir Asparouhov, and Bengt O. Muthén. "Deciding on the number of classes in latent class analysis and growth mixture modeling: A Monte Carlo simulation study." Structural equation modeling: A multidisciplinary Journal 14.4 (2007): 535-569.
Parameters
----------
null_model : StepMix instance
A StepMix model with k classes.
alternative_model : StepMix instance
A StepMix model with k + 1 classes.
X : array-like of shape (n_samples, n_features)
Y : array-like of shape (n_samples, n_features_structural), default=None
n_repetitions: int
Number of repetitions to fit.
random_state : int, default=None
Returns
----------
p-value: float
Bootstrap p-value of the BLRT test. A significant test indicates the alternative k + 1 model provides a
significantly better fit of the data.
"""
n_samples = X.shape[0]

# Fit both models on real data
null_model.fit(X, Y)
alternative_model.fit(X, Y)
real_stat = 2 * (alternative_model.score(X, Y) - null_model.score(X, Y)) * n_samples

# Bootstrap null model
print("Bootstrapping null model...")
_, stats_null = bootstrap(
null_model,
X,
Y,
n_repetitions=n_repetitions,
identify_classes=False,
sampler=null_model,
random_state=random_state,
parametric=True,
)
print("\nBootstrapping alternative model...")
_, stats_alternative = bootstrap(
alternative_model,
X,
Y,
n_repetitions=n_repetitions,
identify_classes=False,
sampler=null_model,
random_state=random_state,
parametric=True,
)
gen_stats = 2 * (stats_alternative["LL"] - stats_null["LL"])
b = np.sum(gen_stats > real_stat)

return b/n_repetitions


def blrt_sweep(model, X, Y=None, low=1, high=5, n_repetitions=30, random_state=42, verbose=True):
"""Sweep BLRT Test
Run BLRT test for a range of number of classes. For example, if you set low=1 and high=4, the function
will return the result of 3 tests [1 vs 2, 2 vs 3, 3 vs 4].
Parameters
----------
model : StepMix instance
A StepMix model.
X : array-like of shape (n_samples, n_features)
Y : array-like of shape (n_samples, n_features_structural), default=None
low: int, default=1
Minimum number of classes to test.
high: int, default=5
Maximum number of classes to test.
n_repetitions: int
Number of repetitions to fit.
random_state : int, default=None
verbose : bool, default=True
Returns
----------
p-values: List of length high - low
Bootstrap p-values of the BLRT test for each number of classes.
"""
test_string = list()
p_values = list()
for k in range(low, high):
print(f"Testing {k} vs. {k + 1} classes...")
null_model = clone(model)
null_model.set_params(n_components=k)
alternative_model = clone(model)
alternative_model.set_params(n_components=k + 1)
p_values.append(
blrt(
null_model,
alternative_model,
X,
Y=Y,
n_repetitions=n_repetitions,
random_state=random_state,
)
)
test_string.append(f"{k} vs. {k + 1} classes")

if verbose:
df = pd.DataFrame({"Test": test_string, "p-value": p_values}).set_index('Test')
print('\nBLRT Sweep Results')
print(df.round(4))
return p_values
7 changes: 2 additions & 5 deletions stepmix/emission/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,8 @@ def sample(self, class_no, n_samples):
for k in range(n_features)
]
)
X = np.reshape(
np.swapaxes(X, 0, 1),
(n_samples, n_features * self.parameters["max_n_outcomes"]),
)
return X
X = np.argmax(X, axis=2) # Convert to integers
return X.T

@property
def n_parameters(self):
Expand Down
2 changes: 1 addition & 1 deletion stepmix/emission/covariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def predict(self, X):
return prob.argmax(axis=1)

def sample(self, class_no, n_samples):
raise NotImplementedError
raise NotImplementedError("Covariate models are not generative. Sampling not supported.")

@property
def n_parameters(self):
Expand Down
54 changes: 45 additions & 9 deletions stepmix/stepmix.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,11 +327,11 @@ def _check_initial_parameters(self, X):
self.param_buffer_ = list()

# Covariate models have special constraints. Check them.
is_covariate = utils.check_covariate(self.measurement, self.structural)
self._is_covariate = utils.check_covariate(self.measurement, self.structural)

# Covariate models use a different conditional likelihood (See Bakk and Kuha, 2018), which should
# not include the marginal likelihood over the latent classes in the E-step
self._conditional_likelihood = is_covariate
self._conditional_likelihood = self._is_covariate

def _initialize_parameters(self, X, random_state):
"""Initialize the weights and measurement model parameters.
Expand Down Expand Up @@ -1082,23 +1082,46 @@ def m_step_structural(self, resp, Y, sample_weight=None):
self._sm.m_step(Y, resp * sample_weight[:, np.newaxis])

def bootstrap(
self, X, Y=None, n_repetitions=1000, sample_weight=None, progress_bar=True
self,
X,
Y=None,
n_repetitions=1000,
sample_weight=None,
parametric=False,
sampler=None,
identify_classes=True,
progress_bar=True,
random_state=None,
):
"""Non-parametric boostrap of StepMix estimator.
"""Parametric or Non-parametric boostrap of this estimator.
Fit the estimator on X,Y then fit n_repetitions on resampled datasets.
Fit n_repetitions clones of the estimator on resampled datasets.
Repetition parameters are aligned with the class order of the main estimator.
If identify_classes=True, repeated parameter estimates are aligned with the class order of the main estimator using
a permutation search.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Y : array-like of shape (n_samples, n_features_structural), default=None
sample_weight : array-like of shape(n_samples,), default=None
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight. Ignored if parametric=True.
n_repetitions: int
Number of repetitions to fit.
progress_bar : bool, default=True
parametric: bool, default=False
Use parametric bootstrap instead of non-parametric. Data will be generated by sampling the estimator.
sampler: bool, default=None
Another fitted estimator to use for sampling instead of the main estimator. Only used for parametric
bootstrapping.
identify_classes: bool, default=True
Run a permutation test to align the classes of the repetitions to the classes of the main estimator. This is
required if inference on the model parameters is needed, but can be turned off if only the likelihood needs
to be bootstrapped to save computations.
progress_bar : bool, default=True
Display a tqdm progress bar for repetitions.
random_state : int, default=None
If none, use self.random_state.
Returns
----------
samples: DataFrame
Expand All @@ -1108,8 +1131,21 @@ def bootstrap(
Likelihood statistics of each repetition.
"""
check_is_fitted(self)

return bootstrap(self, X, Y, n_repetitions, sample_weight, progress_bar)
if random_state is None:
random_state = self.random_state

return bootstrap(
self,
X,
Y,
n_repetitions=n_repetitions,
sample_weight=sample_weight,
parametric=parametric,
sampler=sampler,
identify_classes=identify_classes,
progress_bar=progress_bar,
random_state=random_state,
)

def bootstrap_stats(
self, X, Y=None, n_repetitions=1000, sample_weight=None, progress_bar=True
Expand Down
Loading

0 comments on commit ec9824d

Please sign in to comment.