-
Notifications
You must be signed in to change notification settings - Fork 34
DSB fixes #165
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DSB fixes #165
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| from typing import Optional, Iterable, Tuple, Union | ||
| from typing import Optional, Iterable, Tuple, Union, Literal | ||
| from numbers import Integral, Real | ||
| from warnings import warn | ||
|
|
||
|
|
@@ -21,6 +21,9 @@ def dsb( | |
| isotype_controls: Optional[Iterable[str]] = None, | ||
| empty_counts_range: Optional[Tuple[Real, Real]] = None, | ||
| cell_counts_range: Optional[Tuple[Real, Real]] = None, | ||
| scale_factor: Literal["standardize", "mean_subtract"] = "standardize", | ||
| quantile_clipping: bool = False, | ||
| quantile_clip: Tuple[float, float] = (0.001, 0.9995), | ||
| add_layer: bool = False, | ||
| random_state: Optional[Union[int, np.random.RandomState, None]] = None, | ||
| ) -> Union[None, MuData]: | ||
|
|
@@ -47,6 +50,11 @@ def dsb( | |
| this specifies the minimum and maximum log10-counts for a droplet to be considered empty. | ||
| cell_counts_range: If ``data_raw`` is ``None``, i.e. ``data`` contains the unfiltered data, | ||
| this specifies the minimum and maximum log10-counts for a droplet to be considered not empty. | ||
| scale_factor: How to perform protein level denoising. `standardize` corresponds to the method | ||
| described as Step I in the paper, i.e. subtracting the mean and dividing by the standard | ||
| deviation. `mean_subtract` subtracts the mean without dividing by the standard deviation. | ||
| quantile_clipping: Whether to clip the final result. Useful if outliers are observied. | ||
| quantile_clip: The quantiles to clip outlier values to. Only used if `quantile_clipping=True`. | ||
| add_layer: Whether to add a ``'dsb'`` layer instead of assigning to the X matrix. | ||
| random_state: Random seed. | ||
|
|
||
|
|
@@ -102,6 +110,13 @@ def dsb( | |
| if pseudocount < 0: | ||
| raise ValueError("pseudocount cannot be negative") | ||
|
|
||
| if quantile_clipping: | ||
| if len(quantile_clip) != 2: | ||
| raise ValueError("quantile_clip must have exactly 2 values") | ||
| quantile_clip = np.asarray(quantile_clip) | ||
| if np.any((quantile_clip < 0) | (quantile_clip > 1)): | ||
| raise ValueError("quantile_clip must be between 0 and 1") | ||
|
|
||
| if cells.shape[1] != empty.shape[1]: # this should only be possible if mudata_raw != None | ||
| raise ValueError("data and data_raw have different numbers of proteins") | ||
|
|
||
|
|
@@ -153,10 +168,15 @@ def dsb( | |
| else np.log(cells.X.toarray() + pseudocount) | ||
| ) | ||
|
|
||
| cells_scaled = (cells_scaled - empty_scaled.mean(axis=0)) / empty_scaled.std(axis=0) | ||
| cells_scaled_dtype = cells_scaled.dtype | ||
| cells_scaled = cells_scaled - empty_scaled.mean(axis=0, dtype=np.float64) | ||
| if scale_factor == "standardize": | ||
| cells_scaled /= empty_scaled.std(axis=0, ddof=1, dtype=np.float64) | ||
| if cells_scaled_dtype.kind == "f": | ||
| cells_scaled = cells_scaled.astype(cells_scaled_dtype, copy=False) | ||
|
|
||
| if denoise_counts: | ||
| bgmeans = np.empty(cells_scaled.shape[0], np.float32) | ||
| bgmeans = np.empty(cells_scaled.shape[0], cells_scaled.dtype) | ||
ilan-gold marked this conversation as resolved.
Show resolved
Hide resolved
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are we making the
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At that point,
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Alternatively, I can move the castback to the very bottom of the function, so we do all our calculations in
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice, ok you're right! |
||
| # init_params needs to be random, otherwise fitted variance for one of the n_components | ||
| # sometimes goes to 0 | ||
| sharedvar = GaussianMixture( | ||
|
|
@@ -191,6 +211,11 @@ def dsb( | |
| reg.fit(covar, cells_scaled) | ||
|
|
||
| cells_scaled -= reg.predict(covar) - reg.intercept_ | ||
|
|
||
| if quantile_clipping: | ||
| quantiles = np.quantile(cells_scaled, quantile_clip) | ||
| np.clip(cells_scaled, a_min=quantiles.min(), a_max=quantiles.max(), out=cells_scaled) | ||
|
|
||
| if add_layer: | ||
| cells.layers["dsb"] = cells_scaled | ||
| else: | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.