Development #8
@@ -1,9 +1,9 @@
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
import scipy.stats as stats
from sklearn.impute import SimpleImputer
from numba import njit

from selectlfq.utils import get_logger

logger = get_logger()
@@ -51,27 +51,40 @@ def _calculate_variance_distance(data): | |
|
|
||
|
|
||
| @njit | ||
| def _nan_correlation_matrix(data): | ||
| def _nan_correlation_matrix(data: np.ndarray) -> np.ndarray: | ||
| """ | ||
| Calculate the correlation matrix of the data. | ||
|
|
||
| Parameters: | ||
| data: numpy array of shape (n_fragments, n_samples) | ||
| The data to calculate the correlation matrix of. | ||
|
|
||
| Returns: | ||
| numpy array of shape (n_fragments, n_fragments) | ||
| containing the correlation matrix of the data. | ||
| """ | ||
| n = len(data) | ||
| correlation_matrix = np.full((n, n), np.nan) # Initialize with NaN | ||
| correlation_matrix = np.zeros((n, n)) | ||
|
|
||
| for i in range(n): | ||
| for j in range(n): | ||
| if i != j: # Only compute for off-diagonal elements | ||
| mask = np.isfinite(data[i]) & np.isfinite(data[j]) | ||
| if np.sum(mask) > 1: # Ensure there are at least two data points | ||
| xi = data[i][mask] | ||
| xj = data[j][mask] | ||
| std_dev_i = np.std(xi) | ||
| std_dev_j = np.std(xj) | ||
|
|
||
| if (std_dev_i > 0) and (std_dev_j > 0): | ||
| mean_i = np.mean(xi) | ||
| mean_j = np.mean(xj) | ||
| sparsity = np.mean(mask) | ||
| covariance = np.mean((xi - mean_i) * (xj - mean_j)) | ||
| corr = covariance / (std_dev_i * std_dev_j) | ||
| correlation_matrix[i, j] = corr * sparsity | ||
| for j in range(i + 1, n): # Only compute upper triangle | ||
| mask = np.isfinite(data[i]) & np.isfinite(data[j]) | ||
| if np.sum(mask) > 1: | ||
| xi = data[i][mask] | ||
| xj = data[j][mask] | ||
| std_dev_i = np.std(xi) | ||
| std_dev_j = np.std(xj) | ||
|
|
||
| if (std_dev_i > 0) and (std_dev_j > 0): | ||
| mean_i = np.mean(xi) | ||
| mean_j = np.mean(xj) | ||
| sparsity = np.mean(mask) | ||
| covariance = np.mean((xi - mean_i) * (xj - mean_j)) | ||
| corr = covariance / (std_dev_i * std_dev_j) | ||
| correlation_matrix[i, j] = corr * sparsity | ||
| correlation_matrix[j, i] = ( | ||
| corr * sparsity | ||
| ) # Mirror to lower triangle | ||
|
|
||
| return correlation_matrix | ||
|
|
||
|
|
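For intuition, here is a small toy illustration (not part of the diff) of the sparsity-weighted correlation computed by _nan_correlation_matrix, assuming the function above is in scope; the toy data is made up.

import numpy as np

# Toy data: 3 fragments x 4 samples, with one missing value (hypothetical example).
toy = np.array([
    [1.0, 2.0, 3.0, 4.0],
    [2.0, 4.0, 6.0, np.nan],
    [4.0, 3.0, 2.0, 1.0],
])
corr = _nan_correlation_matrix(toy)
# Fragments 0 and 1 correlate perfectly on their 3 shared samples, but the
# sparsity weight (3 of 4 samples finite in both) scales the entry to 0.75;
# fragments 0 and 2 are fully observed and anti-correlated, giving -1.0.
# Note the diagonal keeps its initial value (see the review comment at the end).
print(corr)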
@@ -98,7 +111,7 @@ def _nan_correlation_w_ref(data_ref_pairs: tuple[np.ndarray, np.ndarray]) -> np. | |
| ref = ref[0] | ||
|
Review comment: The return value shape comment is incorrect. Based on the implementation, the method returns an array of shape (n_fragments,), not (n_samples, n_features) as currently documented.

def _nan_correlation_w_ref(data_ref_pairs: tuple[np.ndarray, np.ndarray]) -> np.ndarray:
    """
    Calculate the correlation between each row of data and the reference row.

    Parameters:
        data_ref_pairs: tuple[np.ndarray, np.ndarray]
            A tuple containing two numpy arrays:
            - data: numpy array of shape (n_fragments, n_samples)
            - ref: numpy array of shape (n_samples,) or (1, n_samples)

    Returns:
        numpy array of shape (n_fragments,)
            containing the correlation between each row of data and the reference row.
    """
    data, ref = data_ref_pairs
    if ref.ndim > 1 and ref.shape[0] == 1:
    n = data.shape[0]
    correlation_matrix = np.full(data.shape, np.nan)
    correlation_matrix = np.zeros(data.shape)
    ref_mask = np.isfinite(ref)

    for i in range(n):  # Compute all elements
@@ -137,25 +150,59 @@ def feature_engineering_pipeline_for_unaligned_data(self, data): | |
| """ | ||
| logger.info("Feature engineering pipeline for unaligned data.") | ||
|
|
||
| total = self._feat_eng.parallel_process( | ||
| data, self._feat_eng.feature_engineering, func="sum", n_jobs=10, axis=1 | ||
| ranks = self._feat_eng.parallel_process( | ||
| data, | ||
| self._feat_eng.feature_engineering, | ||
| func="rank_intensity", | ||
| n_jobs=10, | ||
| axis=0, | ||
| ) | ||
| return [total] | ||
| return [ranks] | ||
|
|
||
|
Review comment: Added a proper docstring to the method, including Parameters and Returns sections, to improve code documentation and make the method's purpose and usage clearer.

def feature_engineering_pipeline_for_height_and_ms1_intensity(self, data):
    """
    Feature engineering pipeline for height and MS1 intensity data.

    Parameters
    ----------
    data : array-like
        The input data for feature engineering.

    Returns
    -------
    list
        List of engineered features.
    """
    logger.info("Calculating additional features.")
    mean_corr, std_corr = zip(
        *self._feat_eng.parallel_process(
            data, self._feat_eng.feature_engineering, func="corr", n_jobs=10
        )
    )
    def feature_engineering_pipeline_for_height_and_ms1_intensity(self, data):
        logger.info("Calculating additional features.")
        mean_corr, std_corr = zip(
            *self._feat_eng.parallel_process(
                data, self._feat_eng.feature_engineering, func="corr", n_jobs=10
            )
        )

        return [
            std_corr,
            mean_corr,
        ]

    def feature_engineering_pipeline_for_intensity(self, data):
        """
        Feature engineering pipeline for intensity data.
        """
        logger.info("Calculating mean and std correlation across fragments.")
        logger.info("Calculating additional features.")
        mean_corr, std_corr = zip(
            *self._feat_eng.parallel_process(
                data, self._feat_eng.feature_engineering, func="corr", n_jobs=10
            )
        )
        no_of_datapoints_across_fragments = self._feat_eng.parallel_process(
            data,
            self._feat_eng.feature_engineering,
            func="count_no_of_datapoints",
            n_jobs=10,
            axis=0,
        )

        no_of_datapoints_across_samples = self._feat_eng.parallel_process(
            data,
            self._feat_eng.feature_engineering,
            func="count_no_of_datapoints",
            n_jobs=10,
            axis=1,
        )

        return [
            std_corr,
            mean_corr,
            no_of_datapoints_across_fragments,
            no_of_datapoints_across_samples,
        ]
@@ -184,9 +231,11 @@ def feature_engineering(self, data, func, **kwargs): | |
| "percentile": self._assign_percentiles, | ||
| "mean_distance": _calculate_mean_distance, | ||
| "var_distance": _calculate_variance_distance, | ||
| "corr": self._nan_mean_std_corr_across_fragments, | ||
| "corr": self.nan_mean_std_corr_across_fragments, | ||
| "median_std_offset": self._calculate_median_std_offset, | ||
| "derivative": self._calculate_derivative, | ||
| "count_no_of_datapoints": self._count_no_of_datapoints, | ||
| "weighted_variance": self._calculate_weighted_variance, | ||
| } | ||
|
|
||
| feat_eng = func_mapping[func] | ||
|
|
@@ -200,10 +249,11 @@ def feature_engineering(self, data, func, **kwargs): | |
| "sum", | ||
| "mad", | ||
| "cv", | ||
| # "rank_intensity", | ||
| "L2", | ||
| "sparsity", | ||
| "median_std_offset", | ||
| "count_no_of_datapoints", | ||
| "weighted_variance", | ||
| ]: | ||
| axis = kwargs.get("axis") | ||
|
|
||
|
|
@@ -221,6 +271,12 @@ def _repeater(self, data, function, instance_method, **kwargs): | |
| else: | ||
| return [function(subset, **kwargs) for subset in data] | ||
|
|
||
| def _calculate_weighted_variance(self, data, axis=0): | ||
| return np.nanvar(data, axis=axis) * self._sparsity(data, axis=axis) | ||
|
|
||
| def _count_no_of_datapoints(self, data, axis=0): | ||
| return np.sum(np.isfinite(data), axis=axis) | ||
|
|
||
| def _calculate_median_std_offset(self, data, axis=1): | ||
| stds = np.nanstd(data, axis=axis) | ||
| median_stds = np.nanmedian(stds) | ||
|
|
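The new weighted-variance feature multiplies the NaN-ignoring variance by the output of self._sparsity, which is not shown in this diff. Assuming it returns the fraction of finite values along the given axis (consistent with the sparsity weighting used in _nan_correlation_matrix above), a standalone sketch of the idea, not part of the diff:

import numpy as np

def weighted_variance(data: np.ndarray, axis: int = 0) -> np.ndarray:
    # Fraction of finite (non-NaN) values along the axis, used as a down-weight.
    sparsity = np.mean(np.isfinite(data), axis=axis)
    return np.nanvar(data, axis=axis) * sparsity

toy = np.array([[1.0, 2.0, np.nan], [3.0, 4.0, 5.0]])
print(weighted_variance(toy, axis=1))  # row variances, reduced where values are missing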
@@ -319,7 +375,7 @@ def _growth_decay_rate(self, data, axis=0): | |
| coefs = np.tile(coef, (imputed_data.shape[1], 1)).T | ||
| return coefs | ||
|
|
||
| def _nan_mean_std_corr_across_fragments(self, data): | ||
| def nan_mean_std_corr_across_fragments(self, data): | ||
| mean_corr = self._mean_tile(_nan_correlation_matrix(data), data.shape[1]) | ||
| std_corr = self._std_tile(_nan_correlation_matrix(data), data.shape[1]) | ||
|
|
||
|
|
@@ -331,12 +387,12 @@ def _nan_corrs(self, inputs): | |
| ) | ||
|
|
||
| def _mean_tile(self, data, samples): | ||
| mean = np.nanmean(data, axis=0) | ||
| mean = np.mean(data, axis=0) | ||
| mean = np.tile(mean, (samples, 1)).T | ||
| return mean | ||
|
|
||
| def _std_tile(self, data, samples): | ||
| std_data = np.nanstd(data, axis=0) | ||
| std_data = np.std(data, axis=0) | ||
| std_data = np.tile(std_data, (samples, 1)).T | ||
| return std_data | ||
|
|
||
|
|
@@ -373,3 +429,28 @@ def parallel_process(self, inputs, method, n_jobs=10, **kwargs): | |
| return Parallel(n_jobs=n_jobs)( | ||
| delayed(method)(input, **kwargs) for input in inputs | ||
| ) | ||
|
|
||
| def calculate_ms1_ms2_corr( | ||
| self, | ||
| ms1_data_extracted: list[np.ndarray], | ||
| ms2_data_extracted: list[np.ndarray], | ||
|
Review comment: Added a check to ensure the lengths of ms1_data_extracted and ms2_data_extracted match before the two lists are zipped together.

def calculate_ms1_ms2_corr(
    self,
    ms1_data_extracted: list[np.ndarray],
    ms2_data_extracted: list[np.ndarray],
) -> pd.DataFrame:
    """
    Calculate the correlation between the ms1 and ms2 data.

    Parameters:
        ms1_data_extracted: list[np.ndarray]
            The ms1 data extracted from the precursor data.
        ms2_data_extracted: list[np.ndarray]
            The ms2 data extracted from the precursor data.
    Returns:
        ms1_ms2_corr_data: pd.DataFrame
            The ms1-ms2 correlation data.
    """
    if len(ms1_data_extracted) != len(ms2_data_extracted):
        raise ValueError("MS1 and MS2 data must have the same number of elements")
    zipped_data = zip(ms2_data_extracted, ms1_data_extracted)
    ms1_ms2_corr = self.parallel_process(
        inputs=zipped_data, method=_nan_correlation_w_ref, n_jobs=10
    )
    ms1_ms2_corr = pd.DataFrame(np.vstack(ms1_ms2_corr))
    ) -> pd.DataFrame:
        """
        Calculate the correlation between the ms1 and ms2 data.

        Parameters:
            ms1_data_extracted: list[np.ndarray]
                The ms1 data extracted from the precursor data.
            ms2_data_extracted: list[np.ndarray]
                The ms2 data extracted from the precursor data.
        Returns:
            ms1_ms2_corr_data: pd.DataFrame
                The ms1-ms2 correlation data.
        """

        zipped_data = zip(ms2_data_extracted, ms1_data_extracted)
        ms1_ms2_corr = self.parallel_process(
            inputs=zipped_data, method=_nan_correlation_w_ref, n_jobs=10
        )
        ms1_ms2_corr = pd.DataFrame(np.vstack(ms1_ms2_corr))
        return ms1_ms2_corr
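The length check suggested in the review above matters because zip stops at the shorter input, so a mismatch between the MS1 and MS2 lists would silently drop precursors rather than fail. A minimal illustration with made-up arrays (not part of the diff):

import numpy as np

ms2 = [np.ones((3, 4)), np.ones((2, 4)), np.ones((5, 4))]
ms1 = [np.ones(4), np.ones(4)]  # one element short
pairs = list(zip(ms2, ms1))
print(len(pairs))  # 2 -- the third MS2 block is silently ignored without the check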
Review comment: The correlation matrix should have 1.0 along the diagonal to represent the perfect correlation of each fragment with itself. Currently, the diagonal elements are left as zeros, which is incorrect for correlation matrices.
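A minimal sketch of one way to address this, assuming the zero-initialised matrix returned by _nan_correlation_matrix; the helper name _set_unit_diagonal is hypothetical, and a plain loop is used to stay numba-friendly:

import numpy as np
from numba import njit

@njit
def _set_unit_diagonal(correlation_matrix: np.ndarray) -> np.ndarray:
    # Each fragment correlates perfectly with itself, so the diagonal is 1.0.
    for k in range(correlation_matrix.shape[0]):
        correlation_matrix[k, k] = 1.0
    return correlation_matrix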