Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 110 additions & 29 deletions selectlfq/featureengineering.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
import scipy.stats as stats
from sklearn.impute import SimpleImputer
from numba import njit

from selectlfq.utils import get_logger

logger = get_logger()
Expand Down Expand Up @@ -51,27 +51,40 @@ def _calculate_variance_distance(data):


@njit

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The correlation matrix should have 1.0 along the diagonal to represent the perfect correlation of each fragment with itself. Currently, the diagonal elements are left as zeros, which is incorrect for a correlation matrix.

@njit
def _nan_correlation_matrix(data: np.ndarray) -> np.ndarray:
    """
    Calculate the correlation matrix of the data.

    Parameters:
        data: numpy array of shape (n_fragments, n_samples)
            The data to calculate the correlation matrix of.

    Returns:
        numpy array of shape (n_fragments, n_fragments)
        containing the correlation matrix of the data.
    """
    n = len(data)
    correlation_matrix = np.zeros((n, n))

    for i in range(n):
        correlation_matrix[i, i] = 1.0  # Set diagonal to 1.0 (perfect self-correlation)
        for j in range(i + 1, n):  # Only compute upper triangle
            mask = np.isfinite(data[i]) & np.isfinite(data[j])
            if np.sum(mask) > 1:
                xi = data[i][mask]
                xj = data[j][mask]

def _nan_correlation_matrix(data):
def _nan_correlation_matrix(data: np.ndarray) -> np.ndarray:
"""
Calculate the correlation matrix of the data.

Parameters:
data: numpy array of shape (n_fragments, n_samples)
The data to calculate the correlation matrix of.

Returns:
numpy array of shape (n_fragments, n_fragments)
containing the correlation matrix of the data.
"""
n = len(data)
correlation_matrix = np.full((n, n), np.nan) # Initialize with NaN
correlation_matrix = np.zeros((n, n))

for i in range(n):
for j in range(n):
if i != j: # Only compute for off-diagonal elements
mask = np.isfinite(data[i]) & np.isfinite(data[j])
if np.sum(mask) > 1: # Ensure there are at least two data points
xi = data[i][mask]
xj = data[j][mask]
std_dev_i = np.std(xi)
std_dev_j = np.std(xj)

if (std_dev_i > 0) and (std_dev_j > 0):
mean_i = np.mean(xi)
mean_j = np.mean(xj)
sparsity = np.mean(mask)
covariance = np.mean((xi - mean_i) * (xj - mean_j))
corr = covariance / (std_dev_i * std_dev_j)
correlation_matrix[i, j] = corr * sparsity
for j in range(i + 1, n): # Only compute upper triangle
mask = np.isfinite(data[i]) & np.isfinite(data[j])
if np.sum(mask) > 1:
xi = data[i][mask]
xj = data[j][mask]
std_dev_i = np.std(xi)
std_dev_j = np.std(xj)

if (std_dev_i > 0) and (std_dev_j > 0):
mean_i = np.mean(xi)
mean_j = np.mean(xj)
sparsity = np.mean(mask)
covariance = np.mean((xi - mean_i) * (xj - mean_j))
corr = covariance / (std_dev_i * std_dev_j)
correlation_matrix[i, j] = corr * sparsity
correlation_matrix[j, i] = (
corr * sparsity
) # Mirror to lower triangle

return correlation_matrix

Expand All @@ -98,7 +111,7 @@ def _nan_correlation_w_ref(data_ref_pairs: tuple[np.ndarray, np.ndarray]) -> np.
ref = ref[0]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return value shape comment is incorrect. Based on the implementation, the method returns an array of shape (n_fragments,) not (n_samples, n_features) as currently documented.

def _nan_correlation_w_ref(data_ref_pairs: tuple[np.ndarray, np.ndarray]) -> np.ndarray:
    """
    Calculate the correlation between each row of data and the reference row.

    Parameters:
        data_ref_pairs: tuple[np.ndarray, np.ndarray]
            A tuple containing two numpy arrays:
            - data: numpy array of shape (n_fragments, n_samples)
            - ref: numpy array of shape (n_samples,) or (1, n_samples)

    Returns:
        numpy array of shape (n_fragments,)
        containing the correlation between each row of data and the reference row.
    """

    data, ref = data_ref_pairs

    if ref.ndim > 1 and ref.shape[0] == 1:


n = data.shape[0]
correlation_matrix = np.full(data.shape, np.nan)
correlation_matrix = np.zeros(data.shape)
ref_mask = np.isfinite(ref)

for i in range(n): # Compute all elements
def feature_engineering_pipeline_for_unaligned_data(self, data):
    """
    Feature engineering pipeline for unaligned data.

    Parameters:
        data: passed through to ``parallel_process`` — presumably an
            iterable of per-precursor arrays (TODO confirm with caller).

    Returns:
        A single-element list containing the intensity ranks computed
        across fragments (axis=0).
    """
    logger.info("Feature engineering pipeline for unaligned data.")

    rank_kwargs = {"func": "rank_intensity", "n_jobs": 10, "axis": 0}
    ranks = self._feat_eng.parallel_process(
        data, self._feat_eng.feature_engineering, **rank_kwargs
    )
    return [ranks]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a proper docstring to the method, including Parameters and Returns sections, to improve code documentation and make the method's purpose and usage clearer.

def feature_engineering_pipeline_for_height_and_ms1_intensity(self, data):
    """
    Feature engineering pipeline for height and MS1 intensity data.

    Parameters
    ----------
    data : array-like
        The input data for feature engineering.

    Returns
    -------
    list
        ``[std_corr, mean_corr]`` — the std and mean of the NaN-aware
        correlations across fragments.
    """
    logger.info("Calculating additional features.")

    corr_results = self._feat_eng.parallel_process(
        data, self._feat_eng.feature_engineering, func="corr", n_jobs=10
    )
    mean_corr, std_corr = zip(*corr_results)

    return [std_corr, mean_corr]

def feature_engineering_pipeline_for_intensity(self, data):
    """
    Feature engineering pipeline for intensity data.

    Computes the mean/std correlation across fragments plus the number
    of finite data points along both axes.

    Parameters
    ----------
    data : array-like
        The input data for feature engineering.

    Returns
    -------
    list
        ``[std_corr, mean_corr, counts_across_fragments (axis=0),
        counts_across_samples (axis=1)]``.
    """
    logger.info("Calculating additional features.")

    corr_results = self._feat_eng.parallel_process(
        data, self._feat_eng.feature_engineering, func="corr", n_jobs=10
    )
    mean_corr, std_corr = zip(*corr_results)

    # Count finite data points along each axis in turn.
    datapoint_counts = [
        self._feat_eng.parallel_process(
            data,
            self._feat_eng.feature_engineering,
            func="count_no_of_datapoints",
            n_jobs=10,
            axis=axis,
        )
        for axis in (0, 1)
    ]

    return [std_corr, mean_corr, datapoint_counts[0], datapoint_counts[1]]


Expand Down Expand Up @@ -184,9 +231,11 @@ def feature_engineering(self, data, func, **kwargs):
"percentile": self._assign_percentiles,
"mean_distance": _calculate_mean_distance,
"var_distance": _calculate_variance_distance,
"corr": self._nan_mean_std_corr_across_fragments,
"corr": self.nan_mean_std_corr_across_fragments,
"median_std_offset": self._calculate_median_std_offset,
"derivative": self._calculate_derivative,
"count_no_of_datapoints": self._count_no_of_datapoints,
"weighted_variance": self._calculate_weighted_variance,
}

feat_eng = func_mapping[func]
Expand All @@ -200,10 +249,11 @@ def feature_engineering(self, data, func, **kwargs):
"sum",
"mad",
"cv",
# "rank_intensity",
"L2",
"sparsity",
"median_std_offset",
"count_no_of_datapoints",
"weighted_variance",
]:
axis = kwargs.get("axis")

Expand All @@ -221,6 +271,12 @@ def _repeater(self, data, function, instance_method, **kwargs):
else:
return [function(subset, **kwargs) for subset in data]

def _calculate_weighted_variance(self, data, axis=0):
    """Return the NaN-aware variance weighted by the sparsity measure.

    NOTE(review): assumes ``self._sparsity`` yields a per-axis weight in
    the same shape as ``np.nanvar(data, axis=axis)`` — confirm upstream.
    """
    variance = np.nanvar(data, axis=axis)
    weights = self._sparsity(data, axis=axis)
    return variance * weights

def _count_no_of_datapoints(self, data, axis=0):
return np.sum(np.isfinite(data), axis=axis)

def _calculate_median_std_offset(self, data, axis=1):
stds = np.nanstd(data, axis=axis)
median_stds = np.nanmedian(stds)
Expand Down Expand Up @@ -319,7 +375,7 @@ def _growth_decay_rate(self, data, axis=0):
coefs = np.tile(coef, (imputed_data.shape[1], 1)).T
return coefs

def _nan_mean_std_corr_across_fragments(self, data):
def nan_mean_std_corr_across_fragments(self, data):
mean_corr = self._mean_tile(_nan_correlation_matrix(data), data.shape[1])
std_corr = self._std_tile(_nan_correlation_matrix(data), data.shape[1])

Expand All @@ -331,12 +387,12 @@ def _nan_corrs(self, inputs):
)

def _mean_tile(self, data, samples):
mean = np.nanmean(data, axis=0)
mean = np.mean(data, axis=0)
mean = np.tile(mean, (samples, 1)).T
return mean

def _std_tile(self, data, samples):
std_data = np.nanstd(data, axis=0)
std_data = np.std(data, axis=0)
std_data = np.tile(std_data, (samples, 1)).T
return std_data

Expand Down Expand Up @@ -373,3 +429,28 @@ def parallel_process(self, inputs, method, n_jobs=10, **kwargs):
return Parallel(n_jobs=n_jobs)(
delayed(method)(input, **kwargs) for input in inputs
)

def calculate_ms1_ms2_corr(
self,
ms1_data_extracted: list[np.ndarray],
ms2_data_extracted: list[np.ndarray],

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a check to ensure the lengths of ms1_data_extracted and ms2_data_extracted match before zipping them. This surfaces a clear error up front, instead of letting `zip` silently truncate the longer list or fail with a less informative error later in the process.

def calculate_ms1_ms2_corr(
    self,
    ms1_data_extracted: list[np.ndarray],
    ms2_data_extracted: list[np.ndarray],
) -> pd.DataFrame:
    """
    Calculate the correlation between the ms1 and ms2 data.

    Parameters:
        ms1_data_extracted: list[np.ndarray]
            The ms1 data extracted from the precursor data.
        ms2_data_extracted: list[np.ndarray]
            The ms2 data extracted from the precursor data.

    Returns:
        ms1_ms2_corr_data: pd.DataFrame
            The ms1-ms2 correlation data, one row per input pair.

    Raises:
        ValueError: if the two input lists differ in length.
    """
    # Fail fast: zip() would otherwise silently truncate the longer
    # list and drop data without any error.
    if len(ms1_data_extracted) != len(ms2_data_extracted):
        raise ValueError("MS1 and MS2 data must have the same number of elements")

    zipped_data = zip(ms2_data_extracted, ms1_data_extracted)
    ms1_ms2_corr = self.parallel_process(
        inputs=zipped_data, method=_nan_correlation_w_ref, n_jobs=10
    )
    ms1_ms2_corr = pd.DataFrame(np.vstack(ms1_ms2_corr))
    return ms1_ms2_corr
25 changes: 0 additions & 25 deletions selectlfq/ms1_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,31 +203,6 @@ class FeatureConfig:
DEFAULT_FEATURES = [
"intensity",
"delta_rt",
"rt_observed",
"intensity_correlation",
"score",
"proba",
"base_width_rt",
"rt_calibrated",
"rt_library",
"delta_rt",
"cycle_fwhm",
"mz_observed",
"mz_library",
"mz_calibrated",
"mean_ms2_mass_error",
"top_3_ms2_mass_error",
"mean_overlapping_mass_error",
# "isotope_intensity_correlation",
# "isotope_height_correlation",
# "height_correlation",
# "fragment_scan_correlation",
# "template_scan_correlation",
# "fragment_frame_correlation",
"top3_frame_correlation",
# "template_frame_correlation",
"top3_b_ion_correlation",
# "top3_y_ion_correlation",
]

NORMALIZATION_FEATURES = ["intensity", "mono_ms1_intensity", "sum_ms1_intensity"]
Expand Down
Loading
Loading