Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
e55db9d
Refactor NaN correlation matrix calculation for efficiency and clarity
vuductung Mar 13, 2025
4c0e96a
Add new features to FeatureConfig for enhanced analysis
vuductung Mar 13, 2025
5987bfe
Merge branch 'main' into development
vuductung Mar 13, 2025
3eac6e2
Update feature constants in DataConfig for consistency
vuductung Mar 13, 2025
2dc6011
Update feature name handling in Loader class for consistency
vuductung Mar 13, 2025
606f5ac
Update preprocessing to standardize feature naming
vuductung Mar 13, 2025
279e73b
Update feature constants in DataConfig to include height
vuductung Mar 13, 2025
9b9f451
Refactor preprocessing logic for improved clarity and consistency
vuductung Mar 13, 2025
dc1042b
Enhance Loader class to handle optional directory parameter
vuductung Mar 13, 2025
65344b3
Refactor preprocessing methods for improved data extraction and clarity
vuductung Mar 13, 2025
16a7a72
Add SharedState class for managing shared state across modules
vuductung Mar 13, 2025
9ac126f
Update preprocessing pipeline to utilize shared state for column mana…
vuductung Mar 13, 2025
2cce299
Enhance prediction output to utilize shared state for structured data…
vuductung Mar 13, 2025
22da6b5
Enhance model prediction shifting to utilize shared state for improve…
vuductung Mar 13, 2025
acbb1a0
Update preprocessing pipeline to assign extracted data to shared state
vuductung Mar 13, 2025
cea130c
Add lin_scaled_data attribute to SharedState for enhanced data manage…
vuductung Mar 13, 2025
24e90aa
Refactor ModelConfig for improved structure and feature management
vuductung Mar 18, 2025
dc4be30
Update constants in DataConfig and ColumnConfig for enhanced feature …
vuductung Mar 18, 2025
ef9167c
Remove unused imports from dataloader.py for improved clarity and red…
vuductung Mar 18, 2025
049208a
Remove unused figure display call in MixedSpeciesPerformanceEvaluatio…
vuductung Mar 18, 2025
158a979
Enhance Loader class to support feature loading and logging
vuductung Mar 18, 2025
0943346
Add MS1 and MS2 identifier attributes to SharedState for enhanced dat…
vuductung Mar 18, 2025
b8168ef
Refactor get_logger function in utils.py for improved clarity and fun…
vuductung Mar 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 41 additions & 37 deletions selectlfq/config.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,45 @@
import torch.nn as nn
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question on the PR description: this is AI-generated I guess?
if so, please prompt it to be more concise there.. it should be one level above the actual code changes ;-)

if not: please do not invest so much time in that :-D

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok will do!

from selectlfq.constants import DataConfig
from selectlfq.ms1_features import FeatureConfig
import torch.nn as nn

no_of_engineered_features = 2
no_of_removed_features = 1
input_size = (
len(FeatureConfig.DEFAULT_FEATURES)
+ len(DataConfig.MS2_FEATURE_NAMES)
+ no_of_engineered_features
- no_of_removed_features
)

config = {
"criterion_params": {
"alpha": 0.7,
"epsilon": 1e-8,
"kind": "WVL",
"lambda1": 0.0,
},
"model_params": {
"input_size": input_size,
"hidden_sizes": [input_size] * 3,
"dropout_rate": None,
"activation": nn.ReLU(),
"init": "uniform",
"batch_norm": True,
"normalize": False,
"output_activation": "sigmoid",
},
"optmizer_params": {
"lr": 5e-3,
},
"fit_params": {
"epochs": 40,
"batch_size": 64,
"shuffle": False,
"train_size": 200,
"verbose": False,
},
}
class ModelConfig:
# Constants
no_of_engineered_features = 3
no_of_removed_features = 1
input_size = (
len(FeatureConfig.DEFAULT_FEATURES)
+ len(DataConfig.MS2_FEATURE_NAMES)
+ no_of_engineered_features
- no_of_removed_features
)

# Configuration dictionary
CONFIG = {
"criterion_params": {
"alpha": 0.7,
"epsilon": 1e-8,
"kind": "WVL",
"lambda1": 0.0,
},
"model_params": {
"input_size": input_size,
"hidden_sizes": [input_size] * 3,
"dropout_rate": None,
"activation": nn.ReLU(),
"init": "uniform",
"batch_norm": True,
"normalize": False,
"output_activation": "sigmoid",
},
"optmizer_params": {
"lr": 5e-3,
},
"fit_params": {
"epochs": 40,
"batch_size": 64,
"shuffle": False,
"train_size": 200,
"verbose": True,
},
}
29 changes: 21 additions & 8 deletions selectlfq/constants.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,27 @@
class DataConfig:
SPECIES = ["HUMAN", "YEAST", "ECOLI"]
SAMPLE_TYPES = ["Control", "Treatment", "Blank"]
LOG2_TRANSFORM = True
MS2_FEATURE_NAMES = [
"intensity",
"mass_error",
"correlation",
"height",
"charge",
"mz_observed",
"type",
"number",
# "charge",
# "mz_observed",
# "type",
# "number",
]
LOG2_TRANSFORM_FEATURES = [
"ms2_intensity",
"ms2_height",
"ms1_intensity",
"ms1_mean_overlapping_intensity",
]
ALIGN_FEATURES = [
"ms2_intensity",
"ms2_height",
"ms1_intensity",
"ms1_mean_overlapping_intensity",
]
LOG2_TRANSFORM_FEATURES = ["intensity"]
ALIGN_FEATURES = ["intensity"]


class ColumnConfig:
Expand All @@ -28,3 +36,8 @@ class ColumnConfig:
"mod_seq_hash",
"mod_seq_charge_hash",
]
PRECURSOR_IDENTIFIERS = [
"mod_seq_charge_hash",
"pg",
"precursor_idx",
]
2 changes: 0 additions & 2 deletions selectlfq/dataloader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
"""Module for keeping track of feature layers"""

from typing import List
import numpy as np
import torch
from selectlfq.utils import repeater

Expand Down
1 change: 0 additions & 1 deletion selectlfq/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -773,4 +773,3 @@ def plot_binned_residuals_per_species_subplot(
plt.tight_layout()
if path:
plt.savefig(path, bbox_inches="tight", transparent=True)
fig.show()
44 changes: 17 additions & 27 deletions selectlfq/featureengineering.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,35 +53,25 @@ def _calculate_variance_distance(data):
@njit
def _nan_correlation_matrix(data):
n = len(data)
correlation_matrix = np.empty((n, n))
correlation_matrix = np.full((n, n), np.nan) # Initialize with NaN

for i in range(n):
for j in range(n): # Compute all elements
mask = np.isfinite(data[i]) & np.isfinite(data[j])

if np.sum(mask) > 1: # Ensure there are at least two data points
xi = data[i][mask]
xj = data[j][mask]
std_dev_i = np.std(xi)
std_dev_j = np.std(xj)

if (std_dev_i > 0) and (std_dev_j > 0):
mean_i = np.mean(xi)
mean_j = np.mean(xj)
sparsity = np.mean(mask)
covariance = np.mean((xi - mean_i) * (xj - mean_j))
corr = covariance / (std_dev_i * std_dev_j)
correlation_matrix[i, j] = corr * sparsity

else:
correlation_matrix[i, j] = np.nan # Set to NaN if no variation

else:
correlation_matrix[i, j] = (
np.nan
) # Set to NaN if not enough data points

np.fill_diagonal(correlation_matrix, np.nan)
for j in range(n):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if i==j: continue

saves a level of indent ;-)

if i != j: # Only compute for off-diagonal elements
mask = np.isfinite(data[i]) & np.isfinite(data[j])
if np.sum(mask) > 1: # Ensure there are at least two data points
xi = data[i][mask]
xj = data[j][mask]
std_dev_i = np.std(xi)
std_dev_j = np.std(xj)

if (std_dev_i > 0) and (std_dev_j > 0):
mean_i = np.mean(xi)
mean_j = np.mean(xj)
sparsity = np.mean(mask)
covariance = np.mean((xi - mean_i) * (xj - mean_j))
corr = covariance / (std_dev_i * std_dev_j)
correlation_matrix[i, j] = corr * sparsity

return correlation_matrix

Expand Down
92 changes: 41 additions & 51 deletions selectlfq/loader.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import pandas as pd
import os
from selectlfq.utils import repeater
from selectlfq.preprocessing import Preprocessing
from typing import List
from selectlfq.constants import DataConfig

import pandas as pd
from selectlfq.shared_state import shared_state
from selectlfq.preprocessing import Preprocessing
from selectlfq.constants import DataConfig, ColumnConfig
from selectlfq.ms1_features import FeatureConfig
from selectlfq.utils import get_logger

logger = get_logger()


class Loader:
Expand Down Expand Up @@ -66,6 +70,18 @@ def load_pg_data(
data = pd.read_csv(filepath, delimiter="\t")
return self.preprocessing.preprocess_pg_data(data)

def load_features(self, output_folder: str) -> pd.DataFrame:
"""
Load the features from a given directory.
"""
precursor_features = self.load_precursor_file(output_folder)
fragment_features = self.load_fragment_data_files(output_folder)

return {
"ms1": precursor_features,
"ms2": fragment_features,
}

def load_precursor_data(
self, output_folder: str, categorical_features: list = None
) -> pd.DataFrame:
Expand Down Expand Up @@ -97,7 +113,7 @@ def load_precursor_data(
return self.preprocessing.preprocess_pg_data(data)

def load_fragment_data_files(
self, directory: str, feature_folder: str = "features"
self, directory: str = None, feature_folder: str = "features"
) -> List[pd.DataFrame]:
"""
Load all fragment data files from a given directory. The files
Expand All @@ -123,17 +139,26 @@ def load_fragment_data_files(

fragment_data_dict = {}

# if no directory is provided, use the current directory
if not directory:
directory = os.getcwd()

directory = os.path.join(directory, feature_folder)

for feature_name in DataConfig.MS2_FEATURE_NAMES:
file_name = feature_name + ".csv"
file_path = os.path.join(directory, file_name)
df = pd.read_csv(file_path, index_col=0)
fragment_data_dict[feature_name] = df
logger.info("Reading MS2 feature: %s", feature_name)
feature_name = "ms2_" + feature_name

fragment_data_dict[feature_name] = df
shared_state.ms2_identifiers = fragment_data_dict["ms2_intensity"][
ColumnConfig.IDENTIFIERS
]
return fragment_data_dict

def load_precursor_file(self, directory: str) -> pd.DataFrame:
def load_precursor_file(self, directory: str = None) -> pd.DataFrame:
"""
Load the precursor data from a given directory and pivot
the data by the given features.
Expand All @@ -149,8 +174,11 @@ def load_precursor_file(self, directory: str) -> pd.DataFrame:
"""
# read data

if self.df is None:
if not self.df:
if not directory:
directory = os.getcwd()
file_path = os.path.join(directory, "precursors.tsv")
logger.info("Reading precursor file from: %s", file_path)
self.df = pd.read_csv(file_path, sep="\t", index_col=0)

# pivot table by features
Expand All @@ -164,65 +192,27 @@ def _pivot_table_by_feature(self, features: list, data: pd.DataFrame):
precursor_data_dict = {}
if isinstance(features, list):
for feat in features:
logger.info("Pivoting table by MS1 feature: %s", feat)
if feat == "sequence":
data["prec_len"] = data["sequence"].apply(lambda x: len(x))
data["prec_len"] = data["sequence"].str.len()
feat = "prec_len"

prec_data = data.pivot_table(
index=["mod_seq_charge_hash", "pg", "precursor_idx"],
columns="run",
values=feat,
).reset_index()
feat = "ms1_" + feat
precursor_data_dict[feat] = prec_data

else:
logger.info("Pivoting table by feature: %s", features)
prec_data = data.pivot_table(
index=["mod_seq_charge_hash", "pg", "precursor_idx"],
columns="run",
values=features,
).reset_index()
features = "ms1_" + features
precursor_data_dict[features] = prec_data

return precursor_data_dict

def _sort_by_list(
self, data: pd.DataFrame, col: str, reindexed_list: List[int]
) -> pd.DataFrame:
return data.set_index(col).reindex(reindexed_list).reset_index()

def sync_ms1_and_ms2_data(self, data: List[pd.DataFrame]) -> List[pd.DataFrame]:
"""
Sync the ms1 and ms2 data by the precursor index. The data is
sorted by the ion column, so that the data is in the same order
for both ms1 and ms2.

Parameters
----------
data : List[pd.DataFrame]
The data to sync.
Returns
-------
List[pd.DataFrame]
The synced data.
"""
if self.prec_idx is None:
raise ValueError(
"Precursor index is not defined, load fragment data first to retrieve precursor index"
)
else:
synced_data = repeater(
data,
pd.merge,
False,
right=self.prec_idx,
on=["precursor_idx", "mod_seq_charge_hash"],
how="outer",
)
sorted_data = repeater(
synced_data,
self._sort_by_list,
False,
col="ion",
reindexed_list=self.prec_idx["ion"].tolist(),
)
return sorted_data
19 changes: 18 additions & 1 deletion selectlfq/ms1_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,26 @@ class FeatureConfig:
"score",
"proba",
"base_width_rt",
"rt_calibrated", # might have to delete
"rt_calibrated",
"rt_library",
"delta_rt",
"cycle_fwhm",
"mz_observed",
"mz_library",
"mz_calibrated",
"mean_ms2_mass_error",
"top_3_ms2_mass_error",
"mean_overlapping_mass_error",
# "isotope_intensity_correlation",
# "isotope_height_correlation",
# "height_correlation",
# "fragment_scan_correlation",
# "template_scan_correlation",
# "fragment_frame_correlation",
"top3_frame_correlation",
# "template_frame_correlation",
"top3_b_ion_correlation",
# "top3_y_ion_correlation",
]

NORMALIZATION_FEATURES = ["intensity", "mono_ms1_intensity", "sum_ms1_intensity"]
Expand Down
Loading
Loading