Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
e55db9d
Refactor NaN correlation matrix calculation for efficiency and clarity
vuductung Mar 13, 2025
4c0e96a
Add new features to FeatureConfig for enhanced analysis
vuductung Mar 13, 2025
5987bfe
Merge branch 'main' into development
vuductung Mar 13, 2025
3eac6e2
Update feature constants in DataConfig for consistency
vuductung Mar 13, 2025
2dc6011
Update feature name handling in Loader class for consistency
vuductung Mar 13, 2025
606f5ac
Update preprocessing to standardize feature naming
vuductung Mar 13, 2025
279e73b
Update feature constants in DataConfig to include height
vuductung Mar 13, 2025
9b9f451
Refactor preprocessing logic for improved clarity and consistency
vuductung Mar 13, 2025
dc1042b
Enhance Loader class to handle optional directory parameter
vuductung Mar 13, 2025
65344b3
Refactor preprocessing methods for improved data extraction and clarity
vuductung Mar 13, 2025
16a7a72
Add SharedState class for managing shared state across modules
vuductung Mar 13, 2025
9ac126f
Update preprocessing pipeline to utilize shared state for column mana…
vuductung Mar 13, 2025
2cce299
Enhance prediction output to utilize shared state for structured data…
vuductung Mar 13, 2025
22da6b5
Enhance model prediction shifting to utilize shared state for improve…
vuductung Mar 13, 2025
acbb1a0
Update preprocessing pipeline to assign extracted data to shared state
vuductung Mar 13, 2025
cea130c
Add lin_scaled_data attribute to SharedState for enhanced data manage…
vuductung Mar 13, 2025
24e90aa
Refactor ModelConfig for improved structure and feature management
vuductung Mar 18, 2025
dc4be30
Update constants in DataConfig and ColumnConfig for enhanced feature …
vuductung Mar 18, 2025
ef9167c
Remove unused imports from dataloader.py for improved clarity and red…
vuductung Mar 18, 2025
049208a
Remove unused figure display call in MixedSpeciesPerformanceEvaluatio…
vuductung Mar 18, 2025
158a979
Enhance Loader class to support feature loading and logging
vuductung Mar 18, 2025
0943346
Add MS1 and MS2 identifier attributes to SharedState for enhanced dat…
vuductung Mar 18, 2025
b8168ef
Refactor get_logger function in utils.py for improved clarity and fun…
vuductung Mar 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 41 additions & 37 deletions selectlfq/config.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,45 @@
import torch.nn as nn
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question on the PR description: this is AI-generated I guess?
if so, please prompt it to be more concise there.. it should be one level above the actual code changes ;-)

if not: please do not invest so much time in that :-D

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok will do!

from selectlfq.constants import DataConfig
from selectlfq.ms1_features import FeatureConfig
import torch.nn as nn

no_of_engineered_features = 2
no_of_removed_features = 1
input_size = (
len(FeatureConfig.DEFAULT_FEATURES)
+ len(DataConfig.MS2_FEATURE_NAMES)
+ no_of_engineered_features
- no_of_removed_features
)

config = {
"criterion_params": {
"alpha": 0.7,
"epsilon": 1e-8,
"kind": "WVL",
"lambda1": 0.0,
},
"model_params": {
"input_size": input_size,
"hidden_sizes": [input_size] * 3,
"dropout_rate": None,
"activation": nn.ReLU(),
"init": "uniform",
"batch_norm": True,
"normalize": False,
"output_activation": "sigmoid",
},
"optmizer_params": {
"lr": 5e-3,
},
"fit_params": {
"epochs": 40,
"batch_size": 64,
"shuffle": False,
"train_size": 200,
"verbose": False,
},
}
class ModelConfig:
# Constants
no_of_engineered_features = 3
no_of_removed_features = 1
input_size = (
len(FeatureConfig.DEFAULT_FEATURES)
+ len(DataConfig.MS2_FEATURE_NAMES)
+ no_of_engineered_features
- no_of_removed_features
)

# Configuration dictionary
CONFIG = {
"criterion_params": {
"alpha": 0.7,
"epsilon": 1e-8,
"kind": "WVL",
"lambda1": 0.0,
},
"model_params": {
"input_size": input_size,
"hidden_sizes": [input_size] * 3,
"dropout_rate": None,
"activation": nn.ReLU(),
"init": "uniform",
"batch_norm": True,
"normalize": False,
"output_activation": "sigmoid",
},
"optmizer_params": {
"lr": 5e-3,
},
"fit_params": {
"epochs": 40,
"batch_size": 64,
"shuffle": False,
"train_size": 200,
"verbose": True,
},
}
29 changes: 21 additions & 8 deletions selectlfq/constants.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,27 @@
class DataConfig:
SPECIES = ["HUMAN", "YEAST", "ECOLI"]
SAMPLE_TYPES = ["Control", "Treatment", "Blank"]
LOG2_TRANSFORM = True
MS2_FEATURE_NAMES = [
"intensity",
"mass_error",
"correlation",
"height",
"charge",
"mz_observed",
"type",
"number",
# "charge",
# "mz_observed",
# "type",
# "number",
]
LOG2_TRANSFORM_FEATURES = [
"ms2_intensity",
"ms2_height",
"ms1_intensity",
"ms1_mean_overlapping_intensity",
]
ALIGN_FEATURES = [
"ms2_intensity",
"ms2_height",
"ms1_intensity",
"ms1_mean_overlapping_intensity",
]
LOG2_TRANSFORM_FEATURES = ["intensity"]
ALIGN_FEATURES = ["intensity"]


class ColumnConfig:
Expand All @@ -28,3 +36,8 @@ class ColumnConfig:
"mod_seq_hash",
"mod_seq_charge_hash",
]
PRECURSOR_IDENTIFIERS = [
"mod_seq_charge_hash",
"pg",
"precursor_idx",
]
2 changes: 0 additions & 2 deletions selectlfq/dataloader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
"""Module for keeping track of feature layers"""

from typing import List
import numpy as np
import torch
from selectlfq.utils import repeater

Expand Down
1 change: 0 additions & 1 deletion selectlfq/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -773,4 +773,3 @@ def plot_binned_residuals_per_species_subplot(
plt.tight_layout()
if path:
plt.savefig(path, bbox_inches="tight", transparent=True)
fig.show()
44 changes: 17 additions & 27 deletions selectlfq/featureengineering.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,35 +53,25 @@ def _calculate_variance_distance(data):
@njit
def _nan_correlation_matrix(data):
n = len(data)
correlation_matrix = np.empty((n, n))
correlation_matrix = np.full((n, n), np.nan) # Initialize with NaN

for i in range(n):
for j in range(n): # Compute all elements
mask = np.isfinite(data[i]) & np.isfinite(data[j])

if np.sum(mask) > 1: # Ensure there are at least two data points
xi = data[i][mask]
xj = data[j][mask]
std_dev_i = np.std(xi)
std_dev_j = np.std(xj)

if (std_dev_i > 0) and (std_dev_j > 0):
mean_i = np.mean(xi)
mean_j = np.mean(xj)
sparsity = np.mean(mask)
covariance = np.mean((xi - mean_i) * (xj - mean_j))
corr = covariance / (std_dev_i * std_dev_j)
correlation_matrix[i, j] = corr * sparsity

else:
correlation_matrix[i, j] = np.nan # Set to NaN if no variation

else:
correlation_matrix[i, j] = (
np.nan
) # Set to NaN if not enough data points

np.fill_diagonal(correlation_matrix, np.nan)
for j in range(n):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if i==j: continue

saves a level of indent ;-)

if i != j: # Only compute for off-diagonal elements
mask = np.isfinite(data[i]) & np.isfinite(data[j])
if np.sum(mask) > 1: # Ensure there are at least two data points
xi = data[i][mask]
xj = data[j][mask]
std_dev_i = np.std(xi)
std_dev_j = np.std(xj)

if (std_dev_i > 0) and (std_dev_j > 0):
mean_i = np.mean(xi)
mean_j = np.mean(xj)
sparsity = np.mean(mask)
covariance = np.mean((xi - mean_i) * (xj - mean_j))
corr = covariance / (std_dev_i * std_dev_j)
correlation_matrix[i, j] = corr * sparsity

return correlation_matrix

Expand Down
92 changes: 41 additions & 51 deletions selectlfq/loader.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import pandas as pd
import os
from selectlfq.utils import repeater
from selectlfq.preprocessing import Preprocessing
from typing import List
from selectlfq.constants import DataConfig

import pandas as pd
from selectlfq.shared_state import shared_state
from selectlfq.preprocessing import Preprocessing
from selectlfq.constants import DataConfig, ColumnConfig
from selectlfq.ms1_features import FeatureConfig
from selectlfq.utils import get_logger

logger = get_logger()


class Loader:
Expand Down Expand Up @@ -66,6 +70,18 @@ def load_pg_data(
data = pd.read_csv(filepath, delimiter="\t")
return self.preprocessing.preprocess_pg_data(data)

def load_features(self, output_folder: str) -> pd.DataFrame:
"""
Load the features from a given directory.
"""
precursor_features = self.load_precursor_file(output_folder)
fragment_features = self.load_fragment_data_files(output_folder)

return {
"ms1": precursor_features,
"ms2": fragment_features,
}

def load_precursor_data(
self, output_folder: str, categorical_features: list = None
) -> pd.DataFrame:
Expand Down Expand Up @@ -97,7 +113,7 @@ def load_precursor_data(
return self.preprocessing.preprocess_pg_data(data)

def load_fragment_data_files(
self, directory: str, feature_folder: str = "features"
self, directory: str = None, feature_folder: str = "features"
) -> List[pd.DataFrame]:
"""
Load all fragment data files from a given directory. The files
Expand All @@ -123,17 +139,26 @@ def load_fragment_data_files(

fragment_data_dict = {}

# if no directory is provided, use the current directory
if not directory:
directory = os.getcwd()

directory = os.path.join(directory, feature_folder)

for feature_name in DataConfig.MS2_FEATURE_NAMES:
file_name = feature_name + ".csv"
file_path = os.path.join(directory, file_name)
df = pd.read_csv(file_path, index_col=0)
fragment_data_dict[feature_name] = df
logger.info("Reading MS2 feature: %s", feature_name)
feature_name = "ms2_" + feature_name

fragment_data_dict[feature_name] = df
shared_state.ms2_identifiers = fragment_data_dict["ms2_intensity"][
ColumnConfig.IDENTIFIERS
]
return fragment_data_dict

def load_precursor_file(self, directory: str) -> pd.DataFrame:
def load_precursor_file(self, directory: str = None) -> pd.DataFrame:
"""
Load the precursor data from a given directory and pivot
the data by the given features.
Expand All @@ -149,8 +174,11 @@ def load_precursor_file(self, directory: str) -> pd.DataFrame:
"""
# read data

if self.df is None:
if not self.df:
if not directory:
directory = os.getcwd()
file_path = os.path.join(directory, "precursors.tsv")
logger.info("Reading precursor file from: %s", file_path)
self.df = pd.read_csv(file_path, sep="\t", index_col=0)

# pivot table by features
Expand All @@ -164,65 +192,27 @@ def _pivot_table_by_feature(self, features: list, data: pd.DataFrame):
precursor_data_dict = {}
if isinstance(features, list):
for feat in features:
logger.info("Pivoting table by MS1 feature: %s", feat)
if feat == "sequence":
data["prec_len"] = data["sequence"].apply(lambda x: len(x))
data["prec_len"] = data["sequence"].str.len()
feat = "prec_len"

prec_data = data.pivot_table(
index=["mod_seq_charge_hash", "pg", "precursor_idx"],
columns="run",
values=feat,
).reset_index()
feat = "ms1_" + feat
precursor_data_dict[feat] = prec_data

else:
logger.info("Pivoting table by feature: %s", features)
prec_data = data.pivot_table(
index=["mod_seq_charge_hash", "pg", "precursor_idx"],
columns="run",
values=features,
).reset_index()
features = "ms1_" + features
precursor_data_dict[features] = prec_data

return precursor_data_dict

def _sort_by_list(
self, data: pd.DataFrame, col: str, reindexed_list: List[int]
) -> pd.DataFrame:
return data.set_index(col).reindex(reindexed_list).reset_index()

def sync_ms1_and_ms2_data(self, data: List[pd.DataFrame]) -> List[pd.DataFrame]:
"""
Sync the ms1 and ms2 data by the precursor index. The data is
sorted by the ion column, so that the data is in the same order
for both ms1 and ms2.

Parameters
----------
data : List[pd.DataFrame]
The data to sync.
Returns
-------
List[pd.DataFrame]
The synced data.
"""
if self.prec_idx is None:
raise ValueError(
"Precursor index is not defined, load fragment data first to retrieve precursor index"
)
else:
synced_data = repeater(
data,
pd.merge,
False,
right=self.prec_idx,
on=["precursor_idx", "mod_seq_charge_hash"],
how="outer",
)
sorted_data = repeater(
synced_data,
self._sort_by_list,
False,
col="ion",
reindexed_list=self.prec_idx["ion"].tolist(),
)
return sorted_data
19 changes: 18 additions & 1 deletion selectlfq/ms1_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,26 @@ class FeatureConfig:
"score",
"proba",
"base_width_rt",
"rt_calibrated", # might have to delete
"rt_calibrated",
"rt_library",
"delta_rt",
"cycle_fwhm",
"mz_observed",
"mz_library",
"mz_calibrated",
"mean_ms2_mass_error",
"top_3_ms2_mass_error",
"mean_overlapping_mass_error",
# "isotope_intensity_correlation",
# "isotope_height_correlation",
# "height_correlation",
# "fragment_scan_correlation",
# "template_scan_correlation",
# "fragment_frame_correlation",
"top3_frame_correlation",
# "template_frame_correlation",
"top3_b_ion_correlation",
# "top3_y_ion_correlation",
]

NORMALIZATION_FEATURES = ["intensity", "mono_ms1_intensity", "sum_ms1_intensity"]
Expand Down
Loading
Loading