Skip to content

Commit

Permalink
validation of factor values.
Browse files Browse the repository at this point in the history
  • Loading branch information
ypriverol committed Jul 28, 2024
1 parent 793c903 commit b61da33
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 0 deletions.
4 changes: 4 additions & 0 deletions sdrf_pipelines/parse_sdrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def maxquant_from_sdrf(
)
Maxquant().maxquant_experiamental_design(sdrf, output2)


@click.command("validate-sdrf", short_help="Command to validate the sdrf file")
@click.option("--sdrf_file", "-s", help="SDRF file to be validated")
@click.option(
Expand Down Expand Up @@ -169,6 +170,9 @@ def validate_sdrf(ctx, sdrf_file: str, template: str, skip_ms_validation: bool,
if not skip_ms_validation:
errors = errors + df.validate(MASS_SPECTROMETRY)

if not skip_factor_validation:
errors = errors + df.validate_factor_values()

for error in errors:
print(error)

Expand Down
42 changes: 42 additions & 0 deletions sdrf_pipelines/sdrf/sdrf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import logging
from typing import List

import pandas as pd

Expand All @@ -17,6 +18,8 @@
from sdrf_pipelines.sdrf.sdrf_schema import nonvertebrates_chema
from sdrf_pipelines.sdrf.sdrf_schema import plants_chema
from sdrf_pipelines.sdrf.sdrf_schema import vertebrates_chema
from sdrf_pipelines.utils.exceptions import LogicError
from typing import List


class SdrfDataFrame(pd.DataFrame):
Expand Down Expand Up @@ -77,3 +80,42 @@ def validate(self, template: str):
errors = mass_spectrometry_schema.validate(self)

return errors

def validate_factor_values(self) -> List[LogicError]:
    """
    Validate that factor value columns exist and mirror their source columns.

    Every column whose name starts with 'factor value' (case-insensitive) must have
    exactly one companion column (e.g. a characteristics or comment column) whose
    bracketed term is the same as the factor's, and both columns must hold the same
    value in every row.

    The companion is matched on the exact bracketed term rather than by substring,
    so 'factor value[age]' pairs with 'characteristics[age]' but not with
    'characteristics[developmental stage]'.

    :return: A list of LogicError objects, one per problem found (no factor value
        columns at all, no companion column, several candidate companion columns,
        or rows whose values differ); an empty list when everything is consistent.
    """

    def _bracketed_term(column_name):
        # Lower-cased text between the first '[' and the last ']' of a column name.
        # Names without brackets are returned whole so they can still be matched.
        lowered = str(column_name).lower()
        _, opening, remainder = lowered.partition("[")
        if not opening:
            return lowered.strip()
        term, closing, _ = remainder.rpartition("]")
        return (term if closing else remainder).strip()

    errors = []
    # Check if any column starts with 'factor value' (case-insensitive)
    fv_values = [col for col in self.columns if str(col).lower().startswith("factor value")]

    if not fv_values:
        error_message = f"No factor values present in the following SDRF columns: {self.columns}"
        errors.append(LogicError(error_message, error_type=logging.ERROR))

    # Columns that may act as the source of a factor value, paired with their term.
    candidate_terms = [
        (col, _bracketed_term(col)) for col in self.columns if "factor value" not in str(col).lower()
    ]

    # find the corresponding columns for the factor values
    fv_dc = {}
    for fv in fv_values:
        factor = _bracketed_term(fv)
        cols = [col for col, term in candidate_terms if term == factor]
        if not cols:
            error_message = f"Make sure your SDRF have a sample characteristics or data comment '{factor}' for your factor value column '{fv}'"
            errors.append(LogicError(error_message, error_type=logging.ERROR))
        elif len(cols) > 1:
            error_message = f"Multiple columns found for factor '{factor}': {cols}"
            errors.append(LogicError(error_message, error_type=logging.ERROR))
        else:
            fv_dc[fv] = cols[0]

    for factor, col in fv_dc.items():
        factor_series = self[factor]
        column_series = self[col]
        # A single mask drives both the decision and the reported rows, so the two
        # can never disagree. Cells missing on both sides count as equal, because
        # a plain '!=' would flag NaN against NaN as a difference.
        both_missing = factor_series.isna() & column_series.isna()
        mismatch = (factor_series != column_series) & ~both_missing
        if mismatch.any():
            # if factor value contains different values from corresponding columns, print the values
            different_values = mismatch[mismatch].index.tolist()
            error_message = f"Factor '{factor}' and column '{col}' do not have the same values for the following rows: {different_values}"
            errors.append(LogicError(error_message, error_type=logging.ERROR))

    return errors

0 comments on commit b61da33

Please sign in to comment.