Skip to content

Commit

Permalink
check assay name + data file combination
Browse files Browse the repository at this point in the history
  • Loading branch information
ypriverol committed Jul 28, 2024
1 parent 76a4ea0 commit 807d86b
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 3 deletions.
7 changes: 6 additions & 1 deletion sdrf_pipelines/parse_sdrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,9 @@ def maxquant_from_sdrf(
is_flag=True,
)
@click.option("--skip_factor_validation", help="Disable the validation of factor values in SDRF", is_flag=True)
@click.option("--skip_experimental_design_validation", help="Disable the validation of experimental design", is_flag=True)
@click.pass_context
def validate_sdrf(ctx, sdrf_file: str, template: str, skip_ms_validation: bool, skip_factor_validation: bool):
def validate_sdrf(ctx, sdrf_file: str, template: str, skip_ms_validation: bool, skip_factor_validation: bool, skip_experimental_design_validation: bool):
"""
Command to validate the SDRF file. The validation is based on the template provided by the user.
User can select the template to be used for validation. If no template is provided, the default template will be used.
Expand All @@ -154,6 +155,7 @@ def validate_sdrf(ctx, sdrf_file: str, template: str, skip_ms_validation: bool,
@param template: template to be used for validation
@param skip_ms_validation: flag to skip the validation of mass spectrometry fields
@param skip_factor_validation: flag to skip the validation of factor values
@param skip_experimental_design_validation: flag to skip the validation of experimental design
"""

if sdrf_file is None:
Expand All @@ -173,6 +175,9 @@ def validate_sdrf(ctx, sdrf_file: str, template: str, skip_ms_validation: bool,
if not skip_factor_validation:
errors = errors + df.validate_factor_values()

if not skip_experimental_design_validation:
errors = errors + df.validate_experimental_design()

for error in errors:
print(error)

Expand Down
46 changes: 45 additions & 1 deletion sdrf_pipelines/sdrf/sdrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def _constructor(self):

def get_sdrf_columns(self):
    """
    Return the column names of the SDRF.

    :return: the ``columns`` attribute of this object, unchanged.
    """
    sdrf_columns = self.columns
    return sdrf_columns
Expand Down Expand Up @@ -118,3 +118,47 @@ def validate_factor_values(self) -> List[LogicError]:
errors.append(LogicError(error_message, error_type=logging.ERROR))

return errors

def validate_experimental_design(self) -> List[LogicError]:
    """
    Run the experimental-design checks on this SDRF and collect their errors.

    Currently a single check is performed: the pairing between 'assay name' and
    'comment[data file]' must be consistent (delegated to
    ``check_inconsistencies_assay_file``).

    :return: A list of LogicError objects describing the problems found; an empty list when the
    experimental design passes the check.
    """
    # Start from a fresh list; the delegate fills it and hands it back.
    return self.check_inconsistencies_assay_file([])

def check_inconsistencies_assay_file(self, errors: List[LogicError]) -> List[LogicError]:
    """
    Check that 'assay name' and 'comment[data file]' are paired consistently in self.

    Two directions are verified:
    - every assay name is linked to a single distinct data file;
    - every data file is linked to a single distinct assay name.

    :param errors: list that receives a LogicError for each violated direction. It is modified in
    place and also returned.
    :return: the ``errors`` list, extended with at most two LogicError objects (one per direction).
    """
    uniqueness_hint = "the combination assay name and comment[data file] should be unique"

    # Direction 1: group rows by assay name and count the distinct data files in each group.
    # The offending index values are therefore ASSAY NAMES that have several raw files.
    files_per_assay = self.groupby("assay name")["comment[data file]"].nunique()
    assays_with_many_files = files_per_assay[files_per_assay > 1]
    if len(assays_with_many_files) > 0:
        cell_index = assays_with_many_files.index.tolist()
        error_message = f"Multiple raw files with the same assay: {cell_index}, {uniqueness_hint}"
        errors.append(LogicError(error_message, error_type=logging.ERROR))

    # Direction 2: group rows by data file and count the distinct assay names in each group.
    # The offending index values are therefore RAW FILES that have several assay names.
    assays_per_file = self.groupby("comment[data file]")["assay name"].nunique()
    files_with_many_assays = assays_per_file[assays_per_file > 1]
    if len(files_with_many_assays) > 0:
        cell_index = files_with_many_assays.index.tolist()
        error_message = f"Multiple assays with the same raw files: {cell_index}, {uniqueness_hint}"
        errors.append(LogicError(error_message, error_type=logging.ERROR))

    return errors



2 changes: 1 addition & 1 deletion tests/data/erroneous/example.sdrf.tsv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
source name characteristics[organism] characteristics[organism part] characteristics[cell type] characteristics[developmental stage] characteristics[disease] characteristics[enrichment process] characteristics[biological replicate] characteristics[compound] characteristics[cell line] assay name technology type comment[data file] comment[technical replicate] comment[fraction identifier] comment[label] comment[cleavage agent details] comment[modification parameters] comment[modification parameters] comment[modification parameters] comment[modification parameters] comment[instrument] comment[associated file uri] factor value[compound] factor value[concentration of]
E1S1 Mus musculus immune system B cell not applicable mercury poisoning enrichment of phosphorylated protein 1 none CLO:0009575 run 1 proteomic profiling by mass spectrometry 1342_01.RAW 1 1 AC=MS:1002038;NT=label free sample NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 NT=Acetyl;AC=UNIMOD:1;PP=Protein N-term;MT=variable NT=Phospho;MT=variable;TA=S,T,Y;AC=UNIMOD:21 NT=LTQ;AC=MS:1000447 https://ftp.ebi.ac.uk/pride-archive/2013/12/PXD000312/1342_01.RAW none not applicable
E1S2 Mus musculus immune system B cell not applicable mercury poisoning enrichment of phosphorylated protein 1 none CLO:0009575 run 2 proteomic profiling by mass spectrometry 1342_02.RAW 1 1 AC=MS:1002038;NT=label free sample NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 NT=Acetyl;AC=UNIMOD:1;PP=Protein N-term;MT=variable NT=Phospho;MT=variable;TA=S,T,Y;AC=UNIMOD:21 NT=LTQ;AC=MS:1000447 https://ftp.ebi.ac.uk/pride-archive/2013/12/PXD000312/1342_02.RAW none not applicable
E1S2 Mus musculus immune system B cell not applicable mercury poisoning enrichment of phosphorylated protein 1 none CLO:0009575 run 1 proteomic profiling by mass spectrometry 1342_02.RAW 1 1 AC=MS:1002038;NT=label free sample NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 NT=Acetyl;AC=UNIMOD:1;PP=Protein N-term;MT=variable NT=Phospho;MT=variable;TA=S,T,Y;AC=UNIMOD:21 NT=LTQ;AC=MS:1000447 https://ftp.ebi.ac.uk/pride-archive/2013/12/PXD000312/1342_02.RAW none not applicable
E1S3 Mus musculus immune system B cell not applicable mercury poisoning enrichment of phosphorylated protein 1 none CLO:0009575 run 3 proteomic profiling by mass spectrometry 1342_03.RAW 1 1 AC=MS:1002038;NT=label free sample NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 NT=Acetyl;AC=UNIMOD:1;PP=Protein N-term;MT=variable NT=Phospho;MT=variable;TA=S,T,Y;AC=UNIMOD:21 NT=LTQ;AC=MS:1000447 https://ftp.ebi.ac.uk/pride-archive/2013/12/PXD000312/1342_03.RAW none not applicable
E1S4 Mus musculus immune system B cell not applicable mercury poisoning enrichment of phosphorylated protein 1 none CLO:0009575 run 4 proteomic profiling by mass spectrometry 1342_04.RAW 1 1 AC=MS:1002038;NT=label free sample NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 NT=Acetyl;AC=UNIMOD:1;PP=Protein N-term;MT=variable NT=Phospho;MT=variable;TA=S,T,Y;AC=UNIMOD:21 NT=LTQ;AC=MS:1000447 https://ftp.ebi.ac.uk/pride-archive/2013/12/PXD000312/1342_04.RAW none not applicable
E1S5 Mus musculus immune system B cell not applicable mercury poisoning enrichment of phosphorylated protein 1 mercury dichloride CLO:0009575 run 5 proteomic profiling by mass spectrometry 1342_05.RAW 1 1 AC=MS:1002038;NT=label free sample NT=Trypsin;AC=MS:1001251 NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 NT=Oxidation;MT=variable;TA=M;AC=UNIMOD:35 NT=Acetyl;AC=UNIMOD:1;PP=Protein N-term;MT=variable NT=Phospho;MT=variable;TA=S,T,Y;AC=UNIMOD:21 NT=LTQ;AC=MS:1000447 https://ftp.ebi.ac.uk/pride-archive/2013/12/PXD000312/1342_05.RAW mercury dichloride 250 uM
Expand Down

0 comments on commit 807d86b

Please sign in to comment.