diff --git a/docs/README.adoc b/docs/README.adoc index fe919f8..3764500 100644 --- a/docs/README.adoc +++ b/docs/README.adoc @@ -1152,7 +1152,7 @@ The following table presents the fields needed to describe each feature in quant | `pg_accessions` | Protein group accession. Could be one single protein or multiple protein accessions, depending on the tool. | array[string], null -| Protein.Ids +| Protein.Group | x | Proteins | accession diff --git a/quantmsio/commands/diann_command.py b/quantmsio/commands/diann_command.py index b52ad50..a3847e2 100644 --- a/quantmsio/commands/diann_command.py +++ b/quantmsio/commands/diann_command.py @@ -13,11 +13,6 @@ help="the diann report file path", required=True, ) -@click.option( - "--design_file", - help="the design file path", - required=True, -) @click.option("--qvalue_threshold", help="qvalue_threshold", required=True, default=0.05) @click.option( "--mzml_info_folder", @@ -34,6 +29,11 @@ help="Folder where the Json file will be generated", required=True, ) +@click.option( + "--protein_file", + help="Protein file that meets specific requirements", + required=False, +) @click.option( "--output_prefix_file", help="Prefix of the Json file needed to generate the file name", @@ -50,11 +50,11 @@ ) def diann_convert_to_parquet( report_path: str, - design_file: str, qvalue_threshold: float, mzml_info_folder: str, sdrf_path: str, output_folder: str, + protein_file: str, output_prefix_file: str, duckdb_max_memory: str, duckdb_threads: int, @@ -62,7 +62,6 @@ def diann_convert_to_parquet( ): """ report_path: diann report file path - design_file: the disign file path qvalue_threshold: qvalue threshold mzml_info_folder: mzml info file folder sdrf_path: sdrf file path @@ -74,7 +73,6 @@ def diann_convert_to_parquet( """ if ( report_path is None - or design_file is None or mzml_info_folder is None or output_folder is None or sdrf_path is None @@ -88,7 +86,6 @@ def diann_convert_to_parquet( output_prefix_file = "" feature_output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".feature.parquet") - psm_output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".psm.parquet") dia_nn = DiaNNConvert( diann_report=report_path, @@ -97,11 +94,10 @@ def diann_convert_to_parquet( duckdb_threads=duckdb_threads, ) - dia_nn.generate_psm_and_feature_file( + dia_nn.write_feature_to_file( qvalue_threshold=qvalue_threshold, mzml_info_folder=mzml_info_folder, - design_file=design_file, - psm_output_path=psm_output_path, - feature_output_path=feature_output_path, - file_num=file_num, + output_path = feature_output_path, + file_num = file_num, + protein_file=protein_file, ) diff --git a/quantmsio/core/common.py b/quantmsio/core/common.py index 1937e96..8728f10 100644 --- a/quantmsio/core/common.py +++ b/quantmsio/core/common.py @@ -38,20 +38,21 @@ DIANN_MAP = { "File.Name": "reference_file_name", - "Precursor.Normalised": "intensity", + "Precursor.Quantity": "intensity", "RT.Start": "rt_start", "RT.Stop": "rt_stop", "RT": "rt", "Predicted.RT": "predicted_rt", - "Protein.Ids": "pg_accessions", + "Protein.Group": "pg_accessions", + "Protein.Ids": "mp_accessions", "PEP": "posterior_error_probability", "Global.Q.Value": "global_qvalue", - "Global.PG.Q.Value": "protein_global_qvalue", + "Global.PG.Q.Value": "pg_global_qvalue", "Precursor.Charge": "precursor_charge", "Stripped.Sequence": "sequence", "Modified.Sequence": "peptidoform", "Genes": "gg_names", - "opt_global_spectrum_reference": "scan_number", + "opt_global_spectrum_reference": "scan", "Calculate.Precursor.Mz": "calculated_mz", "exp_mass_to_charge": "observed_mz", } diff --git a/quantmsio/core/diann.py b/quantmsio/core/diann.py index 591f0f9..513a0bf 100644 --- a/quantmsio/core/diann.py +++ b/quantmsio/core/diann.py @@ -11,55 +11,14 @@ from pyopenms import AASequence from pyopenms.Constants import PROTON_MASS_U from quantmsio.core.project import create_uuid_filename +from quantmsio.utils.file_utils import extract_protein_list from quantmsio.core.sdrf import SDRFHandler -from quantmsio.core.psm import Psm from quantmsio.core.feature import Feature from quantmsio.utils.pride_utils import get_peptidoform_proforma_version_in_mztab, generate_scan_number -from quantmsio.core.common import DIANN_MAP, QUANTMSIO_VERSION +from quantmsio.core.common import DIANN_MAP MODIFICATION_PATTERN = re.compile(r"\((.*?)\)") - -def get_exp_design_dfs(exp_design_file): - with open(exp_design_file, "r") as f: - data = f.readlines() - empty_row = data.index("\n") - f_table = [i.replace("\n", "").split("\t") for i in data[1:empty_row]] - f_header = data[0].replace("\n", "").split("\t") - f_table = pd.DataFrame(f_table, columns=f_header) - f_table.loc[:, "run"] = f_table.apply(lambda x: _true_stem(x["Spectra_Filepath"]), axis=1) - f_table.rename(columns={"Fraction_Group": "ms_run", "run": "Run"}, inplace=True) - s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:] - s_header = data[empty_row + 1].replace("\n", "").split("\t") - s_data_frame = pd.DataFrame(s_table, columns=s_header) - - return s_data_frame, f_table - - -def _true_stem(x): - """ - Return the true stem of a file name, i.e. the - file name without the extension. - - :param x: The file name - :type x: str - :return: The true stem of the file name - :rtype: str - - Examples: - >>> _true_stem("foo.mzML") - 'foo' - >>> _true_stem("foo.d.tar") - 'foo' - - These examples can be tested with pytest: - $ pytest -v --doctest-modules - """ - stem = os.path.splitext(os.path.basename(x))[0] - - return stem - - def find_modification(peptide): """ Identify the modification site based on the peptide containing modifications. @@ -137,8 +96,8 @@ def get_report_from_database(self, runs: list) -> pd.DataFrame: s = time.time() database = self._duckdb.query( """ - select "File.Name", "Run", "Protein.Ids", "Genes", "RT", "Predicted.RT", "RT.Start", "RT.Stop", "Precursor.Id", "Q.Value", "Global.Q.Value", "PEP", - "Global.PG.Q.Value", "Modified.Sequence", "Stripped.Sequence", "Precursor.Charge", "Precursor.Normalised" + select "File.Name", "Run", "Protein.Group", "Protein.Ids", "Genes", "RT", "Predicted.RT", "RT.Start", "RT.Stop", "Precursor.Id", "Q.Value", "Global.Q.Value", "PEP", + "PG.Q.Value", "Global.PG.Q.Value", "Modified.Sequence", "Stripped.Sequence", "Precursor.Charge", "Precursor.Quantity", "Precursor.Normalised" from diann_report where Run IN {} """.format( @@ -189,6 +148,7 @@ def main_report_df( qvalue_threshold: float, mzml_info_folder: str, file_num: int, + protein_str: str = None ): def intergrate_msg(n): nonlocal report @@ -227,6 +187,8 @@ def intergrate_msg(n): info_list = [info_list[i : i + file_num] for i in range(0, len(info_list), file_num)] for refs in info_list: report = self.get_report_from_database(refs) + if protein_str: + report = report[report["Protein.Ids"].str.contains(f"{protein_str}", na=False)] # restrict report = report[report["Q.Value"] < qvalue_threshold] usecols = report.columns.to_list() + [ @@ -246,8 +208,8 @@ def intergrate_msg(n): report["modifications"] = report["Modified.Sequence"].apply(find_modification) report["Modified.Sequence"] = report["Modified.Sequence"].map(modifications_map) # pep - report["psm_reference_file_name"] = report["Precursor.Id"].map(best_ref_map) - report["psm_scan_number"] = None + report["scan_reference_file_name"] = report["Precursor.Id"].map(best_ref_map) + #report["scan"] = None report.rename(columns=DIANN_MAP, inplace=True) # add extra msg report = self.add_additional_msg(report) @@ -262,69 +224,60 @@ def add_additional_msg(self, report: pd.DataFrame) -> pd.DataFrame: report["reference_file_name"] = report["reference_file_name"].apply(lambda x: x.split(".")[0]) report.loc[:, "is_decoy"] = "0" + report.loc[:, "channel"] = "LFQ" report.loc[:, "unique"] = report["pg_accessions"].apply(lambda x: "0" if ";" in str(x) else "1") - report.loc[:, "pg_positions"] = None - report["peptidoform"] = report[["sequence", "modifications"]].apply( lambda row: get_peptidoform_proforma_version_in_mztab( row["sequence"], row["modifications"], self._modifications ), axis=1, ) - report["scan_number"] = report["scan_number"].apply(generate_scan_number) + report["scan"] = report["scan"].apply(generate_scan_number) report.loc[:, "gg_names"] = report["gg_names"].str.split(",") - report.loc[:, "additional_scores"] = None + report.loc[:, "additional_intensities"] = report["Precursor.Normalised"].apply(lambda v: [{"name": "normalized intensity", "value": np.float32(v)}]) + report.loc[:, "additional_scores"] = report[["Q.Value","PG.Q.Value"]].apply(lambda row: [{"name": "qvalue", "value": row["Q.Value"]}, {"name": "pg_qvalue", "value": row["PG.Q.Value"]}],axis=1) report.loc[:, "modification_details"] = None report.loc[:, "cv_params"] = None - report.loc[:, "quantmsio_version"] = QUANTMSIO_VERSION report.loc[:, "gg_accessions"] = None - + report.loc[:, "best_id_score"] = None return report - def generate_psm_file(self, report, psm_pqwriter, psm_output_path): - psm = report.copy() - psm.loc[:, "intensity_array"] = None - psm.loc[:, "ion_mobility"] = None - psm.loc[:, "mz_array"] = None - psm.loc[:, "num_peaks"] = None - psm.loc[:, "rank"] = None - parquet_table = Psm.convert_to_parquet(psm, self._modifications) - if not psm_pqwriter: - psm_pqwriter = pq.ParquetWriter(psm_output_path, parquet_table.schema) - psm_pqwriter.write_table(parquet_table) - return psm_pqwriter - - def generate_psm_and_feature_file( + + def generate_feature( self, qvalue_threshold: float, mzml_info_folder: str, - design_file: str, - psm_output_path: str, - feature_output_path: str, file_num: int = 50, + protein_str: str = None ): - psm_pqwriter = None - feature_pqwriter = None - - s_data_frame, f_table = get_exp_design_dfs(design_file) - for report in self.main_report_df(qvalue_threshold, mzml_info_folder, file_num): + for report in self.main_report_df(qvalue_threshold, mzml_info_folder, file_num, protein_str): s = time.time() - psm_pqwriter = self.generate_psm_file(report, psm_pqwriter, psm_output_path) - feature_pqwriter = self.generate_feature_file( - report, - s_data_frame, - f_table, - feature_pqwriter, - feature_output_path, - ) + report = self.merge_sdrf_to_feature(report) + Feature.convert_to_parquet_format(report, self._modifications) + feature = Feature.transform_feature(report) et = time.time() - s logging.info("Time to generate psm and feature file {} seconds".format(et)) - if psm_pqwriter: - psm_pqwriter.close() - if feature_pqwriter: - feature_pqwriter.close() + yield feature + + def write_feature_to_file( + self, + qvalue_threshold: float, + mzml_info_folder: str, + output_path: str, + file_num: int = 50, + protein_file=None, + ): + protein_list = extract_protein_list(protein_file) if protein_file else None + protein_str = "|".join(protein_list) if protein_list else None + pqwriter = None + for feature in self.generate_feature(qvalue_threshold, mzml_info_folder, file_num, protein_str): + if not pqwriter: + pqwriter = pq.ParquetWriter(output_path, feature.schema) + pqwriter.write_table(feature) + if pqwriter: + pqwriter.close() self.destroy_duckdb_database() def destroy_duckdb_database(self): @@ -337,70 +290,23 @@ def destroy_duckdb_database(self): self._duckdb_name = None self._duckdb = None - def generate_feature_file( - self, - report, - s_data_frame, - f_table, - feature_pqwriter, - feature_output_path, - ): - sdrf = pd.read_csv( - self._sdrf_path, - sep="\t", - usecols=[ - "source name", - "comment[data file]", - "comment[technical replicate]", - ], + def merge_sdrf_to_feature(self, report): + sdrf = Feature.transform_sdrf(self._sdrf_path) + report = pd.merge( + report, + sdrf, + left_on=["reference_file_name"], + right_on=["reference"], + how="left", ) - samples = sdrf["source name"].unique() - mixed_map = dict(zip(samples, range(1, len(samples) + 1))) - sdrf["comment[data file]"] = sdrf["comment[data file]"].apply(lambda x: x.split(".")[0]) - sdrf = sdrf.set_index(["comment[data file]"]) - sdrf_map = sdrf.to_dict()["source name"] - tec_map = sdrf.to_dict()["comment[technical replicate]"] - report = report[report["intensity"] != 0] - report = report.merge( - ( - s_data_frame[["Sample", "MSstats_Condition", "MSstats_BioReplicate"]] - .merge(f_table[["Fraction", "Sample", "Run"]], on="Sample") - .rename( - columns={ - "MSstats_BioReplicate": "BioReplicate", - "MSstats_Condition": "Condition", - } - ) - .drop(columns=["Sample"]) - ), - on="Run", - validate="many_to_one", - ) - report.rename( - columns={ - "Condition": "condition", - "Fraction": "fraction", - "BioReplicate": "biological_replicate", - "Run": "run", - }, - inplace=True, - ) - report.loc[:, "channel"] = "LFQ" - report.loc[:, "sample_accession"] = report["reference_file_name"].map(sdrf_map) - report.loc[:, "comment[technical replicate]"] = report["reference_file_name"].map(tec_map) - report.loc[:, "run"] = report[["sample_accession", "comment[technical replicate]", "fraction"]].apply( - lambda row: str(mixed_map[row["sample_accession"]]) - + "_" - + str(row["comment[technical replicate]"]) - + "_" - + str(row["fraction"]), + report.drop( + [ + "reference", + "label", + ], axis=1, + inplace=True, ) - report.drop(["comment[technical replicate]"], axis=1, inplace=True) - Feature.convert_to_parquet_format(report, self._modifications) - parquet_table = Feature.transform_feature(report) - if not feature_pqwriter: - feature_pqwriter = pq.ParquetWriter(feature_output_path, parquet_table.schema) - feature_pqwriter.write_table(parquet_table) - - return feature_pqwriter + return report + + diff --git a/quantmsio/core/feature.py b/quantmsio/core/feature.py index baec25d..1676bc5 100644 --- a/quantmsio/core/feature.py +++ b/quantmsio/core/feature.py @@ -126,7 +126,7 @@ def merge_pep_msg(row): for key, df in psm.groupby(["opt_global_cv_MS:1000889_peptidoform_sequence", "precursor_charge"]): df = df.reset_index(drop=True) if key not in map_dict: - map_dict[key] = [None for i in range(10)] + map_dict[key] = [None for _ in range(10)] qvalue = None temp_df = None if len(df["best_qvalue"].unique()) > 1 or not pd.isna(df.loc[0, "best_qvalue"]): diff --git a/tests/examples/DIANN/PXD019909-DIA.sdrf_openms_design.tsv b/tests/examples/DIANN/PXD019909-DIA.sdrf_openms_design.tsv deleted file mode 100644 index 679b786..0000000 --- a/tests/examples/DIANN/PXD019909-DIA.sdrf_openms_design.tsv +++ /dev/null @@ -1,89 +0,0 @@ -Fraction_Group Fraction Spectra_Filepath Label Sample -1 1 20180914_QE8_nLC0_BDA_SA_DIA_Keratinocytes_NN002.mzML 1 1 -2 1 20180914_QE8_nLC0_BDA_SA_DIA_Keratinocytes_NN003.mzML 1 2 -3 1 20180914_QE8_nLC0_BDA_SA_DIA_Keratinocytes_NN004.mzML 1 3 -4 1 20180914_QE8_nLC0_BDA_SA_DIA_Keratinocytes_NN009.mzML 1 4 -5 1 20180914_QE8_nLC0_BDA_SA_DIA_Skin_Dendritic_cells_DC_MT_250000.mzML 1 5 -6 1 20180914_QE8_nLC0_BDA_SA_DIA_Skin_Dendritic_cells_DC_MT_500000.mzML 1 6 -7 1 20180914_QE8_nLC0_BDA_SA_DIA_Skin_Dendritic_cells_DC_MT_600000.mzML 1 7 -8 1 20180914_QE8_nLC0_BDA_SA_DIA_Skin_Dermal_T_cells_CD4_MT_200000_1.mzML 1 8 -9 1 20180914_QE8_nLC0_BDA_SA_DIA_Skin_Dermal_T_cells_CD4_MT_200000_2.mzML 1 9 -10 1 20180914_QE8_nLC0_BDA_SA_DIA_Skin_Dermal_T_cells_CD4_MT_350000_3.mzML 1 10 -11 1 20180914_QE8_nLC0_BDA_SA_DIA_Skin_Dermal_T_cells_CD8_MT_200000_1.mzML 1 11 -12 1 20180914_QE8_nLC0_BDA_SA_DIA_Skin_Dermal_T_cells_CD8_MT_200000_3.mzML 1 12 -13 1 20180914_QE8_nLC0_BDA_SA_DIA_Skin_Dermal_T_cells_CD8_MT_450000_2.mzML 1 13 -14 1 20180916_QE8_nLC0_BDA_SA_DIA_Epi_T_cells_donor_11.mzML 1 14 -15 1 20180916_QE8_nLC0_BDA_SA_DIA_Epi_T_cells_donor_12.mzML 1 15 -16 1 20180916_QE8_nLC0_BDA_SA_DIA_Epi_T_cells_donor_13.mzML 1 16 -17 1 20180916_QE8_nLC0_BDA_SA_DIA_Epi_T_cells_donor_14.mzML 1 17 -18 1 20180916_QE8_nLC0_BDA_SA_DIA_Epi_T_cells_donor_15.mzML 1 18 -19 1 20180916_QE8_nLC0_BDA_SA_DIA_Epi_T_cells_donor_16.mzML 1 19 -20 1 20180916_QE8_nLC0_BDA_SA_DIA_Skin_MastCells_350000_1.mzML 1 20 -21 1 20180916_QE8_nLC0_BDA_SA_DIA_Skin_MastCells_600000_2.mzML 1 21 -22 1 20180916_QE8_nLC0_BDA_SA_DIA_Skin_MastCells_50000_3.mzML 1 22 -23 1 20180916_QE8_nLC0_BDA_SA_DIA_Skin_MastCells_300000_4.mzML 1 23 -24 1 20180916_QE8_nLC0_BDA_SA_DIA_Skin_MastCells_1000000_5.mzML 1 24 -25 1 20180916_QE8_nLC0_BDA_SA_DIA_Skin_MastCells_300000_6.mzML 1 25 -26 1 20180920_QE8_nLC0_BDA_SA_DIA_Endothelial_cells_donor_10.mzML 1 26 -27 1 20180920_QE8_nLC0_BDA_SA_DIA_Endothelial_cells_donor_11.mzML 1 27 -28 1 20180920_QE8_nLC0_BDA_SA_DIA_Endothelial_cells_donor_8.mzML 1 28 -29 1 20180920_QE8_nLC0_BDA_SA_DIA_Endothelial_cells_donor_9.mzML 1 29 -30 1 20180920_QE8_nLC0_BDA_SA_DIA_Melanocytes_donor_11.mzML 1 30 -31 1 20180920_QE8_nLC0_BDA_SA_DIA_Melanocytes_donor_12.mzML 1 31 -32 1 20180920_QE8_nLC0_BDA_SA_DIA_Melanocytes_donor_13.mzML 1 32 -33 1 20180920_QE8_nLC0_BDA_SA_DIA_Melanocytes_donor_15.mzML 1 33 -34 1 20180920_QE8_nLC0_BDA_SA_DIA_Melanocytes_donor_16.mzML 1 34 -35 1 20181001_QE8_nLC0_BDA_SA_DIA_Macrophages_donor_12.mzML 1 35 -36 1 20181001_QE8_nLC0_BDA_SA_DIA_Macrophages_donor_13.mzML 1 36 -37 1 20181001_QE8_nLC0_BDA_SA_DIA_Macrophages_donor_15.mzML 1 37 -38 1 20181001_QE8_nLC0_BDA_SA_DIA_Marcrophages_donor_16.mzML 1 38 -39 1 20181012_QE8_nLC0_BDA_SA_DIA_P1_NN006_fibroblasts.mzML 1 39 -40 1 20181012_QE8_nLC0_BDA_SA_DIA_P1_NN007_fibroblasts.mzML 1 40 -41 1 20181012_QE8_nLC0_BDA_SA_DIA_P1_NN008_fibroblasts.mzML 1 41 -42 1 20181012_QE8_nLC0_BDA_SA_DIA_P1_NN010_fibroblasts.mzML 1 42 -43 1 20181012_QE8_nLC0_BDA_SA_DIA_P2_NN009_fibroblasts.mzML 1 43 - -Sample MSstats_Condition MSstats_BioReplicate -1 skin 1 -2 skin 2 -3 skin 3 -4 skin 4 -5 skin 5 -6 skin 6 -7 skin 7 -8 skin 8 -9 skin 9 -10 skin 10 -11 skin 11 -12 skin 12 -13 skin 13 -14 skin 14 -15 skin 15 -16 skin 16 -17 skin 17 -18 skin 18 -19 skin 19 -20 skin 20 -21 skin 21 -22 skin 22 -23 skin 23 -24 skin 24 -25 skin 25 -26 skin 26 -27 skin 27 -28 skin 28 -29 skin 29 -30 skin 30 -31 skin 31 -32 skin 32 -33 skin 33 -34 skin 34 -35 skin 35 -36 skin 36 -37 skin 37 -38 skin 38 -39 skin 39 -40 skin 40 -41 skin 41 -42 skin 42 -43 skin 43 diff --git a/tests/test_diann.py b/tests/test_diann.py index 6e5942c..60e8257 100644 --- a/tests/test_diann.py +++ b/tests/test_diann.py @@ -1,6 +1,7 @@ from .common import datafile from unittest import TestCase from quantmsio.core.diann import DiaNNConvert +from quantmsio.core.feature import Feature from ddt import data from ddt import ddt @@ -11,7 +12,6 @@ class TestFeatureHandler(TestCase): test_datas = [ ( "DIANN/diann_report.tsv", - "DIANN/PXD019909-DIA.sdrf_openms_design.tsv", "DIANN/PXD019909-DIA.sdrf.tsv", "DIANN/mzml", ), @@ -20,9 +20,10 @@ class TestFeatureHandler(TestCase): @data(*test_datas) def test_transform_msstats(self, test_data): report_file = datafile(test_data[0]) - # design_file = datafile(test_data[1]) - sdrf_file = datafile(test_data[2]) - mzml = datafile(test_data[3]) + sdrf_file = datafile(test_data[1]) + mzml = datafile(test_data[2]) D = DiaNNConvert(report_file, sdrf_file) - for _ in D.main_report_df(0.05, mzml, 2): - print("ok") + for report in D.main_report_df(0.05, mzml, 2): + report = D.merge_sdrf_to_feature(report) + Feature.convert_to_parquet_format(report, D._modifications) + Feature.transform_feature(report)