Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
ypriverol committed Oct 21, 2024
2 parents 2f75b92 + 656d3e7 commit c865bff
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 266 deletions.
2 changes: 1 addition & 1 deletion docs/README.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -1152,7 +1152,7 @@ The following table presents the fields needed to describe each feature in quant
| `pg_accessions`
| Protein group accession. Could be one single protein or multiple protein accessions, depending on the tool.
| array[string], null
| Protein.Ids
| Protein.Group
| x
| Proteins
| accession
Expand Down
24 changes: 10 additions & 14 deletions quantmsio/commands/diann_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,6 @@
help="the diann report file path",
required=True,
)
@click.option(
"--design_file",
help="the design file path",
required=True,
)
@click.option("--qvalue_threshold", help="qvalue_threshold", required=True, default=0.05)
@click.option(
"--mzml_info_folder",
Expand All @@ -34,6 +29,11 @@
help="Folder where the Json file will be generated",
required=True,
)
@click.option(
"--protein_file",
help="Protein file that meets specific requirements",
required=False,
)
@click.option(
"--output_prefix_file",
help="Prefix of the Json file needed to generate the file name",
Expand All @@ -50,19 +50,18 @@
)
def diann_convert_to_parquet(
report_path: str,
design_file: str,
qvalue_threshold: float,
mzml_info_folder: str,
sdrf_path: str,
output_folder: str,
protein_file: str,
output_prefix_file: str,
duckdb_max_memory: str,
duckdb_threads: int,
file_num: int,
):
"""
report_path: diann report file path
design_file: the design file path
qvalue_threshold: qvalue threshold
mzml_info_folder: mzml info file folder
sdrf_path: sdrf file path
Expand All @@ -74,7 +73,6 @@ def diann_convert_to_parquet(
"""
if (
report_path is None
or design_file is None
or mzml_info_folder is None
or output_folder is None
or sdrf_path is None
Expand All @@ -88,7 +86,6 @@ def diann_convert_to_parquet(
output_prefix_file = ""

feature_output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".feature.parquet")
psm_output_path = output_folder + "/" + create_uuid_filename(output_prefix_file, ".psm.parquet")

dia_nn = DiaNNConvert(
diann_report=report_path,
Expand All @@ -97,11 +94,10 @@ def diann_convert_to_parquet(
duckdb_threads=duckdb_threads,
)

dia_nn.generate_psm_and_feature_file(
dia_nn.write_feature_to_file(
qvalue_threshold=qvalue_threshold,
mzml_info_folder=mzml_info_folder,
design_file=design_file,
psm_output_path=psm_output_path,
feature_output_path=feature_output_path,
file_num=file_num,
output_path = feature_output_path,
file_num = file_num,
protein_file=protein_file,
)
9 changes: 5 additions & 4 deletions quantmsio/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,20 +38,21 @@

DIANN_MAP = {
"File.Name": "reference_file_name",
"Precursor.Normalised": "intensity",
"Precursor.Quantity": "intensity",
"RT.Start": "rt_start",
"RT.Stop": "rt_stop",
"RT": "rt",
"Predicted.RT": "predicted_rt",
"Protein.Ids": "pg_accessions",
"Protein.Group": "pg_accessions",
"Protein.Ids": "mp_accessions",
"PEP": "posterior_error_probability",
"Global.Q.Value": "global_qvalue",
"Global.PG.Q.Value": "protein_global_qvalue",
"Global.PG.Q.Value": "pg_global_qvalue",
"Precursor.Charge": "precursor_charge",
"Stripped.Sequence": "sequence",
"Modified.Sequence": "peptidoform",
"Genes": "gg_names",
"opt_global_spectrum_reference": "scan_number",
"opt_global_spectrum_reference": "scan",
"Calculate.Precursor.Mz": "calculated_mz",
"exp_mass_to_charge": "observed_mz",
}
Expand Down
208 changes: 57 additions & 151 deletions quantmsio/core/diann.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,55 +11,14 @@
from pyopenms import AASequence
from pyopenms.Constants import PROTON_MASS_U
from quantmsio.core.project import create_uuid_filename
from quantmsio.utils.file_utils import extract_protein_list
from quantmsio.core.sdrf import SDRFHandler
from quantmsio.core.psm import Psm
from quantmsio.core.feature import Feature
from quantmsio.utils.pride_utils import get_peptidoform_proforma_version_in_mztab, generate_scan_number
from quantmsio.core.common import DIANN_MAP, QUANTMSIO_VERSION
from quantmsio.core.common import DIANN_MAP

MODIFICATION_PATTERN = re.compile(r"\((.*?)\)")


def get_exp_design_dfs(exp_design_file):
    """Parse an OpenMS-style experimental design file into two DataFrames.

    The file holds two tab-separated tables separated by one blank line:
    the file (run) table first, then the sample table.

    :param exp_design_file: path to the experimental design file
    :return: tuple ``(s_data_frame, f_table)`` — the sample table and the
        file table (with ``Fraction_Group`` renamed to ``ms_run`` and a
        ``Run`` column holding the stem of each spectra file path)
    :raises ValueError: if the file contains no blank separator line
    """
    with open(exp_design_file, "r") as f:
        data = f.readlines()
    # Locate the blank line separating the two tables. Using strip() instead
    # of data.index("\n") also matches "\r\n" blank lines (Windows files),
    # which the exact-match lookup would miss.
    try:
        empty_row = next(i for i, line in enumerate(data) if not line.strip())
    except StopIteration:
        raise ValueError("experimental design file has no blank separator line") from None
    # rstrip("\r\n") removes the line terminator on both Unix and Windows
    # files; plain replace("\n", "") would leave a trailing "\r" in the
    # last column of every row.
    f_table = [line.rstrip("\r\n").split("\t") for line in data[1:empty_row]]
    f_header = data[0].rstrip("\r\n").split("\t")
    f_table = pd.DataFrame(f_table, columns=f_header)
    f_table.loc[:, "run"] = f_table.apply(lambda x: _true_stem(x["Spectra_Filepath"]), axis=1)
    f_table.rename(columns={"Fraction_Group": "ms_run", "run": "Run"}, inplace=True)
    # The sample table starts after the blank line; its first line is the
    # header, hence the [1:] on the data rows.
    s_table = [line.rstrip("\r\n").split("\t") for line in data[empty_row + 1 :]][1:]
    s_header = data[empty_row + 1].rstrip("\r\n").split("\t")
    s_data_frame = pd.DataFrame(s_table, columns=s_header)

    return s_data_frame, f_table


def _true_stem(x):
"""
Return the true stem of a file name, i.e. the
file name without the extension.
:param x: The file name
:type x: str
:return: The true stem of the file name
:rtype: str
Examples:
>>> _true_stem("foo.mzML")
'foo'
>>> _true_stem("foo.d.tar")
'foo'
These examples can be tested with pytest:
$ pytest -v --doctest-modules
"""
stem = os.path.splitext(os.path.basename(x))[0]

return stem


def find_modification(peptide):
"""
Identify the modification site based on the peptide containing modifications.
Expand Down Expand Up @@ -137,8 +96,8 @@ def get_report_from_database(self, runs: list) -> pd.DataFrame:
s = time.time()
database = self._duckdb.query(
"""
select "File.Name", "Run", "Protein.Ids", "Genes", "RT", "Predicted.RT", "RT.Start", "RT.Stop", "Precursor.Id", "Q.Value", "Global.Q.Value", "PEP",
"Global.PG.Q.Value", "Modified.Sequence", "Stripped.Sequence", "Precursor.Charge", "Precursor.Normalised"
select "File.Name", "Run", "Protein.Group", "Protein.Ids", "Genes", "RT", "Predicted.RT", "RT.Start", "RT.Stop", "Precursor.Id", "Q.Value", "Global.Q.Value", "PEP",
"PG.Q.Value", "Global.PG.Q.Value", "Modified.Sequence", "Stripped.Sequence", "Precursor.Charge", "Precursor.Quantity", "Precursor.Normalised"
from diann_report
where Run IN {}
""".format(
Expand Down Expand Up @@ -189,6 +148,7 @@ def main_report_df(
qvalue_threshold: float,
mzml_info_folder: str,
file_num: int,
protein_str: str = None
):
def intergrate_msg(n):
nonlocal report
Expand Down Expand Up @@ -227,6 +187,8 @@ def intergrate_msg(n):
info_list = [info_list[i : i + file_num] for i in range(0, len(info_list), file_num)]
for refs in info_list:
report = self.get_report_from_database(refs)
if protein_str:
report = report[report["Protein.Ids"].str.contains(f"{protein_str}", na=False)]
# restrict
report = report[report["Q.Value"] < qvalue_threshold]
usecols = report.columns.to_list() + [
Expand All @@ -246,8 +208,8 @@ def intergrate_msg(n):
report["modifications"] = report["Modified.Sequence"].apply(find_modification)
report["Modified.Sequence"] = report["Modified.Sequence"].map(modifications_map)
# pep
report["psm_reference_file_name"] = report["Precursor.Id"].map(best_ref_map)
report["psm_scan_number"] = None
report["scan_reference_file_name"] = report["Precursor.Id"].map(best_ref_map)
#report["scan"] = None
report.rename(columns=DIANN_MAP, inplace=True)
# add extra msg
report = self.add_additional_msg(report)
Expand All @@ -262,69 +224,60 @@ def add_additional_msg(self, report: pd.DataFrame) -> pd.DataFrame:
report["reference_file_name"] = report["reference_file_name"].apply(lambda x: x.split(".")[0])

report.loc[:, "is_decoy"] = "0"
report.loc[:, "channel"] = "LFQ"
report.loc[:, "unique"] = report["pg_accessions"].apply(lambda x: "0" if ";" in str(x) else "1")

report.loc[:, "pg_positions"] = None

report["peptidoform"] = report[["sequence", "modifications"]].apply(
lambda row: get_peptidoform_proforma_version_in_mztab(
row["sequence"], row["modifications"], self._modifications
),
axis=1,
)
report["scan_number"] = report["scan_number"].apply(generate_scan_number)
report["scan"] = report["scan"].apply(generate_scan_number)
report.loc[:, "gg_names"] = report["gg_names"].str.split(",")
report.loc[:, "additional_scores"] = None
report.loc[:, "additional_intensities"] = report["Precursor.Normalised"].apply(lambda v: [{"name": "normalized intensity", "value": np.float32(v)}])
report.loc[:, "additional_scores"] = report[["Q.Value","PG.Q.Value"]].apply(lambda row: [{"name": "qvalue", "value": row["Q.Value"]}, {"name": "pg_qvalue", "value": row["PG.Q.Value"]}],axis=1)
report.loc[:, "modification_details"] = None
report.loc[:, "cv_params"] = None
report.loc[:, "quantmsio_version"] = QUANTMSIO_VERSION
report.loc[:, "gg_accessions"] = None

report.loc[:, "best_id_score"] = None
return report

def generate_psm_file(self, report, psm_pqwriter, psm_output_path):
psm = report.copy()
psm.loc[:, "intensity_array"] = None
psm.loc[:, "ion_mobility"] = None
psm.loc[:, "mz_array"] = None
psm.loc[:, "num_peaks"] = None
psm.loc[:, "rank"] = None
parquet_table = Psm.convert_to_parquet(psm, self._modifications)
if not psm_pqwriter:
psm_pqwriter = pq.ParquetWriter(psm_output_path, parquet_table.schema)
psm_pqwriter.write_table(parquet_table)
return psm_pqwriter

def generate_psm_and_feature_file(

def generate_feature(
self,
qvalue_threshold: float,
mzml_info_folder: str,
design_file: str,
psm_output_path: str,
feature_output_path: str,
file_num: int = 50,
protein_str: str = None
):
psm_pqwriter = None
feature_pqwriter = None

s_data_frame, f_table = get_exp_design_dfs(design_file)
for report in self.main_report_df(qvalue_threshold, mzml_info_folder, file_num):
for report in self.main_report_df(qvalue_threshold, mzml_info_folder, file_num, protein_str):
s = time.time()
psm_pqwriter = self.generate_psm_file(report, psm_pqwriter, psm_output_path)
feature_pqwriter = self.generate_feature_file(
report,
s_data_frame,
f_table,
feature_pqwriter,
feature_output_path,
)
report = self.merge_sdrf_to_feature(report)
Feature.convert_to_parquet_format(report, self._modifications)
feature = Feature.transform_feature(report)
et = time.time() - s
logging.info("Time to generate psm and feature file {} seconds".format(et))
if psm_pqwriter:
psm_pqwriter.close()
if feature_pqwriter:
feature_pqwriter.close()
yield feature


def write_feature_to_file(
    self,
    qvalue_threshold: float,
    mzml_info_folder: str,
    output_path: str,
    file_num: int = 50,
    protein_file=None,
):
    """Stream DIA-NN feature tables into a single parquet file.

    :param qvalue_threshold: q-value cutoff applied to the report rows
    :param mzml_info_folder: folder containing the mzML info files
    :param output_path: destination path of the feature parquet file
    :param file_num: number of report files processed per batch
    :param protein_file: optional file restricting output to listed proteins
    """
    # Build a regex-alternation filter string from the protein file, if any.
    protein_str = None
    if protein_file:
        protein_list = extract_protein_list(protein_file)
        if protein_list:
            protein_str = "|".join(protein_list)

    # Lazily open the writer on the first batch so the schema comes from
    # the data itself; write every subsequent batch to the same file.
    writer = None
    for table in self.generate_feature(qvalue_threshold, mzml_info_folder, file_num, protein_str):
        if writer is None:
            writer = pq.ParquetWriter(output_path, table.schema)
        writer.write_table(table)
    if writer is not None:
        writer.close()
    # Release the temporary duckdb database backing the report queries.
    self.destroy_duckdb_database()

def destroy_duckdb_database(self):
Expand All @@ -337,70 +290,23 @@ def destroy_duckdb_database(self):
self._duckdb_name = None
self._duckdb = None

def generate_feature_file(
self,
report,
s_data_frame,
f_table,
feature_pqwriter,
feature_output_path,
):
sdrf = pd.read_csv(
self._sdrf_path,
sep="\t",
usecols=[
"source name",
"comment[data file]",
"comment[technical replicate]",
],
def merge_sdrf_to_feature(self, report):
sdrf = Feature.transform_sdrf(self._sdrf_path)
report = pd.merge(
report,
sdrf,
left_on=["reference_file_name"],
right_on=["reference"],
how="left",
)
samples = sdrf["source name"].unique()
mixed_map = dict(zip(samples, range(1, len(samples) + 1)))
sdrf["comment[data file]"] = sdrf["comment[data file]"].apply(lambda x: x.split(".")[0])
sdrf = sdrf.set_index(["comment[data file]"])
sdrf_map = sdrf.to_dict()["source name"]
tec_map = sdrf.to_dict()["comment[technical replicate]"]
report = report[report["intensity"] != 0]
report = report.merge(
(
s_data_frame[["Sample", "MSstats_Condition", "MSstats_BioReplicate"]]
.merge(f_table[["Fraction", "Sample", "Run"]], on="Sample")
.rename(
columns={
"MSstats_BioReplicate": "BioReplicate",
"MSstats_Condition": "Condition",
}
)
.drop(columns=["Sample"])
),
on="Run",
validate="many_to_one",
)
report.rename(
columns={
"Condition": "condition",
"Fraction": "fraction",
"BioReplicate": "biological_replicate",
"Run": "run",
},
inplace=True,
)
report.loc[:, "channel"] = "LFQ"
report.loc[:, "sample_accession"] = report["reference_file_name"].map(sdrf_map)
report.loc[:, "comment[technical replicate]"] = report["reference_file_name"].map(tec_map)
report.loc[:, "run"] = report[["sample_accession", "comment[technical replicate]", "fraction"]].apply(
lambda row: str(mixed_map[row["sample_accession"]])
+ "_"
+ str(row["comment[technical replicate]"])
+ "_"
+ str(row["fraction"]),
report.drop(
[
"reference",
"label",
],
axis=1,
inplace=True,
)
report.drop(["comment[technical replicate]"], axis=1, inplace=True)
Feature.convert_to_parquet_format(report, self._modifications)
parquet_table = Feature.transform_feature(report)
if not feature_pqwriter:
feature_pqwriter = pq.ParquetWriter(feature_output_path, parquet_table.schema)
feature_pqwriter.write_table(parquet_table)

return feature_pqwriter
return report


Loading

0 comments on commit c865bff

Please sign in to comment.