28 changes: 24 additions & 4 deletions docs/file_formats.rst
@@ -98,7 +98,7 @@ Precursors Features Parquet Schema
.. raw:: html

<details>
<summary>Click to expand the Precursors Features Parquet Schema</summary>
    <summary>Click to expand the Precursors Features Parquet Schema. Note that not all columns listed here appear in every Parquet file.</summary>
<pre>
Schema([
('PROTEIN_ID', Int64),
@@ -200,8 +200,28 @@ Precursors Features Parquet Schema
('SCORE_PEPTIDE_RUN_SPECIFIC_SCORE', Float64),
('SCORE_PEPTIDE_RUN_SPECIFIC_P_VALUE', Float64),
('SCORE_PEPTIDE_RUN_SPECIFIC_Q_VALUE', Float64),
('SCORE_PEPTIDE_RUN_SPECIFIC_PEP', Float64)
])
('SCORE_PEPTIDE_RUN_SPECIFIC_PEP', Float64),
('SCORE_PEPTIDE_EXPERIMENT_WIDE_SCORE', Float64),
('SCORE_PEPTIDE_EXPERIMENT_WIDE_P_VALUE', Float64),
('SCORE_PEPTIDE_EXPERIMENT_WIDE_Q_VALUE', Float64),
('SCORE_PEPTIDE_EXPERIMENT_WIDE_PEP', Float64),
('SCORE_PEPTIDE_GLOBAL_SCORE', Float64),
('SCORE_PEPTIDE_GLOBAL_P_VALUE', Float64),
('SCORE_PEPTIDE_GLOBAL_Q_VALUE', Float64),
('SCORE_PEPTIDE_GLOBAL_PEP', Float64),
('SCORE_PROTEIN_RUN_SPECIFIC_SCORE', Float64),
('SCORE_PROTEIN_RUN_SPECIFIC_P_VALUE', Float64),
('SCORE_PROTEIN_RUN_SPECIFIC_Q_VALUE', Float64),
('SCORE_PROTEIN_RUN_SPECIFIC_PEP', Float64),
('SCORE_PROTEIN_EXPERIMENT_WIDE_SCORE', Float64),
('SCORE_PROTEIN_EXPERIMENT_WIDE_P_VALUE', Float64),
('SCORE_PROTEIN_EXPERIMENT_WIDE_Q_VALUE', Float64),
('SCORE_PROTEIN_EXPERIMENT_WIDE_PEP', Float64),
('SCORE_PROTEIN_GLOBAL_SCORE', Float64),
('SCORE_PROTEIN_GLOBAL_P_VALUE', Float64),
('SCORE_PROTEIN_GLOBAL_Q_VALUE', Float64),
('SCORE_PROTEIN_GLOBAL_PEP', Float64)])

</pre>
</details>
<br>
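Since exports can contain only a subset of these columns, one way to check what a particular file holds is to inspect its Parquet schema directly. A minimal sketch using polars (which the Schema([...]) notation above suggests; the file name is a placeholder):

import polars as pl

# List which SCORE_* columns this particular export actually contains.
schema = pl.scan_parquet("precursors_features.parquet").collect_schema()
print([name for name in schema.names() if name.startswith("SCORE_")])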
@@ -213,7 +233,7 @@ Transition Features Parquet Schema
.. raw:: html

<details>
<summary>Click to expand the Transition Feature Parquet Schema</summary>
    <summary>Click to expand the Transition Features Parquet Schema. Note that not all columns listed here appear in every Parquet file.</summary>
<pre>
Schema([
('RUN_ID', Int64),
148 changes: 139 additions & 9 deletions pyprophet/io/export/osw.py
@@ -35,12 +35,8 @@ def read(self) -> pd.DataFrame:
"""
Read data from the OpenSWATH workflow OSW-sqlite based file.
"""
if False:
con = duckdb.connect(self.infile)
return self._read_duckdb(con)
else:
con = sqlite3.connect(self.infile)
return self._read_sqlite(con)
con = sqlite3.connect(self.infile)
return self._read_sqlite(con)

def _create_indexes(self):
"""
@@ -744,6 +740,13 @@ def _prepare_column_info(self, conn) -> dict:
"score_transition_exists": {"SCORE_TRANSITION"}.issubset(table_names),
}

if column_info["score_protein_exists"]:
logger.debug("Checking SCORE_PROTEIN contexts")
column_info['score_protein_contexts'] = self._check_contexts(sql_conn, "SCORE_PROTEIN")
if column_info["score_peptide_exists"]:
logger.debug("Checking SCORE_PEPTIDE contexts")
column_info['score_peptide_contexts'] = self._check_contexts(sql_conn, "SCORE_PEPTIDE")
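# Illustrative shape of the added keys (assuming a file scored at both
# levels; the context values depend on how pyprophet was run):
#   column_info["score_peptide_contexts"] -> ["run-specific", "experiment-wide", "global"]
#   column_info["score_protein_contexts"] -> ["global"]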

return column_info

def _export_split_by_run(self, conn, column_info: dict) -> None:
@@ -864,7 +867,7 @@ def _build_precursor_query(self, conn, column_info: dict) -> str:
)

# Check if score tables exist and build score SQLs
score_cols_selct, score_table_joins = (
score_cols_selct, score_table_joins, score_column_views = (
self._build_score_column_selection_and_joins(column_info)
)

@@ -878,7 +881,7 @@ def _build_precursor_query(self, conn, column_info: dict) -> str:
unimod_to_codename
)

# Create the merged mapping as you did in your example
# Create the merged mapping
unimod_mask = peptide_df["MODIFIED_SEQUENCE"].str.contains("UniMod")
merged_df = pd.merge(
peptide_df[unimod_mask][["codename", "ID"]],
@@ -935,6 +938,7 @@ def _build_precursor_query(self, conn, column_info: dict) -> str:
-- JOIN ipf_groups g USING (NORMALIZED_SEQUENCE)
--)

{score_column_views}
SELECT
PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID AS PROTEIN_ID,
PEPTIDE.ID AS PEPTIDE_ID,
@@ -1361,12 +1365,107 @@ def _build_gene_joins(self, column_info: dict) -> str:
"""
return ""

def _check_contexts(self, conn, score_table) -> list:
"""Return the list of scoring contexts present in the given score table."""
contexts_query = f"SELECT DISTINCT context FROM {score_table}"
result = conn.execute(contexts_query).fetchall()
logger.debug(f"Result of contexts query: {result}")
return [row[0] for row in result]
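# Illustrative usage (values depend on how pyprophet was run):
#   self._check_contexts(sql_conn, "SCORE_PEPTIDE")
# typically returns something like ['run-specific', 'experiment-wide', 'global'];
# order is not guaranteed, since it comes from SELECT DISTINCT.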

def _get_peptide_protein_score_table(self, level, contexts: list) -> str:
"""Build a DuckDB CTE definition ("<view_name> AS (<query>)") that pivots peptide/protein score data into wide per-context columns."""
if level == "peptide":
id_col = "PEPTIDE_ID"
score_table = "SCORE_PEPTIDE"
else: # level == 'protein'
id_col = "PROTEIN_ID"
score_table = "SCORE_PROTEIN"

view_name = f"{score_table.lower()}_view"

# Build pivot columns for available contexts
pivot_cols = []
for context in contexts:
# Skip 'global' context as it will be handled separately
if context != 'global':
# Convert context to column-safe format: 'run-specific' -> 'RUN_SPECIFIC'
safe_context = context.upper().replace("-", "_")
pivot_cols.extend([
f"ANY_VALUE(CASE WHEN context = '{context}' THEN SCORE END) as {score_table}_{safe_context}_SCORE",
f"ANY_VALUE(CASE WHEN context = '{context}' THEN PVALUE END) as {score_table}_{safe_context}_PVALUE",
f"ANY_VALUE(CASE WHEN context = '{context}' THEN QVALUE END) as {score_table}_{safe_context}_QVALUE",
f"ANY_VALUE(CASE WHEN context = '{context}' THEN PEP END) as {score_table}_{safe_context}_PEP"
])

pivot_cols_str = ", ".join(pivot_cols)

# Non-global contexts pivot query
if pivot_cols_str: # Only if there are non-global contexts
non_global_query = f"""
SELECT {id_col}, RUN_ID, {pivot_cols_str}
FROM sqlite_scan('{self.config.infile}', '{score_table}')
WHERE context != 'global'
GROUP BY {id_col}, RUN_ID
"""
else:
# If no non-global contexts, create empty result with same structure
non_global_query = f"""
SELECT {id_col}, RUN_ID
FROM sqlite_scan('{self.config.infile}', '{score_table}')
WHERE 1=0
"""

global_exists = 'global' in contexts
# Global data query (only if global context exists)
if global_exists:
glob_query = f"""
SELECT {id_col},
SCORE as {score_table}_GLOBAL_SCORE,
PVALUE as {score_table}_GLOBAL_PVALUE,
QVALUE as {score_table}_GLOBAL_QVALUE,
PEP as {score_table}_GLOBAL_PEP
FROM sqlite_scan('{self.config.infile}', '{score_table}')
WHERE context = 'global'
"""

# Build final merged query based on what exists
if pivot_cols_str and global_exists:
# Both non-global and global contexts exist
merged_query = f"""
SELECT ng.*,
g.{score_table}_GLOBAL_SCORE,
g.{score_table}_GLOBAL_PVALUE,
g.{score_table}_GLOBAL_QVALUE,
g.{score_table}_GLOBAL_PEP
FROM ({non_global_query}) ng
LEFT JOIN ({glob_query}) g ON ng.{id_col} = g.{id_col}
"""
elif pivot_cols_str and not global_exists:
# Only non-global contexts exist
merged_query = non_global_query
elif not pivot_cols_str and global_exists:
# Only global context exists
merged_query = glob_query
else:
# No contexts exist - return empty result with ID columns
merged_query = f"""
SELECT {id_col}, RUN_ID
FROM sqlite_scan('{self.config.infile}', '{score_table}')
WHERE 1=0
"""

# Return the CTE definition; it is placed in the WITH clause of the final query
return f"{view_name} AS ({merged_query})"


def _build_score_column_selection_and_joins(
self, column_info: dict
) -> Tuple[str, str]:
) -> Tuple[str, str, str]:
"""Build score column selection and joins based on available score tables"""
score_columns_to_select = []
score_tables_to_join = []
score_views = []

if column_info["score_ms1_exists"]:
logger.debug("SCORE_MS1 table exists, adding score columns to selection")
score_columns_to_select.append(
@@ -1385,9 +1484,40 @@ def _build_score_column_selection_and_joins(
f"INNER JOIN sqlite_scan('{self.config.infile}', 'SCORE_MS2') AS SCORE_MS2 ON FEATURE.ID = SCORE_MS2.FEATURE_ID"
)

# Create views for peptide and protein score tables if they exist
if column_info["score_peptide_exists"]:
logger.debug("SCORE_PEPTIDE table exists, adding score table view to query")
score_views.append(self._get_peptide_protein_score_table("peptide", column_info["score_peptide_contexts"]))
# Add JOIN for peptide score view
score_tables_to_join.append("INNER JOIN score_peptide_view ON PEPTIDE.ID = score_peptide_view.PEPTIDE_ID AND FEATURE.RUN_ID = score_peptide_view.RUN_ID")
if column_info["score_protein_exists"]:
logger.debug("SCORE_PROTEIN table exists, adding score table view to query")
score_views.append(self._get_peptide_protein_score_table("protein", column_info["score_protein_contexts"]))
# Add JOIN for protein score view
score_tables_to_join.append("INNER JOIN score_protein_view ON PEPTIDE_PROTEIN_MAPPING.PROTEIN_ID = score_protein_view.PROTEIN_ID AND FEATURE.RUN_ID = score_protein_view.RUN_ID")

# Add score columns for peptide and protein contexts
for table in ["peptide", "protein"]:
if column_info[f"score_{table}_exists"]:
logger.debug(f"SCORE_{table.upper()} table exists, adding score columns to selection")
for context in column_info[f"score_{table}_contexts"]:
safe_context = context.upper().replace("-", "_")
score_columns_to_select.append(
f"score_{table.lower()}_view.SCORE_{table.upper()}_{safe_context}_SCORE AS SCORE_{table.upper()}_{safe_context}_SCORE, "
f"score_{table.lower()}_view.SCORE_{table.upper()}_{safe_context}_PVALUE AS SCORE_{table.upper()}_{safe_context}_P_VALUE, "
f"score_{table.lower()}_view.SCORE_{table.upper()}_{safe_context}_QVALUE AS SCORE_{table.upper()}_{safe_context}_Q_VALUE, "
f"score_{table.lower()}_view.SCORE_{table.upper()}_{safe_context}_PEP AS SCORE_{table.upper()}_{safe_context}_PEP"
)

# Assemble the WITH clause from the collected CTE definitions
with_clause = ""
if score_views:
with_clause = "WITH " + ", ".join(score_views)

return (
", ".join(score_columns_to_select),
" ".join(score_tables_to_join),
with_clause
)
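# Illustrative assembly: the three returned strings are interpolated into the
# query built by _build_precursor_query roughly as
#
#   WITH score_peptide_view AS (...), score_protein_view AS (...)
#   SELECT <base columns>, <score_cols_selct>
#   FROM PRECURSOR ...
#   <score_table_joins>
#
# so each feature row gains the pivoted per-context score columns.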

def _execute_copy_query(self, conn, query: str, path: str) -> None:
1 change: 0 additions & 1 deletion pyprophet/io/util.py
@@ -385,7 +385,6 @@ def group_by_run(files):
def unimod_to_codename(seq):
"""
Convert a sequence with unimod modifications to a codename.
This is a placeholder function; actual implementation may vary.
"""
seq_poms = poms.AASequence.fromString(seq)
codename = seq_poms.toString()
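# Illustrative (assuming pyopenms' usual normalization of UniMod accessions
# to modification names): an input like "PEPT(UniMod:21)IDE" comes back from
# toString() as "PEPT(Phospho)IDE".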