Merge branch 'dev' of https://github.com/bigbio/quantms.io into dev

bigbio · Oct 15, 2024 · 3edeee6 · 3edeee6
2 parents 5689ccd + 5a7503c
commit 3edeee6
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 46 deletions.
diff --git a/docs/README.adoc b/docs/README.adoc
@@ -733,23 +733,15 @@ The following table presents all the fields and attributes for each PSM entry in
 | -
 | -
 
-7+^| Protein fields shared by <<feature>> <<psm>>
-| `pg_accessions`
-| Protein group accessions of all the proteins that the peptide maps to
-| array[string], null
-| Protein.Ids
-| x
-| Proteins
-| accession
-
-| `pg_positions`
-| Protein start and end positions written as start_post: end_post
-| array[string], null
+| `cv_params`
+| Optional list of CV parameters for additional metadata <<psm-cv-params>>
+| array[struct{name:string, value:string}], null
+| -
+| -
+| -
 | -
-| x
-| x
-| Combination of start and end positions
 
+7+^| Protein fields shared by <<feature>> <<psm>>
 | `unique`
 | Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0
 | int32, null
@@ -758,29 +750,21 @@ The following table presents all the fields and attributes for each PSM entry in
 | Unique
 | unique
 
-| `protein_global_qvalue`
+| `pg_global_qvalue`
 | Global q-value of the protein group at the experiment level
 | float32, null
 | Global.PG.Q.Value
 | x
 | x
 | best_search_engine_score
 
-| `gg_accessions`
-| Gene group accessions, as a string array
+| `mp_accessions`
+| Protein accessions of all the proteins that the peptide maps to
 | array[string], null
+| Protein.Ids
 | x
-| x
-| x
-| -
-
-| `gg_names`
-| Gene names, as a string array
-| array[string], null
-| -
-| x
-| -
-| -
+| Proteins
+| accession
 
 7+^| Spectra fields shared by <<feature>> <<psm>>
 | `precursor_charge`
@@ -869,11 +853,23 @@ The following table presents all the fields and attributes for each PSM entry in
 ====
  - Psm view is NOT RECOMMENDED to be generated for **DIA** methods because it will be duplicated information with the feature view. The psm view is more suitable for **DDA** methods where the psm is the main output of the identification process.
 
- - Protein inference SHOULD NOT be included in the psm view, as it is not the main purpose of the psm view. Then, is RECOMMENDED that all proteins mapping to the peptide in the psm view are included in the `pg_accessions`, `pg_positions`, `gg_accessions` and `gg_names` fields. For protein inference please look into the feature view (<<feature>>) and protein group (<<pg>>).
+ - Protein inference SHOULD NOT be included in the psm view, as it is not the main purpose of the psm view. However, for some use cases such as peptide filtering or search, it may be useful to have access to all the psms for a given protein accession; in that case, you can include all the proteins that the peptide maps to in the `mp_accessions` (mapped protein accessions) field. For protein inference please look into the feature view (<<feature>>) and protein group (<<pg>>).
 
  - The `mz_array` and `intensity_array` are arrays of the same length, where the `mz_array` contains the m/z values and the `intensity_array` contains the intensity values; and the size of the arrays is the same as the number of peaks in the spectrum. These three columns could help use cases like AI/ML that need the spectrum information for a given psm. We RECOMMEND using for spectra data the mz view (<<mz>>), where the spectra are stored in a more efficient way.
 ====
 
+[[psm-cv-params]]
+==== Psm CV parameters
+
+CV params are a list of key-value pairs that allows storing additional information for a given psm. For example, they could be used to store the following mzIdentML information:
+
+- 'prot:FDR threshold': 0.01
+- number of unmatched peaks: 3
+
+The `cv_params` are stored as a list of key-value pairs, where the key is the name of the parameter, and the value is the value of the parameter. This is similar to the CVParams in the mzIdentML format. Please be aware that search engine scores for psms should be stored in the `additional_scores` column.
+
+[[peptidoform]]
+
 [[psm-file-metadata]]
 ==== Psm file metadata
 

diff --git a/docs/psm.avsc b/docs/psm.avsc
@@ -64,21 +64,46 @@
               "doc": "List of identification scores subsidiary to the best score"
             },
             {"name": "rank", "type": ["null", "int"], "doc": "Rank of the peptide spectrum match in the search engine output"},
+              {
+              "name": "cv_params",
+              "type": [
+                "null",
+                {
+                  "type": "array",
+                  "items": {
+                    "type": "record",
+                    "name": "cv_param",
+                    "doc": "Controlled vocabulary (CV) parameters providing additional metadata for the scan.",
+                    "fields": [
+                      {
+                        "name": "name",
+                        "type": "string",
+                        "doc": "Name of the CV term (e.g., from PSI-MS or other ontologies)."
+                      },
+                      {
+                        "name": "value",
+                        "type": "string",
+                        "doc": "Value associated with the CV term."
+                      }
+                    ]
+                  }
+                }
+              ],
+              "default": null,
+              "doc": "Optional list of CV parameters for additional metadata."
+            },
 
-            {"name": "pg_accessions", "type": ["null",{"type": "array","items": "string"}], "doc": "Protein group accessions of all the proteins that the peptide maps to"},
-            {"name": "pg_positions", "type": ["null",{"type": "array","items": "string"}], "doc": "Protein start and end positions written as start_post:end_post"},
             {"name": "unique", "type": ["null", "int"], "doc": "Unique peptide indicator, if the peptide maps to a single protein, the value is 1, otherwise 0"},
-            {"name": "protein_global_qvalue", "type": ["null", "float32"], "doc": "Global q-value of the protein group at the experiment level"},
-            {"name": "gg_accessions", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene accessions, as string array"},
-            {"name": "gg_names", "type": ["null", {"type": "array", "items": "string"}], "doc": "Gene names, as string array"},
+            {"name": "pg_global_qvalue", "type": ["null", "float32"], "doc": "Global q-value of the protein group at the experiment level"},
+            {"name": "mp_accessions", "type": ["null", {"type": "array","items": "string"}], "doc": "List of accessions of all the proteins that the given peptide maps to"},
 
             {"name": "precursor_charge", "type": "int", "doc": "Precursor charge"},
             {"name": "observed_mz", "type": "float32", "doc": "Experimental peptide mass-to-charge ratio of identified peptide (in Da)"},
             {"name": "rt", "type": ["null", "float32"], "doc": "MS2 scan’s precursor retention time (in seconds)"},
             {"name": "predicted_rt", "type": ["null", "float32"], "doc": "Predicted retention time of the peptide (in seconds)"},
 
             {"name": "reference_file_name", "type": "string", "doc": "Spectrum file name with no path information and not including the file extension"},
-            {"name": "scan_number", "type": "string", "doc": "Scan number of the spectrum"},
+            {"name": "scan", "type": "string", "doc": "Scan number of the spectrum"},
             {"name": "ion_mobility", "type": ["null", "float32"], "doc": "Ion mobility value for the precursor ion"},
             {"name": "num_peaks", "type": ["null", "int"], "doc": "Number of peaks in the spectrum used for the peptide spectrum match"},
             {"name": "mz_array", "type": ["null", {"type": "array", "items": "float32"}], "doc": "Array of m/z values for the spectrum used for the peptide spectrum match"},

diff --git a/quantmsio/operate/tools.py b/quantmsio/operate/tools.py
@@ -63,6 +63,7 @@ def generate_features_of_spectrum(
         for pqwriter in pqwriters.values():
             pqwriter.close()
 
+
 def slice_parquet_file(df, partitions, output_folder, label):
     pqwriters = {}
     cols = df.columns
@@ -74,16 +75,17 @@ def slice_parquet_file(df, partitions, output_folder, label):
         if partion not in cols:
             raise Exception(f"{partion} does not exist")
     for key, df in df.groupby(partitions):
-            parquet_table = pa.Table.from_pandas(df, schema=schema)
-            folder = [output_folder] + [str(col) for col in key]
-            folder = os.path.join(*folder)
-            if not os.path.exists(folder):
-                os.makedirs(folder, exist_ok=True)
-            save_path = os.path.join(*[folder,label,".parquet"])
-            if not os.path.exists(save_path):
-                pqwriter = pq.ParquetWriter(save_path, parquet_table.schema)
-                pqwriters[key] = pqwriter
-            pqwriters[key].write_table(parquet_table)
+        parquet_table = pa.Table.from_pandas(df, schema=schema)
+        folder = [output_folder] + [str(col) for col in key]
+        folder = os.path.join(*folder)
+        if not os.path.exists(folder):
+            os.makedirs(folder, exist_ok=True)
+        save_path = os.path.join(*[folder, label, ".parquet"])
+        if not os.path.exists(save_path):
+            pqwriter = pq.ParquetWriter(save_path, parquet_table.schema)
+            pqwriters[key] = pqwriter
+        pqwriters[key].write_table(parquet_table)
+
 
 # get unanimous name
 def map_protein_for_parquet(parquet_path, fasta, output_path, map_parameter, label):