Skip to content

Commit cddb5b3

Browse files
authored
Merge pull request #133 from wilhelm-lab/fix/custom_mods_tokens
Fix/custom mods tokens
2 parents 0efcfc0 + 3babf78 commit cddb5b3

File tree

2 files changed

+28
-4
lines changed

2 files changed

+28
-4
lines changed

spectrum_fundamentals/mod_string.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import difflib
22
import re
33
from itertools import combinations, repeat
4-
from typing import Dict, List, Optional, Tuple, Union
4+
from typing import Dict, List, Optional, Set, Tuple, Union
55

66
import numpy as np
77
import pandas as pd
@@ -311,10 +311,17 @@ def split_modstring(sequence: str, r_pattern):
311311
# Ugly and fast fix for reading modifications as-is from MaxQuant; we should reconsider how to fix it.
312312
# sequence = sequence.replace('M(ox)','M(U:35)')
313313
# sequence = sequence.replace('C','C(U:4)')
314+
val = max(alphabet.values()) + 1
314315
split_seq = r_pattern.findall(sequence)
315316
if "".join(split_seq) == sequence:
316317
if translate:
317-
return [alphabet[aa] for aa in split_seq]
318+
results = []
319+
for aa in split_seq:
320+
if aa not in alphabet: # does not exist
321+
alphabet[aa] = val
322+
val += 1
323+
results.append(alphabet[aa])
324+
return results
318325
else:
319326
return split_seq
320327
elif filter:
@@ -327,13 +334,23 @@ def split_modstring(sequence: str, r_pattern):
327334
f"The element(s) [{not_parsable_elements}] " f"in the sequence [{sequence}] could not be parsed"
328335
)
329336

330-
pattern = sorted(alphabet, key=len, reverse=True)
337+
unimod_pattern = r"[A-Z]\[UNIMOD:\d+\]"
338+
alphabet_pattern = [re.escape(i) for i in sorted(alphabet, key=len, reverse=True)]
331339

332-
pattern = [re.escape(i) for i in pattern]
340+
pattern = [unimod_pattern] + alphabet_pattern
333341
regex_pattern = re.compile("|".join(pattern))
334342
return map(split_modstring, sequences, repeat(regex_pattern))
335343

336344

345+
def get_all_tokens(sequences: List[str]) -> Set[str]:
    """
    Collect the distinct tokens found in sequences given in UNIMOD ProForma notation.

    A token is one of the 20 standard amino-acid letters, optionally followed directly by a
    single ``[UNIMOD:<digits>]`` tag. Text that does not match this shape is skipped silently.

    :param sequences: sequences in UNIMOD ProForma notation
    :return: set of every distinct token found across all given sequences
    """
    # Compile once so the same pattern object is reused for every sequence.
    token_regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWY](\[UNIMOD:\d+\])?")
    return {hit.group() for sequence in sequences for hit in token_regex.finditer(sequence)}
352+
353+
337354
def add_permutations(modified_sequence: str, unimod_id: int, residues: List[str]):
338355
"""
339356
Generate different peptide sequences with moving the modification to all possible residues.

tests/unit_tests/test_mod_string.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,13 @@ def test_parse_modstrings_invalid_with_filtering(self):
309309
invalid_seq = "testing"
310310
self.assertEqual(next(mod.parse_modstrings([invalid_seq], alphabet=c.ALPHABET, filter=True)), [0])
311311

312+
def test_get_all_tokens(self):
    """Check that get_all_tokens returns the distinct tokens of several UNIMOD sequences."""
    sequences = ["ACKC[UNIMOD:4]AD", "PEPTIDE", "PEM[UNIMOD:35]"]
    expected_tokens = {"A", "C", "C[UNIMOD:4]", "D", "E", "I", "K", "M[UNIMOD:35]", "P", "T"}

    # Repeated residues across sequences must collapse into a single entry.
    self.assertEqual(mod.get_all_tokens(sequences), expected_tokens)
318+
312319

313320
class TestCustomToInternal(unittest.TestCase):
314321
"""Class to test custom to internal."""

0 commit comments

Comments
 (0)