11import difflib
22import re
33from itertools import combinations , repeat
4- from typing import Dict , List , Optional , Tuple , Union
4+ from typing import Dict , List , Optional , Set , Tuple , Union
55
66import numpy as np
77import pandas as pd
@@ -311,10 +311,17 @@ def split_modstring(sequence: str, r_pattern):
311311 # Ugly and fast fix for reading modifications as is from maxquant we should reconsider how to fix it.
312312 # sequence = sequence.replace('M(ox)','M(U:35)')
313313 # sequence = sequence.replace('C','C(U:4)')
314+ val = max (alphabet .values ()) + 1
314315 split_seq = r_pattern .findall (sequence )
315316 if "" .join (split_seq ) == sequence :
316317 if translate :
317- return [alphabet [aa ] for aa in split_seq ]
318+ results = []
319+ for aa in split_seq :
320+ if aa not in alphabet : # does not exist
321+ alphabet [aa ] = val
322+ val += 1
323+ results .append (alphabet [aa ])
324+ return results
318325 else :
319326 return split_seq
320327 elif filter :
@@ -327,13 +334,23 @@ def split_modstring(sequence: str, r_pattern):
327334 f"The element(s) [{ not_parsable_elements } ] " f"in the sequence [{ sequence } ] could not be parsed"
328335 )
329336
330- pattern = sorted (alphabet , key = len , reverse = True )
337+ unimod_pattern = r"[A-Z]\[UNIMOD:\d+\]"
338+ alphabet_pattern = [re .escape (i ) for i in sorted (alphabet , key = len , reverse = True )]
331339
332- pattern = [re . escape ( i ) for i in pattern ]
340+ pattern = [unimod_pattern ] + alphabet_pattern
333341 regex_pattern = re .compile ("|" .join (pattern ))
334342 return map (split_modstring , sequences , repeat (regex_pattern ))
335343
336344
def get_all_tokens(sequences: List[str]) -> Set[str]:
    """Collect every distinct residue token that occurs in the given sequences.

    A token is a single letter out of ``ACDEFGHIKLMNPQRSTVWY``, optionally
    followed immediately by one ``[UNIMOD:<digits>]`` tag. Text that is not part
    of such a token is skipped without raising an error.

    NOTE: the scan is purely regex-based. A ``[UNIMOD:<digits>]`` tag that does
    not directly follow one of the listed letters is not recognised as a unit,
    so the letters inside the tag itself (N, I, M, D) are then reported as
    plain residue tokens.

    :param sequences: sequence strings to scan
    :return: set of all tokens found in at least one of the sequences
    """
    # Non-capturing group: only the complete match is of interest, so findall
    # yields the full token strings directly.
    token_regex = re.compile(r"[ACDEFGHIKLMNPQRSTVWY](?:\[UNIMOD:\d+\])?")
    tokens: Set[str] = set()
    for seq in sequences:
        tokens.update(token_regex.findall(seq))
    return tokens
352+
353+
337354def add_permutations (modified_sequence : str , unimod_id : int , residues : List [str ]):
338355 """
339356 Generate different peptide sequences with moving the modification to all possible residues.
0 commit comments