diff --git a/proteinnetpy/mutation.py b/proteinnetpy/mutation.py index fbc56b3..2a3ccf0 100644 --- a/proteinnetpy/mutation.py +++ b/proteinnetpy/mutation.py @@ -1,6 +1,6 @@ """ -Module containing functions for mutating ProteinNetRecords and feeding that -data into further computations (e.g. Tensorflow) +Module containing functions for mutating ProteinNetRecords and feeding that data into further computations (e.g. Tensorflow). +These functions are fairly specific so may often be better used as inspiration to build users own solutions. """ import random import logging @@ -15,27 +15,57 @@ class ProteinNetMutator(LabeledFunction): """ - Map function mutating records and outputting the mutant sequence - in additions to other features. Produces the - appropriate data for NN training with tensorflow. + Map function generating mutated records. + + + Apply a mutator function to a ProteinNet record and return the mutated sequence. This is a LabeledFunction that can be used to generate a TensorFlow Dataset. This setup is fairly specific to your downstream model design, so it will often be more useful to use it as a base to create an alternate implementation. Returns are in the form: ([wt_seq], mut_seq, [phi, psi, chi1]), label, [weights] - mutator: mutator function - per_position: whether mutants deleteriousness is tracked per sequence or per - mutant. Requires a compatible mutator returning a tuple of - mutant_seq, deleterious_inds, neutral_inds - include: variables to return for computation, alongside mutant sequence. - Options: (wt, phi, psi, chi1) - weights: weightings for [wt, deleterious, neutral] variants when processing - per residue variants - encoding: Optional dictionary mapping alphabetically encoded AA indeces to a new - scheme (e.g. that used in UniRep) - **kwargs: arguments passed on to mutator + Attributes + ---------- + wildtype : bool + Outputs wildtype as well as mutant sequence. + phi : bool + Outputs Phi backbone angles. + psi : bool + Outputs Psi backbone angles. + chi : bool + Outputs rotamer angles. + mutator : function + Mutator function taking a ProteinNetRecord and returning the sampled variants and their deleteriousness. The return format depends on `per_position`. If per_position=False must return a tuple with the mutated sequence index array and whether it is deleterious (1/0). If per_position=True must return a tuple with mutant_seq, deleterious_inds, neutral_inds arrays. + kwargs : dict + Keyword arguments passed to the mutator function. + encoding : dict + Encoding mapping alphabetically encoded integer indeces to a new scheme. + weights : list + List of float weights for WT, Deleterious and Neutral variants when mutating per position. + func : function + Function applied when the class is called. This is a mutator applied to the whole sequence or per position derived from the initialisation parameters. + output_shapes, output_types : tuple + Tuple of output shapes and types (see data.LabeledFunction for details) """ def __init__(self, mutator, per_position=False, include=('wt',), weights=(0, 1, 1), encoding=None, **kwargs): + """ + Initialise the mutator. + + Parameters + ---------- + mutator : function + Mutator function taking a ProteinNetRecord and returning the sampled variants and their deleteriousness. The return format depends on `per_position`. If per_position=False must return a tuple with the mutated sequence index array and whether it is deleterious (1/0). If per_position=True must return a tuple with mutant_seq, deleterious_inds, neutral_inds arrays. + per_position : bool + Return deleteriousness for each position rather than the entire sequence. + include: : sequence_like including some of {"wt", "phi", "psi", "chi1"} + Variables to return for computation, alongside mutant sequence. + weights : list + weightings for [wt, deleterious, neutral] variants when processing per residue variants + encoding : dict + Optional dictionary mapping alphabetically encoded AA integer indeces to a new scheme (e.g. that used in UniRep). + **kwargs : dict + arguments passed on to mutator + """ self.wildtype = 'wt' in include self.phi = 'phi' in include self.psi = 'psi' in include @@ -81,7 +111,12 @@ def __init__(self, mutator, per_position=False, include=('wt',), def _per_position_func(self, record): """ - Function producing per position labels + Function producing per position mutants and labels from a ProteinNetRecord + + Parameters + ---------- + record : ProteinNetRecord + Record to mutate. """ mut_seq, deleterious, neutral = self.mutator(record, **self.kwargs) @@ -115,7 +150,12 @@ def _per_position_func(self, record): def _whole_seq_func(self, record): """ - Function producing a single label for the sequence + Function applying mutator to a record and producing mutant data and a single label for the sequence from a ProteinNetRecord. + + Parameters + ---------- + record : ProteinNetRecord + Record to mutate. """ mut_seq, label = self.mutator(record, **self.kwargs) @@ -146,8 +186,27 @@ def _whole_seq_func(self, record): def sequence_mutator(record, p_deleterious=0.5, max_mutations=3, max_deleterious=0.01, min_neutral=0.1): """ - Generate mutated sequences from ProteinNetRecords with a few deleterious or - neutral variants, along with a label identifying them as deleterious (1) or neutral (0) + Generate mutated sequences from a ProteinNetRecord with a few deleterious or neutral variants. + + Generate mutated sequences from a ProteinNetRecord with a few deleterious and/or neutral variants. First randomly choose to generate a deleterious or neutral sequence then sample some of the corresponding variant types based on the records MSA frequencies. + + Parameters + ---------- + record : ProteinNetRecord + Record to mutate. + p_deleterious : float + Probability of returning a deleterious set of variants. + max_mutations : int + Maximum number of mutations to make. + max_deleterious : float + Maximum MSA frequency for a variant to be considered deleterious. + min_neutral : float + Minimum MSA frequency for a variant to be considered neutral. + + Returns + ------- + tuple + Tuple of the format (seq, deleterious). The first entry is the mutated amino acid sequence, encoded with integer indeces and the second is 1 if the sequence is deleterious and 0 if neutral. """ deleterious = int(random.random() < p_deleterious) seq = record.primary_ind.copy() @@ -170,9 +229,28 @@ def sequence_mutator(record, p_deleterious=0.5, max_mutations=3, def per_position_mutator(record, max_deleterious=2, max_neutral=4, max_deleterious_freq=0.01, min_neutral_freq=0.1): """ - Geneate mutated sequences from ProteinNetRecords and return the variant sequence along - with labels identifying deleterious and neutral positions. Will always generate at least - one variant. + Generate mutated sequences from ProteinNetRecords with labels identifying deleterious and neutral mutations. + + Generate mutated sequences from ProteinNetRecords with labels identifying where deleterious and neutral mutations have been made. + Will always generate at least one variant. + + Parameters + ---------- + record : ProteinNetRecord + Record to mutate. + max_deleterious : int + Maximum number of deleterious variants to make. + max_neutral : int + Maximum number of neutral variants to make. + max_deleterious_freq : float + Maximum MSA frequency for a variant to be considered deleterious. + min_neutral_freq : float + Minimum MSA frequency for a variant to be considered neutral. + + Returns + ------- + tuple + Tuple of the format seq, deleterious, neutral. The first entry is the mutated sequence, the second a list of positions with deleterious variants and the third a list of positions with neutral variants. """ seq = record.primary_ind.copy() @@ -223,15 +301,27 @@ def per_position_mutator(record, max_deleterious=2, max_neutral=4, def sample_deleterious(num, pssm, wt_seq, max_freq=0.025, mask=None): """ - Sample deleterious mutations froma pssm - - num: number of mutations to make - pssm: PSSM in a np.array - wt_seq: WT sequence of the protein - max_freq: maximum frequency considered deleterious - mask: positions to mask - - returns: np.array(positions), np.array(substitutions) + Sample deleterious mutations from a MSA frequency matrix. + + Randomly choose a selection of deleterious variants from a MSA frequency matrix. + + Parameters + ---------- + num : int + Number of mutations to make. + pssm : float ndarray (20, N) + MSA frequency matrix to determine neutral and deleterious variants. + wt_seq : int ndarray (N,) + WT sequence of the protein (as int indeces corresponding to the MSA matrix rows). + max_freq : float + Maximum frequency considered deleterious. + mask : int array_like + Array of positions not to mutate. + + Returns + ------- + tuple + Numpy array of position indeces chosen and an array of the alternate amino acid in each position (as MSA row indeces). """ if num == 0: raise ValueError('num must be > 0') @@ -261,13 +351,23 @@ def sample_neutral(num, pssm, wt_seq, min_freq=0.025, mask=None): """ Sample deleterious mutations froma pssm - num: number of mutations to make - pssm: PSSM in a np.array - wt_seq: WT sequence of the protein - min_freq: minimum frequency considered neutral - mask: positions to mask - - returns: np.array(positions), np.array(substitutions, as PSSM row indeces) + Parameters + ---------- + num : int + Number of mutations to make. + pssm : float ndarray (20, N) + MSA frequency matrix to determine neutral and deleterious variants. + wt_seq : int ndarray (N,) + WT sequence of the protein (as int indeces corresponding to the MSA matrix rows). + min_freq : float + Minimum frequency considered neutral. + mask : int array_like + Array of positions not to mutate. + + Returns + ------- + tuple + Numpy array of position indeces chosen and an array of the alternate amino acid in each position (as MSA row indeces). """ if num == 0: return None, None diff --git a/proteinnetpy/record.py b/proteinnetpy/record.py index 4b36090..2fccf30 100644 --- a/proteinnetpy/record.py +++ b/proteinnetpy/record.py @@ -23,12 +23,94 @@ class ProteinNetRecord: # pylint: disable=invalid-name # pylint: disable=redefined-builtin """ + Record from a ProteinNet dataset. + A record from a ProteinNet database, with support for torsion angles and additional - per-position profiles (e.g. precalculated from something like UniRep) + per-position profiles (e.g. precalculated from a language model). + The only required attributes are id and primary, allow various others are derived from these on initialisation. + Many attributes are identified from the ID, based on the format of the main ProteinNet files IDs. + This will likely fail if other data is put into the same format, but should not reduce functionality much as only a few specific functions utilise this data, which is mainly for information purposes. + + Attributes + ---------- + id : str + Record ID + split : {'training', 'testing', 'validation'} + The data split the record comes from. Identified from the ID. + record_class : str + Split the record comes from, for validation records. Identified from the ID. + source : str + Source of the record. Identified from the ID. + casp_id : str + CASP ID of the record, for records in the test set sourced from CASP entries. Identified from the ID. + astral_id : str + Astral ID of the record, for those sourced from Astral. Identified from the ID. + pdb_id : str + PDB ID of the record, for those deriving from a PDB entry. Identified from the ID. + pdb_chain_number : str + Numeric PDB Chain of the record, for those deriving from a PDB entry. Identified from the ID. + pdb_chain : str + Alphabetical PDB Chain of the record, for those deriving from a PDB entry. Identified from the ID. + evolutionary : float ndarray (20, N) + Variant frequencies accross a multiple sequence alignment for this protein. + info_content : float ndarray (N,) + Information content of the MSA at each position. + primary : U1 ndarray (N,) + Protein sequence in single letter code form. + primary_ind : int ndarray (N,) + Protein sequence in integer form, based on the index of amino acids in single letter alphabetical order (see record.AMINO_ACIDS and record.AA_HASH). + secondary : ndarray + Protein secondary structure (currently not included in the dataset) + tertiary : float ndarray (3, 3N) + Residue atom coordinates. Rows are x,y,z cartesian coordinates. Each residue includes 3 columns for the N, CA and C' backbone atoms. This means the atom at position i starts in column 3i and a matrix of N/CA/C' atom coordinates can be extracted with indeces in the arithmetic series 3x + c with c = 0 for N, c = 1 for CA or c = 2 for C'. + mask : int ndarray (N,) + Mask indicating which residues have structural information. Residues marked with 1 have information present. The mask needs to be tripled to apply to the tertiary structure array. None if tertiary is None. + rama : float ndarray (3, N) + Backbone angles for each residue, calculated from tertiary. The rows (first index) represent omega, phi and psi angles. Either in radians or normalised between -1 and 1. + rama_mask : int ndarray (N,) + Mask indicating which residues have backbone angles, with 1 indicating information is present. None if rama is None. + chi : float ndarray (N,) + Chi1 side chain rotamer conformation. Either in radians or normalised between -1 and 1. This requires more information than `tertiary` provides so can only be calculated from the full structural model (for example with the add_angles_to_proteinnet script). + chi_mask : int ndarray (N,) + Mask indicating which residues have rotamer angles. None if chi is None. + profiles : ndarray (X, N) + Additional profiles for each amino acid position. Can contain any additional information the user requires. + _normalised_angles : bool + Torsion angles have been normalised to vary between -1 and 1, rather than -pi to pi. """ def __init__(self, id, primary, evolutionary=None, info_content=None, secondary=None, tertiary=None, mask=None, rama=None, rama_mask=None, chi=None, chi_mask=None, profiles=None): + """ + Initialise the record. + + Parameters + ---------- + id : str + Record ID + primary : U1 ndarray (N,) + Protein sequence in single letter code form. + evolutionary : float ndarray (20, N) + Variant frequencies accross a multiple sequence alignment for this protein. + info_content : float ndarray (N,) + Information content of the MSA at each position. + secondary : ndarray + Protein secondary structure (currently not included in the dataset) + tertiary : float ndarray (3, 3N) + Residue atom coordinates. Rows are x,y,z cartesian coordinates. Each residue includes 3 columns for the N, CA and C' backbone atoms. This means the atom at position i starts in column 3i and a matrix of N/CA/C' atom coordinates can be extracted with indeces in the arithmetic series 3x + c with c = 0 for N, c = 1 for CA or c = 2 for C'. + mask : int ndarray (N,) + Mask indicating which residues have structural information. Residues marked with 1 have information present. The mask needs to be tripled to apply to the tertiary structure array. None if tertiary is None. + rama : float ndarray (3, N) + Backbone angles for each residue, calculated from tertiary. The rows (first index) represent omega, phi and psi angles. Either in radians or normalised between -1 and 1. + rama_mask : int ndarray (N,) + Mask indicating which residues have backbone angles, with 1 indicating information is present. None if rama is None. + chi : float ndarray (N,) + Chi1 side chain rotamer conformation. Either in radians or normalised between -1 and 1. This requires more information than `tertiary` provides so can only be calculated from the full structural model (for example with the add_angles_to_proteinnet script). + chi_mask : int ndarray (N,) + Mask indicating which residues have rotamer angles. None if chi is None. + profiles : ndarray (X, N) + Additional profiles for each amino acid position. Can contain any additional information the user requires. + """ self.id = id # Manage the different properties for Testing/Validation/Training sets @@ -76,6 +158,11 @@ def __init__(self, id, primary, evolutionary=None, info_content=None, secondary= def _set_id_properties(self, id): """ Extract details from the identifier + + Paramters + --------- + id : str + Record ID. """ if CASP_REGEX.match(id): self.split = 'testing' @@ -165,15 +252,32 @@ def __str__(self): def enumerate_sequence(self, aa_hash=AA_HASH): """ - Generate a numeric representation of the sequence with each - amino acid represented by an interger index, as mapped in aa_hash. + Generate a numeric representation of the sequence with each amino acid represented by an interger index. + + Generate a numeric representation of the sequence with each amino acid represented by an interger index. The default indeces are in single letter code alphabetical order. This function is used in __init__ to generate primary_ind. + + Parameters + ---------- + aa_hash : dict, optional + Dictionary mapping single letter codes to ints. Can be replaced to interface with a model that enumerates amino acids differently. For example Unirep orders amino acids alphabetically by full name. """ return np.array([aa_hash[aa] for aa in self.primary], dtype=np.int) def get_one_hot_sequence(self, aa_hash=AA_HASH): """ Generate a 1-hot encoded matrix of the proteins sequence. - aa_hash maps amino acids to their row number. + + Generate a 1-hot encoded matrix of the proteins sequence. The default indeces are in single letter code alphabetical order, which can be altered to feed into models expecting different orders. + + Parameters + ---------- + aa_hash : dict, optional + Dictionary mapping single letter codes to ints. Can be replaced to interface with a model that enumerates amino acids differently. For example Unirep orders amino acids alphabetically by full name. + + Returns + ------- + int ndarray (20, N) + One-hot encoded primary sequence for the record. Matrix with 20 rows representing each amino acid. Each column has a single 1 in the row of its amino acid. """ num_aas = max(v for k, v in aa_hash.items()) + 1 indeces = np.array([aa_hash[aa] for aa in self.primary]) @@ -184,8 +288,9 @@ def get_one_hot_sequence(self, aa_hash=AA_HASH): def calculate_backbone_angles(self): """ - Calculate Omega, Phi, and Psi from the ProteinNet 3D coordinates and set them - as instance variables + Calculate Omega, Phi, and Psi backbone angles and set the rama attribute. + + Calculate Omega, Phi, and Psi backbone angles from the tertiary structure included in ProteinNet, and set the results as the rama attribute. The rama_mask attribute is also caculated and set. """ coords = self.tertiary.T psi = np.array([calc_dihedral(coords[i:(i+4)]) for i in range(0, 3*len(self)-3, 3)] + [0]) @@ -206,8 +311,14 @@ def calculate_backbone_angles(self): def normalise_angles(self, factor=np.pi): """ - Normalise backbone and chi angles to betweeen -1 and 1. Will only - apply normalisation once per record. + Normalise backbone and chi angles to betweeen -1 and 1. + + Normalise backbone and chi angles to betweeen -1 and 1. Also sets a flag indicating angles have been normalised, and does nothing if this is set to prevent normalising twice. + + Parameters + ---------- + factor : numeric + Factor to normalise angles by. It will not generally be useful to change this since the package naturally works in radians between -pi and pi, but may be useful if you need to work with angles in other formats. """ if not self._normalised_angles: self._normalised_angles = True @@ -216,7 +327,14 @@ def normalise_angles(self, factor=np.pi): def distance_matrix(self): """ - Calculate the distance matrix between residues C-alpha atoms + Calculate the distance matrix between residues C-alpha atoms. + + Calculate the distance matrix between residues C-alpha atoms. ProteinNet coordinates and therefore the distance matrix are in nanometers. + + Returns + ------- + float ndarray (N, N) + Distance matrix where X[i,j] gives the distance in nanometers between residue i and j. """ calpha = np.copy(self.tertiary[:, 1::3])