Skip to content

Commit

Permalink
Added numpy style docstrings to last two modules
Browse files Browse the repository at this point in the history
  • Loading branch information
allydunham committed Nov 25, 2021
1 parent a2afb48 commit 53413ec
Show file tree
Hide file tree
Showing 2 changed files with 266 additions and 48 deletions.
178 changes: 139 additions & 39 deletions proteinnetpy/mutation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
Module containing functions for mutating ProteinNetRecords and feeding that
data into further computations (e.g. Tensorflow)
Module containing functions for mutating ProteinNetRecords and feeding that data into further computations (e.g. Tensorflow).
These functions are fairly specific, so they may often be better used as inspiration for building users' own solutions.
"""
import random
import logging
Expand All @@ -15,27 +15,57 @@

class ProteinNetMutator(LabeledFunction):
"""
Map function mutating records and outputting the mutant sequence
in addition to other features. Produces the
appropriate data for NN training with tensorflow.
Map function generating mutated records.
Apply a mutator function to a ProteinNet record and return the mutated sequence. This is a LabeledFunction that can be used to generate a TensorFlow Dataset. This setup is fairly specific to your downstream model design, so it will often be more useful to use it as a base to create an alternate implementation.
Returns are in the form:
([wt_seq], mut_seq, [phi, psi, chi1]), label, [weights]
mutator: mutator function
per_position: whether mutants deleteriousness is tracked per sequence or per
mutant. Requires a compatible mutator returning a tuple of
mutant_seq, deleterious_inds, neutral_inds
include: variables to return for computation, alongside mutant sequence.
Options: (wt, phi, psi, chi1)
weights: weightings for [wt, deleterious, neutral] variants when processing
per residue variants
encoding: Optional dictionary mapping alphabetically encoded AA indices to a new
scheme (e.g. that used in UniRep)
**kwargs: arguments passed on to mutator
Attributes
----------
wildtype : bool
Outputs wildtype as well as mutant sequence.
phi : bool
Outputs Phi backbone angles.
psi : bool
Outputs Psi backbone angles.
chi : bool
Outputs rotamer angles.
mutator : function
Mutator function taking a ProteinNetRecord and returning the sampled variants and their deleteriousness. The return format depends on `per_position`. If per_position=False, it must return a tuple with the mutated sequence index array and whether it is deleterious (1/0). If per_position=True, it must return a tuple with mutant_seq, deleterious_inds, neutral_inds arrays.
kwargs : dict
Keyword arguments passed to the mutator function.
encoding : dict
Encoding mapping alphabetically encoded integer indices to a new scheme.
weights : list
List of float weights for WT, Deleterious and Neutral variants when mutating per position.
func : function
Function applied when the class is called. This is a mutator applied to the whole sequence or per position derived from the initialisation parameters.
output_shapes, output_types : tuple
Tuple of output shapes and types (see data.LabeledFunction for details)
"""
def __init__(self, mutator, per_position=False, include=('wt',),
weights=(0, 1, 1), encoding=None, **kwargs):
"""
Initialise the mutator.
Parameters
----------
mutator : function
Mutator function taking a ProteinNetRecord and returning the sampled variants and their deleteriousness. The return format depends on `per_position`. If per_position=False, it must return a tuple with the mutated sequence index array and whether it is deleterious (1/0). If per_position=True, it must return a tuple with mutant_seq, deleterious_inds, neutral_inds arrays.
per_position : bool
Return deleteriousness for each position rather than the entire sequence.
include : sequence_like including some of {"wt", "phi", "psi", "chi1"}
Variables to return for computation, alongside mutant sequence.
weights : list
weightings for [wt, deleterious, neutral] variants when processing per residue variants
encoding : dict
Optional dictionary mapping alphabetically encoded AA integer indices to a new scheme (e.g. that used in UniRep).
**kwargs : dict
arguments passed on to mutator
"""
self.wildtype = 'wt' in include
self.phi = 'phi' in include
self.psi = 'psi' in include
Expand Down Expand Up @@ -81,7 +111,12 @@ def __init__(self, mutator, per_position=False, include=('wt',),

def _per_position_func(self, record):
"""
Function producing per position labels
Function producing per position mutants and labels from a ProteinNetRecord
Parameters
----------
record : ProteinNetRecord
Record to mutate.
"""
mut_seq, deleterious, neutral = self.mutator(record, **self.kwargs)

Expand Down Expand Up @@ -115,7 +150,12 @@ def _per_position_func(self, record):

def _whole_seq_func(self, record):
"""
Function producing a single label for the sequence
Function applying mutator to a record and producing mutant data and a single label for the sequence from a ProteinNetRecord.
Parameters
----------
record : ProteinNetRecord
Record to mutate.
"""
mut_seq, label = self.mutator(record, **self.kwargs)

Expand Down Expand Up @@ -146,8 +186,27 @@ def _whole_seq_func(self, record):
def sequence_mutator(record, p_deleterious=0.5, max_mutations=3,
max_deleterious=0.01, min_neutral=0.1):
"""
Generate mutated sequences from ProteinNetRecords with a few deleterious or
neutral variants, along with a label identifying them as deleterious (1) or neutral (0)
Generate mutated sequences from a ProteinNetRecord with a few deleterious or neutral variants.
Generate mutated sequences from a ProteinNetRecord with a few deleterious and/or neutral variants. First randomly choose to generate a deleterious or neutral sequence, then sample some of the corresponding variant types based on the record's MSA frequencies.
Parameters
----------
record : ProteinNetRecord
Record to mutate.
p_deleterious : float
Probability of returning a deleterious set of variants.
max_mutations : int
Maximum number of mutations to make.
max_deleterious : float
Maximum MSA frequency for a variant to be considered deleterious.
min_neutral : float
Minimum MSA frequency for a variant to be considered neutral.
Returns
-------
tuple
Tuple of the format (seq, deleterious). The first entry is the mutated amino acid sequence, encoded with integer indices, and the second is 1 if the sequence is deleterious and 0 if neutral.
"""
deleterious = int(random.random() < p_deleterious)
seq = record.primary_ind.copy()
Expand All @@ -170,9 +229,28 @@ def sequence_mutator(record, p_deleterious=0.5, max_mutations=3,
def per_position_mutator(record, max_deleterious=2, max_neutral=4,
max_deleterious_freq=0.01, min_neutral_freq=0.1):
"""
Generate mutated sequences from ProteinNetRecords and return the variant sequence along
with labels identifying deleterious and neutral positions. Will always generate at least
one variant.
Generate mutated sequences from ProteinNetRecords with labels identifying deleterious and neutral mutations.
Generate mutated sequences from ProteinNetRecords with labels identifying where deleterious and neutral mutations have been made.
Will always generate at least one variant.
Parameters
----------
record : ProteinNetRecord
Record to mutate.
max_deleterious : int
Maximum number of deleterious variants to make.
max_neutral : int
Maximum number of neutral variants to make.
max_deleterious_freq : float
Maximum MSA frequency for a variant to be considered deleterious.
min_neutral_freq : float
Minimum MSA frequency for a variant to be considered neutral.
Returns
-------
tuple
Tuple of the format seq, deleterious, neutral. The first entry is the mutated sequence, the second a list of positions with deleterious variants and the third a list of positions with neutral variants.
"""
seq = record.primary_ind.copy()

Expand Down Expand Up @@ -223,15 +301,27 @@ def per_position_mutator(record, max_deleterious=2, max_neutral=4,

def sample_deleterious(num, pssm, wt_seq, max_freq=0.025, mask=None):
"""
Sample deleterious mutations from a pssm
num: number of mutations to make
pssm: PSSM in a np.array
wt_seq: WT sequence of the protein
max_freq: maximum frequency considered deleterious
mask: positions to mask
returns: np.array(positions), np.array(substitutions)
Sample deleterious mutations from an MSA frequency matrix.
Randomly choose a selection of deleterious variants from an MSA frequency matrix.
Parameters
----------
num : int
Number of mutations to make.
pssm : float ndarray (20, N)
MSA frequency matrix to determine neutral and deleterious variants.
wt_seq : int ndarray (N,)
WT sequence of the protein (as int indices corresponding to the MSA matrix rows).
max_freq : float
Maximum frequency considered deleterious.
mask : int array_like
Array of positions not to mutate.
Returns
-------
tuple
Numpy array of position indices chosen and an array of the alternate amino acid in each position (as MSA row indices).
"""
if num == 0:
raise ValueError('num must be > 0')
Expand Down Expand Up @@ -261,13 +351,23 @@ def sample_neutral(num, pssm, wt_seq, min_freq=0.025, mask=None):
"""
Sample neutral mutations from a pssm
num: number of mutations to make
pssm: PSSM in a np.array
wt_seq: WT sequence of the protein
min_freq: minimum frequency considered neutral
mask: positions to mask
returns: np.array(positions), np.array(substitutions, as PSSM row indices)
Parameters
----------
num : int
Number of mutations to make.
pssm : float ndarray (20, N)
MSA frequency matrix to determine neutral and deleterious variants.
wt_seq : int ndarray (N,)
WT sequence of the protein (as int indices corresponding to the MSA matrix rows).
min_freq : float
Minimum frequency considered neutral.
mask : int array_like
Array of positions not to mutate.
Returns
-------
tuple
Numpy array of position indices chosen and an array of the alternate amino acid in each position (as MSA row indices).
"""
if num == 0:
return None, None
Expand Down
Loading

0 comments on commit 53413ec

Please sign in to comment.