-
Notifications
You must be signed in to change notification settings - Fork 0
/
exe4_pssm.py
151 lines (120 loc) · 6.07 KB
/
exe4_pssm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import numpy as np
"""
ATTENTION: Use the following dictionaries to get the correct index for each
amino acid when accessing any type of matrix or array provided as
parameters. Further, use those indices when generating or returning
any matrices or arrays. Failure to do so will most likely result in
not passing the tests.
EXAMPLE: To access the substitution frequency from alanine 'A' to proline 'P'
in the bg_matrix use bg_matrix[AA_TO_INT['A'], AA_TO_INT['P']].
"""
ALPHABET = 'ACDEFGHIKLMNPQRSTVWY-'
AA_TO_INT = {aa: index for index, aa in enumerate(ALPHABET)}
INT_TO_AA = {index: aa for index, aa in enumerate(ALPHABET)}
GAP_INDEX = AA_TO_INT['-']
def all_equal(lst):
return lst.count(lst[0]) == len(lst)
class MSA:
def __init__(self, sequences):
if not (
len(sequences) > 0
and all_equal([len(s) for s in sequences])
and all(all(si in ALPHABET for si in s) for s in sequences)
):
raise TypeError
self.sequences = sequences
self.sequences_matrix = np.array(sequences, dtype=bytes).view('S1').reshape((len(sequences), -1))
self.r = np.apply_along_axis(lambda col: np.unique(col).size, axis=0, arr=self.sequences_matrix)
self.seqs_elems_entries = self.get_seqs_elems_entries()
def get_seqs_elems_entries(self, weights=None):
a = np.array(list(ALPHABET), dtype='S1')
seqs_elems_entries = np.empty((self.sequences_matrix.shape[1], a.size))
for i in range(seqs_elems_entries.shape[0]):
b = self.sequences_matrix[:, i]
idx_sorted = a.argsort()
idx = idx_sorted[np.searchsorted(a, b, sorter=idx_sorted)]
mask = a[idx] == b
seqs_elems_entries[i] = np.bincount(idx[mask], minlength=a.size, weights=weights)
return seqs_elems_entries
def get_pssm(self, *, bg_matrix=None, beta=10, use_sequence_weights=False,
redistribute_gaps=False, add_pseudocounts=False):
"""
Return a PSSM for the underlying MSA. Use the appropriate refinements
according to the parameters. If no bg_matrix is specified, use uniform
background and pair frequencies.
Every row in the resulting PSSM corresponds to a non-gap position in
the primary sequence of the MSA (i.e. the first one).
Every column in the PSSM corresponds to one of the 20 amino acids.
Values that would be -inf must be replaced by -20 in the final PSSM.
Before casting to dtype=numpy.int64, round all values to the nearest
integer (do not just FLOOR all values).
:param bg_matrix: Amino acid pair frequencies as numpy array (20, 20).
Access the matrix using the indices from AA_TO_INT.
:param beta: Beta value (float) used to weight the pseudocounts
against the observed amino acids in the MSA.
:param use_sequence_weights: Calculate and apply sequence weights.
:param redistribute_gaps: Redistribute the gaps according to the
background frequencies.
:param add_pseudocounts: Calculate and add pseudocounts according
to the background frequencies.
:return: PSSM as numpy array of shape (L x 20, dtype=numpy.int64).
L = ungapped length of the primary sequence.
"""
if bg_matrix is None:
bg_matrix = np.full((20, 20), 1 / (20 ** 2), dtype=np.float64)
else:
bg_matrix = np.array(bg_matrix, dtype=np.float64)
bg_freq = np.sum(bg_matrix, axis=0)
non_gap_mask = self.sequences_matrix[0] != b'-'
if use_sequence_weights:
f_matrix = self.get_seqs_elems_entries(self.get_sequence_weights())
else:
f_matrix = self.seqs_elems_entries
pssm = f_matrix[non_gap_mask, :-1]
if redistribute_gaps:
pssm += f_matrix[non_gap_mask, -1:] * bg_freq
if add_pseudocounts:
alpha = self.get_number_of_observations() - 1
ps = (pssm / bg_freq) @ bg_matrix
pssm = (alpha * pssm + beta * ps) / (alpha + beta)
pssm /= pssm.sum(axis=1, keepdims=True)
pssm = 2 * np.log2(pssm / bg_freq)
pssm[np.isinf(pssm)] = -20
res = np.rint(pssm).astype(np.int64)
return res
def get_size(self):
"""
Return the number of sequences in the MSA and the MSA length, i.e.
the number of columns in the MSA. This includes gaps.
:return: Tuple of two integers. First element is the number of
sequences in the MSA, second element is the MSA length.
"""
return self.sequences_matrix.shape
def get_primary_sequence(self):
"""
Return the primary sequence of the MSA. In this exercise, the primary
sequence is always the first sequence of the MSA. The returned
sequence must NOT include gap characters.
:return: String containing the ungapped primary sequence.
"""
return self.sequences[0].replace('-', '')
def get_sequence_weights(self):
"""
Return the calculated sequence weights for all sequences in the MSA.
The order of weights in the array must be equal to the order of the
sequences in the MSA.
:return: Numpy array (dtype=numpy.float64) containing the weights for
all sequences in the MSA.
"""
a = np.array(list(ALPHABET), dtype='S1')
matrix_alphabet_indexes_sorted = np.searchsorted(np.sort(a), self.sequences_matrix)
matrix_alphabet_indexes = np.argsort(a)[matrix_alphabet_indexes_sorted]
s = np.take_along_axis(self.seqs_elems_entries, matrix_alphabet_indexes.T, axis=1)
w = 1 / (self.r[:, np.newaxis] * s)
return w[self.r != 1].sum(axis=0)
def get_number_of_observations(self):
"""
Return the estimated number of independent observations in the MSA.
:return: Estimate of independent observation (dtype=numpy.float64).
"""
return np.mean(self.r)