-
Notifications
You must be signed in to change notification settings - Fork 0
/
randomised motif search.py
133 lines (119 loc) · 4.94 KB
/
randomised motif search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# A Monte Carlo algorithm which begins with a collection of randomly chosen k-mers Motifs (one from each sequence in DNA)
# it then constructs a Position Probability Matrix for the motif collection and uses the PPM to generate new, more likely motifs
# This algorithm can be iterated N number of times to keep improving the motif collection
# ultimately, it identifies the regulatory motif in each DNA sequence
import random
def random_kmers(DNA, k):
# generates random k-mers found within DNA
rand_list = []
for sequence in DNA:
l = len(sequence)
r = random.randint(0, (l-k))
rand_kmer = sequence[r:r+k]
rand_list.append(rand_kmer)
return (rand_list)
def most_common(motif_matrix):
# finds the most common nucleotide at any given position in a list of k-mers
from collections import Counter
return [Counter(col).most_common(1)[0][0] for col in zip(*motif_matrix)]
def consensus(motifs):
# uses most_common to create a k-mer motif of the most frequently occuring bases
consensus = []
consensus.append(most_common(motifs))
consensus_string = ''.join(consensus[0])
return(consensus_string)
def hamming_distance(most_prob_kmer, kmer):
# calculates the number of mismatches between two k-mers
return sum(pattern_base1 != pattern_base2 for pattern_base1, pattern_base2 in zip(most_prob_kmer, kmer))
def calculate_score(motifs):
# calculates the total hamming distance between the consensus string and each k-mers in a list of motifs
# i.e. the lower the score, the closer the motif list is to the consensus string
consensus_str = consensus(motifs)
score = 0
for motif in motifs:
score = score + hamming_distance(consensus_str, motif)
return score
def generate_profile(motifs, k):
# applies Laplace's rule of succession
# creates a position frequency matrix (PFM) based on a list of motif k-mers
# creates a position probability matrix from the PFM
import numpy
profile = []
probA, probC, probG, probT = [], [], [], []
for base in range(k):
countA, countC, countG, countT = 1, 1, 1, 1
for motif in motifs:
if motif[base] == "A":
countA += 1
elif motif[base] == "C":
countC += 1
elif motif[base] == "G":
countG += 1
elif motif[base] == "T":
countT += 1
totalcount = countA + countC + countG + countT
probA.append(countA/totalcount)
probC.append(countC/totalcount)
probG.append(countG/totalcount)
probT.append(countT/totalcount)
profile.append(probA)
profile.append(probC)
profile.append(probG)
profile.append(probT)
t_profile = numpy.transpose(profile)
return t_profile
def probability (profile, kmer):
# uses the position probability matrix of an existing k-mer motif list
# calculates the probability of a possible kmer
product_probability = 1
for base in range(len(kmer)):
if kmer[base] == 'A':
p = profile[base][0]
elif kmer[base] == 'C':
p = profile[base][1]
elif kmer[base] == 'G':
p = profile[base][2]
elif kmer[base] == 'T':
p = profile[base][3]
product_probability = product_probability * p
return(product_probability)
def generate_motif(profile, DNA, k):
# uses the PPM to identify the k-mer within each DNA sequence with the highest product probability
motif_list = []
for sequence in DNA:
prob_dict = {}
for j in range(len(sequence)-k):
kmer = sequence[j:j+k]
prob_dict[kmer] = probability(profile, kmer)
max_prob_kmer = max(prob_dict, key=prob_dict.get)
motif_list.append(max_prob_kmer)
return(motif_list)
def Randomised_Motif_Search(DNA, k):
# generates a profile based on existing motif collection and uses profile to generate a new motif collection
# if the new motif collection has a lower score that the existing one, it replaces it
# process continues until score fails to improve
initial_motifs = random_kmers(DNA, k)
BestMotifs = initial_motifs
while True:
Profile = generate_profile(BestMotifs, k)
Motifs = generate_motif(Profile, DNA, k)
if calculate_score(Motifs) < calculate_score(BestMotifs):
BestMotifs = Motifs
else:
return BestMotifs
DNA=['CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA'
'GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG'
'TAGTACCGAGACCGAAAGAAGTATACAGGCGT'
'TAGATCAAGTTTCAGGTGCACGTCGGTGAACC'
'AATCCACCAGCTCCACGTGCAATGTTGGCCTA']
k = 8
N = 1000
# number of iterations of the randomised algorithm
BestMotifs = Randomised_Motif_Search(DNA, k)
for n in range (1, N):
Motifs = Randomised_Motif_Search(DNA, k)
if calculate_score(Motifs) < calculate_score(BestMotifs):
BestMotifs = Motifs
print(BestMotifs)
#print(calculate_score(BestMotifs))
# Output -> ['TCTCGGGG', 'CCAAGGTG', TACAGGCG', 'TTCAGGTG', 'TCCACGTG']