-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHexamer.py
117 lines (76 loc) · 2.39 KB
/
Hexamer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# Enumerate every hexamer that can be built from the four DNA bases.
import itertools

bases = list("ATGC")
# A hexamer is a k-mer with k = 6.
k = 6
six_mer = list(map(''.join, itertools.product(bases, repeat=k)))
print("Total number of hexamer:", len(six_mer))

# Slices are half-open: the first 400 hexamers (indices 0..399) are the sample set.
sample_mer = six_mer[:400]
print("Number of selected hexamer:", len(sample_mer))

# Everything from index 400 to the end is the set the samples are compared against.
test_mer = six_mer[400:]
print("The number of rest of the hexamer:", len(test_mer))
#finding out the most different hexamer:
def jaccard_index(a, b):
    """Return the Jaccard similarity of the distinct elements of *a* and *b*.

    Both arguments are converted to sets first, so for strings the
    comparison is over the distinct characters they contain; order and
    repetition are ignored.

    Args:
        a: Any iterable of hashable items (e.g. a DNA string).
        b: Any iterable of hashable items.

    Returns:
        len(intersection) / len(union) as a float between 0 and 1.
        When both inputs are empty the union is empty as well; 1.0 is
        returned (two empty sets are identical) instead of dividing by zero.
    """
    a = set(a)
    b = set(b)
    union = a | b
    if not union:
        # Both inputs empty: avoid ZeroDivisionError.
        return 1.0
    return len(a & b) / len(union)
# Create/truncate "test_out" as the script always did, but release the handle
# straight away: nothing is ever written to it, and it used to stay open.
with open("test_out", 'w') as f:
    pass

# Look for the test hexamer that is least similar to any sample hexamer.
# A Jaccard index of 0 (no base in common) cannot be beaten, so both loops
# stop as soon as one is seen. Previously `break` only left the inner loop and
# the reported `j` was simply the last candidate examined.
lowest_sim = None
j = None  # stays None only when sample_mer or test_mer is empty
for i in sample_mer:
    for candidate in test_mer:
        sim_index = jaccard_index(i, candidate)
        if lowest_sim is None or sim_index < lowest_sim:
            lowest_sim = sim_index
            j = candidate
        if sim_index == 0:
            break
    if lowest_sim == 0:
        break
print("Most different hexamer found:", j)
#display the duplex in a convenient format:
def reverse_compliment(seq):
    """Return the reverse complement of the DNA sequence *seq*.

    The sequence is read back to front and every base is swapped for its
    pairing partner (A<->T, G<->C). Only the upper-case symbols A, T, G
    and C are known; any other symbol raises KeyError.
    """
    pairing = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    return "".join(map(pairing.__getitem__, reversed(seq)))
def display_complements(seq):
    """Print *seq* above its complementary strand, linked by '|' marks.

    Three lines are written to stdout:
      1. the template sequence as given,
      2. one '|' per base of the template,
      3. the complementary strand in the template's orientation (the
         reverse complement, reversed once more so the bases line up).
    """
    rev_comp = reverse_compliment(seq)

    # Template strand.
    print(seq)
    # One "base pair" bar for each base of the template.
    print('|' * len(seq))
    # Complementary strand, aligned base-for-base under the template.
    print(''.join(reversed(rev_comp)))
# Show the hexamer held in `j` as a duplex.
# Example: display_complements("CCCCCC") prints CCCCCC / |||||| / GGGGGG.
print("The most different duplex hexamer:")
display_complements(j)
# Time the generation of all possible k-mers for k = 7, 8, 9, 10, 11, 12, 13.
#
# The IPython `%timeit` magic used here before is a syntax error in a plain
# .py file, so the standard-library `timeit` module is used instead, and the
# seven copy-pasted cells are folded into one loop.
import itertools
import timeit

bases = ['A', 'T', 'G', 'C']


def generate_kmers(k, alphabet=('A', 'T', 'G', 'C')):
    """Return a list of every length-*k* string over *alphabet*.

    The strings come out in `itertools.product` order, so the result has
    len(alphabet) ** k entries.
    """
    return [''.join(p) for p in itertools.product(alphabet, repeat=k)]


def time_kmer_generation(k_values=range(7, 14), repeat=3):
    """Time `generate_kmers` for each k and print the fastest run.

    Args:
        k_values: Iterable of k-mer lengths to benchmark.
        repeat: Number of timed runs per k; the minimum is reported.

    Returns:
        A dict mapping each k to its best wall-clock time in seconds.
    """
    timings = {}
    for k in k_values:
        # number=1: a single run already builds the full list of 4 ** k strings.
        runs = timeit.repeat(lambda: generate_kmers(k), number=1, repeat=repeat)
        timings[k] = min(runs)
        print("k = {}: best of {} runs: {:.6f} s".format(k, repeat, timings[k]))
    return timings


# Only benchmark when run as a script: k = 13 alone materializes
# 4 ** 13 = 67,108,864 strings, which is far too heavy to do on import.
if __name__ == "__main__":
    time_kmer_generation()