forked from mtarbit/Rosalind-Problems
-
Notifications
You must be signed in to change notification settings - Fork 0
/
e016-mrna.py
executable file
·82 lines (68 loc) · 2.81 KB
/
e016-mrna.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python
# coding=utf-8
# Inferring mRNA from Protein
# ===========================
#
# For positive integers a and n, a modulo n (written a mod n in shorthand) is
# the remainder when a is divided by n. For example, 29 mod 11 = 7 because
# 29 = 11 × 2 + 7.
#
# Modular arithmetic is the study of addition, subtraction, multiplication, and
# division with respect to the modulo operation. We say that a and b are
# congruent modulo n if a mod n = b mod n; in this case, we use the notation
# a ≡ b mod n.
#
# Two useful facts in modular arithmetic are that if a ≡ b mod n and
# c ≡ d mod n, then a + c ≡ b + d mod n and a × c ≡ b × d mod n. To check your
# understanding of these rules, you may wish to verify these relationships for
# a = 29, b = 73, c = 10, d = 32, and n = 11.
#
# As you will see in this exercise, some Rosalind problems will ask for a (very
# large) integer solution modulo a smaller number to avoid the computational
# pitfalls that arise with storing such large numbers.
#
# Given: A protein string of length at most 1000 aa.
#
# Return: The total number of different RNA strings from which the protein could
# have been translated, modulo 1,000,000. (Don't neglect the importance of the
# stop codon in protein translation.)
#
# Sample Dataset
# --------------
# MA
#
# Sample Output
# -------------
# 12
# Lookup table from each three-letter RNA codon (over the letters U, C, A, G)
# to the single-letter code of the amino acid it translates to. The three
# terminating codons (UAA, UAG, UGA) map to the string 'Stop' instead of a
# letter. codon_frequencies() inverts this table into per-product codon counts.
RNA_CODON_TABLE = {
'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V',
'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V',
'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V',
'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V',
'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A',
'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A',
'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A',
'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A',
'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D',
'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D',
'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E',
'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E',
'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G',
'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G',
'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G',
'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G'
}
def codon_frequencies(codon_table=None):
    """Count how many codons encode each translation product.

    Args:
        codon_table: Optional mapping of codon -> product (an amino-acid
            letter or 'Stop'). When omitted, the module-level
            RNA_CODON_TABLE is used, which is the original behaviour.

    Returns:
        A plain dict mapping each distinct product found in the table to the
        number of codons that translate to it.
    """
    # Imported locally so this function does not rely on the file's
    # top-level import block.
    from collections import Counter

    if codon_table is None:
        codon_table = RNA_CODON_TABLE

    # dict.iteritems() and dict.has_key() were removed in Python 3; counting
    # the values with Counter behaves the same on Python 2.7 and Python 3.
    # Convert back to a plain dict so looking up an unknown product still
    # raises KeyError instead of silently yielding 0.
    return dict(Counter(codon_table.values()))
def possible_rna_strings(input, modulus=None, frequencies=None):
    """Count the RNA strings that could have been translated into a protein.

    The count is the product of the number of codons for each amino acid in
    the protein, multiplied by the number of stop codons, because every
    translated RNA string must end with one.

    Args:
        input: Protein string, one character per amino acid. (The name
            shadows the builtin but is kept so existing callers that pass it
            by keyword keep working.)
        modulus: Optional. When given, the running product is reduced modulo
            this value after every multiplication, so the intermediate
            number never grows large. The result equals the full count
            modulo ``modulus``. When omitted, the exact count is returned.
        frequencies: Optional mapping of product -> number of codons,
            including a 'Stop' entry. Defaults to codon_frequencies().

    Returns:
        The number of candidate RNA strings, reduced modulo ``modulus`` if
        one was supplied.

    Raises:
        KeyError: If a character of ``input`` (or 'Stop') has no entry in
            the frequency mapping.
    """
    if frequencies is None:
        frequencies = codon_frequencies()

    # Start from the stop codons: an empty protein still needs a terminator.
    count = frequencies['Stop']
    if modulus is not None:
        count %= modulus

    for amino_acid in input:
        count *= frequencies[amino_acid]
        if modulus is not None:
            # (a * b) mod n == ((a mod n) * b) mod n, so reducing at each
            # step gives the same answer as reducing once at the end.
            count %= modulus

    return count
if __name__ == "__main__":
    # Sample input from the problem statement (expected answer: 12); kept
    # for quick manual checks.
    small_dataset = "MA"

    # Read the dataset through a context manager so the file handle is
    # closed promptly instead of being left to the garbage collector.
    with open('datasets/rosalind_mrna.txt') as dataset_file:
        large_dataset = dataset_file.read().strip()

    # The problem asks for the count modulo 1,000,000. print(...) with a
    # single argument works on both Python 2 and Python 3.
    print(possible_rna_strings(large_dataset) % 1000000)