forked from mtarbit/Rosalind-Problems
-
Notifications
You must be signed in to change notification settings - Fork 0
/
e028-sseq.py
executable file
·57 lines (44 loc) · 1.47 KB
/
e028-sseq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python
# coding=utf-8
# Finding a Spliced Motif
# =======================
#
# A subsequence of a string is a collection of symbols contained in order
# (though not necessarily contiguously) in the string (e.g., ACG is a
# subsequence of TATGCTAAGATC). The indices of a subsequence are the positions
# in the string at which the symbols of the subsequence appear; thus, the
# indices of ACG in TATGCTAAGATC can be represented by (2, 5, 9).
#
# As a substring can have multiple locations, a subsequence can have multiple
# collections of indices, and the same index can be reused in more than one
# appearance of the subsequence; for example, ACG is a subsequence of AACCGGTT
# in 8 different ways.
#
# Given: Two DNA strings s and t (each of length at most 1 kbp).
#
# Return: One collection of indices of s in which the symbols of t appear as a
# subsequence of s. If multiple solutions exist, you may return any one.
#
# Sample Dataset
# --------------
# ACGTACGTGACG
# GTA
#
# Sample Output
# -------------
# 3 8 10
def subsequence_indices(s):
indices = []
s, t = s.split()
i = j = 0
while i < len(s) and j < len(t):
if s[i] == t[j]:
indices.append(i + 1)
j += 1
i += 1
return indices
if __name__ == "__main__":
small_dataset = "ACGTACGTGACG\nGTA"
large_dataset = open('datasets/rosalind_sseq.txt').read().strip()
result = subsequence_indices(large_dataset)
print ' '.join(map(str, result))