forked from mtarbit/Rosalind-Problems
-
Notifications
You must be signed in to change notification settings - Fork 0
/
e012-grph.py
executable file
·95 lines (79 loc) · 2.47 KB
/
e012-grph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python
# coding=utf-8
# Overlap Graphs
# ==============
#
# A graph whose nodes have all been labeled can be represented by an adjacency
# list, in which each row of the list contains the two node labels corresponding
# to a unique edge.
#
# A directed graph (or digraph) is a graph containing directed edges, each of
# which has an orientation. That is, a directed edge is represented by an arrow
# instead of simply a segment; the starting and ending nodes of an edge form its
# tail and head, respectively. The directed edge with tail v and head w is
# represented by (v,w) (but not by (w,v)). A directed loop is a directed edge of
# the form (v,v).
#
# For a collection of strings and a positive integer k, the overlap graph for
# the strings is a directed graph Ok in which each string is represented by a
# node, and string s is connected to string t with a directed edge if and only
# if there is a length k suffix of s that matches a length k prefix of t.
# Directed loops are not allowed in the overlap graph.
#
# Given: A collection of DNA strings in FASTA format having total length at most
# 10 kbp.
#
# Return: The adjacency list corresponding to O3.
#
# Sample Dataset
# --------------
# >Rosalind_0498
# AAATAAA
# >Rosalind_2391
# AAATTTT
# >Rosalind_2323
# TTTTCCC
# >Rosalind_0442
# AAATCCC
# >Rosalind_5013
# GGGTGGG
#
# Sample Output
# -------------
# Rosalind_0498 Rosalind_2391
# Rosalind_0498 Rosalind_0442
# Rosalind_2391 Rosalind_2323
def parse_fasta(fasta):
results = []
strings = fasta.strip().split('>')
for s in strings:
if len(s):
parts = s.split()
k = parts[0]
v = ''.join(parts[1:])
results.append((k, v))
return results
def overlap_graph(fasta, n):
results = []
dna = parse_fasta(fasta)
for k1, v1 in dna:
for k2, v2 in dna:
if k1 != k2 and v1.endswith(v2[:n]):
results.append((k1, k2))
return results
if __name__ == "__main__":
small_dataset = """
>Rosalind_0498
AAATAAA
>Rosalind_2391
AAATTTT
>Rosalind_2323
TTTTCCC
>Rosalind_0442
AAATCCC
>Rosalind_5013
GGGTGGG
"""
large_dataset = open('datasets/rosalind_grph.txt').read()
for edge in overlap_graph(large_dataset, 3):
print edge[0], edge[1]