forked from NickSto/dunovo
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathswalign.py
104 lines (84 loc) · 3.22 KB
/
swalign.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
import errno
import ctypes
import string
# Locate the library file.
LIBFILE = 'libswalign.so'
script_dir = os.path.dirname(os.path.realpath(__file__))
library_path = os.path.join(script_dir, LIBFILE)
if not os.path.isfile(library_path):
ioe = IOError('Library file "'+LIBFILE+'" not found.')
ioe.errno = errno.ENOENT
raise ioe
swalign = ctypes.cdll.LoadLibrary(library_path)
REVCOMP_TABLE = string.maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB')
# C struct for ctypes
class SeqPairC(ctypes.Structure):
_fields_ = [
('a', ctypes.c_char_p),
('alen', ctypes.c_uint),
('b', ctypes.c_char_p),
('blen', ctypes.c_uint),
]
# C struct for ctypes
class AlignC(ctypes.Structure):
_fields_ = [
('seqs', ctypes.POINTER(SeqPairC)),
('start_a', ctypes.c_int),
('start_b', ctypes.c_int),
('end_a', ctypes.c_int),
('end_b', ctypes.c_int),
('matches', ctypes.c_int),
('score', ctypes.c_double),
]
# The Python version
class Align(object):
def __init__(self, align_c):
self.target = align_c.seqs.contents.a
self.query = align_c.seqs.contents.b
# Where the first base of the target aligns on the query, in query coordinates (or 1, if <= 0).
self.start_target = align_c.start_a
# Where the first base of the query aligns on the target, in target coordinates (or 1, if <= 0).
self.start_query = align_c.start_b
# Where the last base of the target aligns on the query, in query coordinates.
self.end_target = align_c.end_a
# Where the last base of the query aligns on the target, in target coordinates.
self.end_query = align_c.end_b
self.matches = align_c.matches
self.score = align_c.score
# Provide this common function.
def __str__(self):
"""Print a human-readable representation of the alignment."""
start_query = str(self.start_query)
start_target = str(self.start_target)
start_width = str(max(len(start_query), len(start_target)))
line_format = '{:'+start_width+'} {} {}'
output = line_format.format(start_target, self.target, self.end_target) + '\n'
output += line_format.format(start_query, self.query, self.end_query)
return output
# Initialize functions (define types).
swalign.smith_waterman.restype = ctypes.POINTER(AlignC)
swalign.revcomp.restype = ctypes.c_char_p
def smith_waterman(target, query):
seq_pair = SeqPairC(target, len(target), query, len(query))
align_c = swalign.smith_waterman(ctypes.pointer(seq_pair), 1).contents
return Align(align_c)
def smith_waterman_duplex(target, query):
"""Smith-Waterman align query to target in both orientations and return the best.
Convenience function that calls smith_waterman() twice, and returns the
alignment with the highest score."""
align = smith_waterman(target, query)
query_rc = revcomp(query)
align_rc = smith_waterman(target, query_rc)
if align_rc.score > align.score:
return align_rc
else:
return align
def revcomp(seq):
"""Return the reverse complement of the input sequence.
Leaves the input string unaltered."""
return seq.translate(REVCOMP_TABLE)[::-1]
def revcomp_inplace(seq):
"""Convert the input sequence to its reverse complement.
WARNING: This will alter the input string in-place!"""
swalign.revcomp(seq)