Skip to content

Commit bc9ad52

Browse files
authored
Merge pull request #44 from alyst/agn_pos
Add alignment string position support
2 parents 118c5ad + cc8237c commit bc9ad52

File tree

9 files changed

+236
-157
lines changed

9 files changed

+236
-157
lines changed

src/BioAlignments.jl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,14 @@ export
2121
AlignmentAnchor,
2222
Alignment,
2323
AlignedSequence,
24+
2425
seq2ref,
2526
ref2seq,
27+
seq2aln,
28+
ref2aln,
29+
aln2seq,
30+
aln2ref,
31+
2632
ismatchop,
2733
isinsertop,
2834
isdeleteop,

src/alignedseq.jl

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ function AlignedSequence(seq::BioSequences.BioSequence, seqpos::Integer,
2727
end
2828
seqpos -= 1
2929
refpos -= 1
30+
alnpos = 0
3031
op = OP_START
3132
newseq = similar(seq, 0) # sequence without gap symbols
3233
anchors = AlignmentAnchor[]
@@ -44,7 +45,7 @@ function AlignedSequence(seq::BioSequences.BioSequence, seqpos::Integer,
4445
end
4546

4647
if op′ != op
47-
push!(anchors, AlignmentAnchor(seqpos, refpos, op))
48+
push!(anchors, AlignmentAnchor(seqpos, refpos, alnpos, op))
4849
op = op′
4950
end
5051

@@ -55,8 +56,9 @@ function AlignedSequence(seq::BioSequences.BioSequence, seqpos::Integer,
5556
if y != BioSequences.gap(eltype(ref))
5657
refpos += 1
5758
end
59+
alnpos += 1 # either the sequence or the reference has no gap at this column
5860
end
59-
push!(anchors, AlignmentAnchor(seqpos, refpos, op))
61+
push!(anchors, AlignmentAnchor(seqpos, refpos, alnpos, op))
6062
return AlignedSequence(newseq, anchors)
6163
end
6264

@@ -70,13 +72,12 @@ function IntervalTrees.last(alnseq::AlignedSequence)
7072
return alnseq.aln.lastref
7173
end
7274

73-
function seq2ref(alnseq::AlignedSequence, i::Integer)
74-
return seq2ref(alnseq.aln, i)
75-
end
76-
77-
function ref2seq(alnseq::AlignedSequence, i::Integer)
78-
return ref2seq(alnseq.aln, i)
79-
end
75+
seq2ref(alnseq::AlignedSequence, i) = seq2ref(alnseq.aln, i)
76+
ref2seq(alnseq::AlignedSequence, i) = ref2seq(alnseq.aln, i)
77+
seq2aln(alnseq::AlignedSequence, i) = seq2aln(alnseq.aln, i)
78+
ref2aln(alnseq::AlignedSequence, i) = ref2aln(alnseq.aln, i)
79+
aln2seq(alnseq::AlignedSequence, i) = aln2seq(alnseq.aln, i)
80+
aln2ref(alnseq::AlignedSequence, i) = aln2ref(alnseq.aln, i)
8081

8182
# simple letters and dashes representation of an alignment
8283
function Base.show(io::IO, alnseq::AlignedSequence)

src/alignment.jl

Lines changed: 107 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
# License is MIT: https://github.com/BioJulia/Bio.jl/blob/master/LICENSE.md
88

99
"""
10-
Alignment of two sequences.
10+
Defines how to align a given sequence onto a reference sequence.
11+
The alignment is represented as a sequence of elementary operations (match, insertion, deletion, etc.)
12+
anchored to specific positions of the input and reference sequences.
1113
"""
1214
struct Alignment
1315
anchors::Vector{AlignmentAnchor}
@@ -58,9 +60,10 @@ function Alignment(cigar::AbstractString, seqpos::Int=1, refpos::Int=1)
5860
# path starts prior to the first aligned position pair
5961
seqpos -= 1
6062
refpos -= 1
63+
alnpos = 0
6164

6265
n = 0
63-
anchors = AlignmentAnchor[AlignmentAnchor(seqpos, refpos, OP_START)]
66+
anchors = AlignmentAnchor[AlignmentAnchor(seqpos, refpos, alnpos, OP_START)]
6467
for c in cigar
6568
if isdigit(c)
6669
n = n * 10 + convert(Int, c - '0')
@@ -79,8 +82,9 @@ function Alignment(cigar::AbstractString, seqpos::Int=1, refpos::Int=1)
7982
else
8083
error("The $(op) CIGAR operation is not yet supported.")
8184
end
85+
alnpos += n
8286

83-
push!(anchors, AlignmentAnchor(seqpos, refpos, op))
87+
push!(anchors, AlignmentAnchor(seqpos, refpos, alnpos, op))
8488
n = 0
8589
end
8690
end
@@ -100,41 +104,74 @@ function Base.show(io::IO, aln::Alignment)
100104
print(io, " CIGAR string: ", cigar(aln))
101105
end
102106

103-
"""
104-
seq2ref(aln::Alignment, i::Integer)::Tuple{Int,Operation}
105-
106-
Map a position `i` from sequence to reference.
107-
"""
108-
function seq2ref(aln::Alignment, i::Integer)::Tuple{Int,Operation}
109-
idx = findanchor(aln, i, Val{true})
107+
# generic function for mapping between sequence, reference and alignment positions
108+
# srcpos specifies the anchor source position getter (seqpos, refpos or alnpos)
109+
# destpos specifies the anchor destination position getter (seqpos, refpos or alnpos)
110+
function pos2pos(aln::Alignment, i::Integer,
111+
srcpos::Function, destpos::Function)::Tuple{Int,Operation}
112+
idx = findanchor(aln, i, srcpos)
110113
if idx == 0
111-
throw(ArgumentError("invalid sequence position: $i"))
114+
if srcpos === seqpos
115+
throw(ArgumentError("invalid sequence position: $i"))
116+
elseif srcpos === refpos
117+
throw(ArgumentError("invalid reference position: $i"))
118+
elseif srcpos === alnpos
119+
throw(ArgumentError("invalid alignment position: $i"))
120+
else
121+
throw(ArgumentError("Unknown position getter: $srcpos"))
122+
end
112123
end
113124
anchor = aln.anchors[idx]
114-
refpos = anchor.refpos
115-
if ismatchop(anchor.op)
116-
refpos += i - anchor.seqpos
125+
pos = destpos(anchor)
126+
if ismatchop(anchor.op) ||
127+
((srcpos === alnpos) && ((destpos === seqpos) && isinsertop(anchor.op) || (destpos === refpos) && isdeleteop(anchor.op))) ||
128+
((destpos === alnpos) && ((srcpos === seqpos) && isinsertop(anchor.op) || (srcpos === refpos) && isdeleteop(anchor.op)))
129+
pos += i - srcpos(anchor)
117130
end
118-
return refpos, anchor.op
131+
return pos, anchor.op
119132
end
120133

121134
"""
122-
ref2seq(aln::Alignment, i::Integer)::Tuple{Int,Operation}
135+
seq2ref(aln::Union{Alignment, AlignedSequence, PairwiseAlignment}, i::Integer)::Tuple{Int,Operation}
136+
137+
Map a position `i` from sequence to reference.
138+
"""
139+
seq2ref(aln::Alignment, i::Integer) = pos2pos(aln, i, seqpos, refpos)
140+
141+
"""
142+
ref2seq(aln::Union{Alignment, AlignedSequence, PairwiseAlignment}, i::Integer)::Tuple{Int,Operation}
123143
124144
Map a position `i` from reference to sequence.
125145
"""
126-
function ref2seq(aln::Alignment, i::Integer)::Tuple{Int,Operation}
127-
idx = findanchor(aln, i, Val{false})
128-
if idx == 0
129-
throw(ArgumentError("invalid reference position: $i"))
130-
end
131-
anchor = aln.anchors[idx]
132-
seqpos = anchor.seqpos
133-
if ismatchop(anchor.op)
134-
seqpos += i - anchor.refpos
135-
end
136-
return seqpos, anchor.op
137-
end
146+
ref2seq(aln::Alignment, i::Integer) = pos2pos(aln, i, refpos, seqpos)
147+
148+
"""
149+
seq2aln(aln::Union{Alignment, AlignedSequence, PairwiseAlignment}, i::Integer)::Tuple{Int,Operation}
150+
151+
Map a position `i` from the input sequence to the alignment sequence.
152+
"""
153+
seq2aln(aln::Alignment, i::Integer) = pos2pos(aln, i, seqpos, alnpos)
154+
155+
"""
156+
ref2aln(aln::Union{Alignment, AlignedSequence, PairwiseAlignment}, i::Integer)::Tuple{Int,Operation}
157+
158+
Map a position `i` from the reference sequence to the alignment sequence.
159+
"""
160+
ref2aln(aln::Alignment, i::Integer) = pos2pos(aln, i, refpos, alnpos)
161+
162+
"""
163+
aln2seq(aln::Union{Alignment, AlignedSequence, PairwiseAlignment}, i::Integer)::Tuple{Int,Operation}
164+
165+
Map a position `i` from the alignment sequence to the input sequence.
166+
"""
167+
aln2seq(aln::Alignment, i::Integer) = pos2pos(aln, i, alnpos, seqpos)
168+
169+
"""
170+
aln2ref(aln::Union{Alignment, AlignedSequence, PairwiseAlignment}, i::Integer)::Tuple{Int,Operation}
171+
172+
Map a position `i` from the alignment sequence to the reference sequence.
173+
"""
174+
aln2ref(aln::Alignment, i::Integer) = pos2pos(aln, i, alnpos, refpos)
138175

139176
"""
140177
cigar(aln::Alignment)
@@ -172,65 +209,68 @@ function check_alignment_anchors(anchors)
172209
end
173210

174211
for i in 2:lastindex(anchors)
175-
if anchors[i].refpos < anchors[i-1].refpos ||
176-
anchors[i].seqpos < anchors[i-1].seqpos
212+
@inbounds acur, aprev = anchors[i], anchors[i-1]
213+
if acur.refpos < aprev.refpos || acur.seqpos < aprev.seqpos || acur.alnpos < aprev.alnpos
177214
error("Alignment anchors must be sorted.")
178215
end
179216

180-
op = anchors[i].op
181-
if convert(UInt8, op) > convert(UInt8, OP_MAX_VALID)
217+
op = acur.op
218+
if !isvalid(op)
182219
error("Anchor at index $(i) has an invalid operation.")
183220
end
184221

185222
# reference skip/delete operations
186223
if isdeleteop(op)
187-
if anchors[i].seqpos != anchors[i-1].seqpos
188-
error("Invalid anchor positions for reference deletion.")
224+
if acur.seqpos != aprev.seqpos
225+
error("Invalid anchor sequence positions for reference deletion.")
226+
end
227+
if acur.alnpos - aprev.alnpos != acur.refpos - aprev.refpos
228+
error("Invalid anchor reference positions for reference deletion.")
189229
end
190230
# reference insertion operations
191231
elseif isinsertop(op)
192-
if anchors[i].refpos != anchors[i-1].refpos
193-
error("Invalid anchor positions for reference insertion.")
232+
if acur.refpos != aprev.refpos
233+
error("Invalid anchor reference positions for reference insertion.")
234+
end
235+
if acur.alnpos - aprev.alnpos != acur.seqpos - aprev.seqpos
236+
error("Invalid anchor sequence positions for reference insertion.")
194237
end
195238
# match operations
196239
elseif ismatchop(op)
197-
if anchors[i].refpos - anchors[i-1].refpos !=
198-
anchors[i].seqpos - anchors[i-1].seqpos
199-
error("Invalid anchor positions for match operation.")
240+
if (acur.refpos - aprev.refpos != acur.seqpos - aprev.seqpos) ||
241+
(acur.alnpos - aprev.alnpos != acur.seqpos - aprev.seqpos)
242+
error("Invalid anchor positions for match operation.")
200243
end
201244
end
202245
end
203246
end
204247

205-
# find the index of the first anchor that satisfies `i ≤ pos`
206-
@generated function findanchor(aln::Alignment, i::Integer, ::Type{Val{isseq}}) where isseq
207-
pos = isseq ? :seqpos : :refpos
208-
quote
209-
anchors = aln.anchors
210-
lo = 1
211-
hi = lastindex(anchors)
212-
if !(anchors[lo].$pos < i ≤ anchors[hi].$pos)
213-
return 0
214-
end
215-
# binary search
216-
@inbounds while hi - lo > 2
217-
m = (lo + hi) >> 1
218-
if anchors[m].$pos < i
219-
lo = m
220-
else # i ≤ anchors[m].$pos
221-
hi = m
222-
end
223-
# invariant (activate this for debugging)
224-
#@assert anchors[lo].$pos < i ≤ anchors[hi].$pos
248+
# find the index of the first anchor that satisfies `i ≤ pos(anchor)`
249+
function findanchor(aln::Alignment, i::Integer, pos::Function)
250+
anchors = aln.anchors
251+
lo = 1
252+
hi = lastindex(anchors)
253+
@inbounds if !(pos(anchors[lo]) < i ≤ pos(anchors[hi]))
254+
return 0
255+
end
256+
# binary search
257+
@inbounds while hi - lo > 2
258+
m = (lo + hi) >> 1
259+
if pos(anchors[m]) < i
260+
lo = m
261+
else # i ≤ pos(anchors[m])
262+
hi = m
225263
end
226-
# linear search
227-
@inbounds for j in lo+1:hi
228-
if i ≤ aln.anchors[j].$pos
229-
return j
230-
end
264+
# invariant (activate this for debugging)
265+
#@assert pos(anchors[lo]) < i ≤ pos(anchors[hi])
266+
end
267+
# linear search
268+
@inbounds for j in lo+1:hi
269+
if i ≤ pos(aln.anchors[j])
270+
return j
231271
end
232-
# do not reach here
233-
@assert false
234-
return 0
235272
end
273+
# do not reach here
274+
@assert false
275+
return 0
236276
end

src/anchors.jl

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,15 @@ Alignment operation with anchoring positions.
1212
struct AlignmentAnchor
1313
seqpos::Int
1414
refpos::Int
15+
alnpos::Int
1516
op::Operation
1617
end
1718

18-
function AlignmentAnchor(pos::Tuple{Int,Int}, op)
19-
return AlignmentAnchor(pos[1], pos[2], op)
20-
end
19+
seqpos(anc::AlignmentAnchor) = anc.seqpos
20+
refpos(anc::AlignmentAnchor) = anc.refpos
21+
alnpos(anc::AlignmentAnchor) = anc.alnpos
2122

2223
function Base.show(io::IO, anc::AlignmentAnchor)
23-
print(io, "AlignmentAnchor(", anc.seqpos, ", ", anc.refpos, ", '", anc.op, "')")
24+
print(io, "AlignmentAnchor(", anc.seqpos, ", ", anc.refpos,
25+
", ", anc.alnpos, ", '", anc.op, "')")
2426
end

src/operations.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ function Base.convert(::Type{Char}, op::Operation)
113113
end
114114

115115
function Base.print(io::IO, op::Operation)
116-
write(io, convert(Char, op))
116+
write(io, isvalid(op) ? convert(Char, op) : '?')
117117
return
118118
end
119119

src/pairwise/algorithms/common.jl

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,27 +32,44 @@ const TRACE_EXTINS = 0b10000
3232

3333
macro start_traceback()
3434
esc(quote
35-
anchor_point = (i, j)
35+
__alnpos = 0
36+
anchor_point = (i, j, __alnpos)
3637
op = OP_INVALID
3738
end)
3839
end
3940

41+
# reverses the anchors sequence at the end of the traceback
42+
# and offsets the alignment positions, so that (by default) it starts from 0
43+
function reverse_anchors!(v::AbstractVector{AlignmentAnchor},
44+
alignment_offset=!isempty(v) ? -v[end].alnpos : 0)
45+
r = n = length(v)
46+
@inbounds for i in 1:fld1(n, 2)
47+
vr = v[r]
48+
vi = v[i]
49+
v[i] = AlignmentAnchor(vr.seqpos, vr.refpos, vr.alnpos + alignment_offset, vr.op)
50+
(i != r) && (v[r] = AlignmentAnchor(vi.seqpos, vi.refpos, vi.alnpos + alignment_offset, vi.op))
51+
r -= 1
52+
end
53+
return v
54+
end
55+
4056
macro finish_traceback()
4157
esc(quote
42-
push!(anchors, AlignmentAnchor(anchor_point, op))
43-
push!(anchors, AlignmentAnchor((i, j), OP_START))
44-
reverse!(anchors)
58+
push!(anchors, AlignmentAnchor(anchor_point..., op))
59+
push!(anchors, AlignmentAnchor(i, j, __alnpos, OP_START))
60+
reverse_anchors!(anchors)
4561
pop!(anchors) # remove OP_INVALID
4662
end)
4763
end
4864

4965
macro anchor(ex)
5066
esc(quote
5167
if op != $ex
52-
push!(anchors, AlignmentAnchor(anchor_point, op))
68+
push!(anchors, AlignmentAnchor(anchor_point..., op))
5369
op = $ex
54-
anchor_point = (i, j)
70+
anchor_point = (i, j, __alnpos)
5571
end
72+
__alnpos -= 1
5673
if ismatchop(op)
5774
i -= 1
5875
j -= 1

0 commit comments

Comments
 (0)