-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathexpand-cigar.py
118 lines (99 loc) · 3.35 KB
/
expand-cigar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import sys
import pysam
import argparse
"""Author: Aaron R. Quinlan, December 2014"""
# http://pysam.readthedocs.org/en/latest/api.html#pysam.AlignedRead.cigar
MATCH = 0 # M
INS = 1 # I
DEL = 2 # D
SKIP = 3 # N
SOFT = 4 # S
HARD = 5 # H
PAD = 6 # P
EQUAL = 7 # =
DIFF = 8 # X
def get_chrom(fasta_fh, chrom):
"""
Return the chromosome sequence
"""
return fasta_fh.fetch(chrom)
def expand_match(qry_seq, ref_seq):
"""
Return an expanded list of CIGAR ops
based upon nuceltotide matches (=, or 7)
and mismatches (X, or 8)
"""
prev_op = None
curr_op = None
length = 1
for idx, q_nucl in enumerate(qry_seq):
if q_nucl == ref_seq[idx]: curr_op = 7 # EQUAL (=)
else: curr_op = 8 # DIFF (X)
if curr_op == prev_op:
length += 1
elif prev_op is not None:
yield (prev_op, length)
length = 1
prev_op = curr_op
yield (curr_op, length)
def main(args):
bam = pysam.AlignmentFile(args.bam)
fa = pysam.FastaFile(args.fasta)
outfile = pysam.AlignmentFile("-", "wb", template=bam)
prev_chrom_id = None
curr_chrom_id = None
curr_chrom_seq = None
for read in bam:
# skip unaligned reads.
if read.is_unmapped:
continue
curr_chrom_id = read.reference_id
# load the current chromosome into memory
if curr_chrom_id != prev_chrom_id:
curr_chrom_seq = get_chrom(fa, bam.getrname(curr_chrom_id))
# iterate through the existing CIGAR
# and replace M ops with expanded X and = ops.
ref_pos = read.reference_start
qry_pos = 0
new_cigar = []
for cigar_tuple in read.cigar:
op = cigar_tuple[0]
op_len = cigar_tuple[1]
if op == EQUAL or op == DIFF or op == MATCH:
if op == MATCH:
qry_seq = read.query_sequence[qry_pos:qry_pos + op_len]
ref_seq = curr_chrom_seq[ref_pos:ref_pos + op_len]
if qry_seq == ref_seq:
new_cigar.append((7, len(qry_seq))) # EQUAL (=)
else: # expand the M CIGAR op into X and = ops.
for new_cigar_tuple in expand_match(qry_seq, ref_seq):
new_cigar.append(new_cigar_tuple)
else:
new_cigar.append(cigar_tuple)
ref_pos += op_len
qry_pos += op_len
elif op == DEL or op == SKIP:
ref_pos += op_len
new_cigar.append(cigar_tuple)
elif op == INS or op == SOFT:
qry_pos += op_len
new_cigar.append(cigar_tuple)
elif op == HARD:
new_cigar.append(cigar_tuple)
# replace the old CIGAR and write updated record.
read.cigar=new_cigar
outfile.write(read)
prev_chrom_id=curr_chrom_id
outfile.close()
if __name__ == "__main__":
parser=argparse.ArgumentParser(prog='expand_cigar.py')
parser.add_argument('--bam',
dest='bam',
metavar='STRING',
help='The sorted BAM file whose CIGARs you wish expand.')
parser.add_argument('--fasta',
dest='fasta',
metavar='STRING',
help='The reference genome used to create the BAM.')
args=parser.parse_args()
main(args)