-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathinsert_size.nim
194 lines (170 loc) · 5.71 KB
/
insert_size.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import hts
import sequtils
import strutils
import strformat
import math
import os
import colorize
import algorithm
import threadpool
import utils/helpers
#import ggplotnim
const INS_ARR = 10000
const insert_size_header* = ["median",
"mean",
"std_dev",
"min",
"percentile_99.5",
"max_all",
"n_reads",
"n_accept",
"n_use",
"sample"].join("\t")
const distribution_header = ["insert_size", "count", "sample"].join("\t")
proc accept_record(r: Record): bool =
if (r.flag.pair == false or
r.flag.unmapped == true or
r.flag.mate_unmapped or
r.flag.read1 == true or
r.flag.secondary or
r.flag.supplementary or
r.flag.dup or
r.isize == 0):
return false
return true
proc mean_freq(arr: seq[int64]): float64 =
var total = 0i64
for idx, val in arr:
total += (idx+1)*val
return total.float / sum(arr).float
proc median_freq(arr: seq[int64]): int64 =
# Calculate the median index of a frequency distribution
var total = sum(arr)
var running_total = 0i64
for idx, val in arr:
running_total += val
if running_total.float / total.float >= 0.5:
return idx
proc bam_sample(b: Bam): string =
# Fetches the sample name from the header
for line in splitLines($(b.hdr)):
if line.startsWith("@RG"):
for field in line.split("\t"):
if field.startswith("SM:"):
return field.replace("SM:", "")
return ""
type
chrom_freqs = ref object
inserts, overflow: seq[int64]
n_reads, n_accept: int
proc freq_inserts(bamfile: string, contig: string, verbose: bool): chrom_freqs =
var
b: Bam
n_reads = 0
n_accept = 0
inserts = new_seq[int64](INS_ARR)
overflow: seq[int64]
open(b, bamfile, index=true)
for record in b.query(contig):
n_reads += 1
if record.accept_record():
n_accept += 1
var insert_val = abs(record.isize)
if insert_val <= inserts.len:
inserts[insert_val-1] += 1
else:
overflow.add(insert_val)
close(b)
if verbose:
stderr.write_line fmt"{contig} complete".fgBlue
return chrom_freqs(inserts: inserts,
overflow: overflow,
n_reads: n_reads,
n_accept: n_accept)
proc cmd_insert_size*(bamfile: string, distfile: string, verbose: bool, basename: bool, absolute: bool) =
#[
Calculates insert size
]#
let fname = bam_file.lastPathPart()
var
b: Bam
inserts = new_seq[int64](INS_ARR)
inserts_trimmed = new_seq[int64](INS_ARR)
overflow: seq[int64]
n_reads = 0
n_accept = 0
max_insert = 0i64
p99 = 0i64
open(b, bamfile, index=true)
var freq_results = newSeq[FlowVar[chrom_freqs]](b.hdr.targets().len)
for idx, contig in b.hdr.targets():
freq_results[idx] = spawn freq_inserts(bamfile, contig.name, verbose)
sync()
# Compile the results
for idx, contig in b.hdr.targets():
var result = ^freq_results[idx]
for i, v in result.inserts:
inserts[i] += v
for i, v in result.overflow:
overflow.add(v)
n_reads += result.n_reads
n_accept += result.n_accept
close(b)
# Calculate the median insert size
var total_length = sum(inserts) + overflow.len
# Identify midpoint index
var running_total = 0.int64
var cumulative_sum: seq[int64]
# Trim last 0.5%
for idx, val in inserts:
running_total += val
cumulative_sum.add(running_total)
if running_total.float / total_length.float <= 0.995:
inserts_trimmed[idx] = val
p99 = idx + 1
#echo inserts_trimmed
let median_insert_size = median_freq(inserts_trimmed)
let mean_insert_size = mean_freq(inserts_trimmed)
let min_insert_size = inserts_trimmed.find(inserts_trimmed.filterIt(it > 0)[0]) + 1
# # Plotting
# let x = toSeq(1..inserts_trimmed.len)
# #echo x
# let y = inserts_trimmed
# #echo inserts_trimmed
# var df = seqsToDf(x, y).filter( f{ "y" > 0 } )
# echo df
# ggplot(df, aes(x = "x", y = "y")) +
# geom_point() +
# ggtitle("ggplotnim - or I Suck At Naming Things™") +
# ggsave("scatterColor.pdf")
# Output distribution
if distfile != "":
var f = open(distfile, fmWrite)
f.writeLine(output_header(distribution_header, basename, absolute))
for idx, val in inserts_trimmed.filterIt(it > 0):
f.writeLine(output_w_fnames([$idx, $val, bam_sample(b), fname].join("\t"), bamfile, basename, absolute))
f.close()
# Calc max
if overflow.len > 0:
max_insert = max(overflow.mapIt(it.int))
else:
max_insert = inserts.len.int64 - inserts.reversed().find(inserts.filterIt(it > 0)[^1])
# Calc sd
# Standard dev
let n = sum(inserts[1..p99 - 1])
var m = 0.int64
for idx, val in inserts[0..p99 - 1]:
m += val * (idx + 1)^2
let variance = (m.float - (n.float*(mean_insert_size^2))) / (n - 1).float
let std_dev = pow(variance, 0.5)
var header_out = [$median_insert_size,
fmt"{mean_insert_size:0.3f}",
fmt"{std_dev:0.3f}",
$min_insert_size,
$p99,
$max_insert,
$n_reads,
$n_accept,
$(sum(inserts_trimmed)),
bam_sample(b)]
echo output_w_fnames(header_out.join("\t"), bamfile, basename, absolute)