@@ -29,7 +29,7 @@ get_source_idx = function(protocol = NULL) {
29
29
col_names = c(" chr" , " start" , " M" , " U" ),
30
30
fix_missing = c(" cov := M+U" , " beta := M/cov" ), select = TRUE ))
31
31
} else {
32
- # Bismark
32
+ # Bismark
33
33
return (list (col_idx = list (character = 1 , numeric = 2 , numeric = 4 , numeric = 5 , numeric = 6 ),
34
34
col_names = c(" chr" , " start" , " beta" , " M" , " U" ),
35
35
fix_missing = c(" cov := M+U" ), select = TRUE ))
@@ -157,7 +157,7 @@ read_bdg = function(bdg, col_list = NULL, genome = NULL, verbose = TRUE,
157
157
strand_collapse = FALSE , fill_cpgs = TRUE ,
158
158
contigs = contigs , synced_coordinates = synced_coordinates ,
159
159
file_uncovered = NULL , zero_based = TRUE ) {
160
-
160
+
161
161
chr <- M <- U <- . <- NULL
162
162
message(paste0(" -Processing: " , basename(bdg )))
163
163
if (col_list $ select ){
@@ -168,14 +168,14 @@ read_bdg = function(bdg, col_list = NULL, genome = NULL, verbose = TRUE,
168
168
colClasses = col_list $ col_classes ,
169
169
verbose = FALSE ,
170
170
showProgress = FALSE ))
171
-
171
+
172
172
colnames(bdg_dat )[col_list $ col_idx ] = names(col_list $ col_idx )
173
173
bdg_dat [, `:=`(chr , as.character(chr ))]
174
174
bdg_dat [, `:=`(start , as.integer(start ))]
175
175
}
176
-
177
-
178
-
176
+
177
+
178
+
179
179
if (" beta" %in% colnames(bdg_dat )) {
180
180
if (nrow(bdg_dat ) < 1000 ) {
181
181
sample_row_idx = 1 : nrow(bdg_dat )
@@ -196,19 +196,19 @@ read_bdg = function(bdg, col_list = NULL, genome = NULL, verbose = TRUE,
196
196
}
197
197
gc(verbose = FALSE )
198
198
}
199
-
199
+
200
200
if (! is.null(col_list $ fix_missing )) {
201
201
for (cmd in col_list $ fix_missing ) {
202
202
bdg_dat [, eval(parse(text = cmd ))]
203
203
}
204
204
}
205
-
206
-
205
+
206
+
207
207
if (zero_based ) {
208
208
# Bring bedgraphs to 1-based cordinate
209
209
bdg_dat [, `:=`(start , start + 1 )]
210
210
}
211
-
211
+
212
212
# Check for contig prefixes and add them if necessary
213
213
if (nrow(bdg_dat ) < 1000 ) {
214
214
sample_row_idx = sample(x = seq_len(nrow(bdg_dat )),
@@ -227,33 +227,33 @@ read_bdg = function(bdg, col_list = NULL, genome = NULL, verbose = TRUE,
227
227
stop(" Prefix mismatch between provided CpGs and bedgraphs" )
228
228
}
229
229
}
230
-
230
+
231
231
if (! is.null(contigs )) {
232
232
bdg_dat = bdg_dat [chr %in% as.character(contigs )]
233
233
}
234
-
234
+
235
235
if (synced_coordinates ) {
236
236
bdg_dat = bdg_dat [strand == " -" , `:=`(start , start + 1L )]
237
237
}
238
-
238
+
239
239
data.table :: setkey(x = bdg_dat , " chr" , " start" )
240
240
dup_rows = nrow(bdg_dat [duplicated(bdg_dat , by = c(" chr" , " start" ))])
241
- if (nrow( dup_rows ) > 0 ){
241
+ if (dup_rows > 0 ){
242
242
message(paste0(" -- Removed duplicated CpGs: " , format(dup_rows , big.mark = " ," )))
243
243
bdg_dat = bdg_dat [! duplicated(bdg_dat , by = c(" chr" , " start" ))]
244
244
}
245
245
data.table :: setkey(x = genome , " chr" , " start" )
246
-
246
+
247
247
missing_cpgs = genome [! bdg_dat [, list (chr , start )], on = c(" chr" , " start" )]
248
-
248
+
249
249
# Write missing CpGs to an op_dir
250
250
if (! is.null(file_uncovered ) && nrow(missing_cpgs ) > 0 ) {
251
251
fwrite(x = missing_cpgs , file = paste0(file_uncovered ,
252
252
gsub(" \\ .[[:alnum:]]+(\\ .gz)?$" ,
253
253
" " , basename(bdg )), " _uncovered.bed" ),
254
254
sep = " \t " , row.names = FALSE )
255
255
}
256
-
256
+
257
257
if (verbose ) {
258
258
if (nrow(missing_cpgs ) > 0 ) {
259
259
message(paste0(" --CpGs missing: " , format(nrow(missing_cpgs ), big.mark = " ," )), " (from known reference CpGs)" )
@@ -271,7 +271,7 @@ read_bdg = function(bdg, col_list = NULL, genome = NULL, verbose = TRUE,
271
271
# crucial to make sure everything is in order
272
272
is_identical = all.equal(target = bdg_dat [, .(chr , start )],
273
273
current = genome [, .(chr , start )], ignore.row.order = FALSE )
274
-
274
+
275
275
if (is(is_identical , " character" )) {
276
276
non_ref_cpgs = bdg_dat [! genome [, list (chr , start )], on = c(" chr" ,
277
277
" start" )]
@@ -289,30 +289,30 @@ read_bdg = function(bdg, col_list = NULL, genome = NULL, verbose = TRUE,
289
289
# Re-assign strand info from genome (since some bedgraphs have no
290
290
# strand info, yet cover CpGs from both strands. i,e MethylDackel)
291
291
bdg_dat [, `:=`(strand , genome $ strand )]
292
-
292
+
293
293
if (strand_collapse ) {
294
294
# If strand information needs to collapsed, bring start position of
295
295
# crick strand to previous base (on watson base) and estimate new M, U
296
296
# and beta values
297
297
if (! all(c(" M" , " U" ) %in% names(bdg_dat ))) {
298
298
stop(" strand_collapse works only when M and U are available!" )
299
299
}
300
-
300
+
301
301
bdg_dat [, `:=`(start , ifelse(strand == " -" , yes = start - 1 , no = start ))]
302
302
bdg_dat = bdg_dat [, .(M = sum(M , na.rm = TRUE ), U = sum(U , na.rm = TRUE )),
303
303
.(chr , start )]
304
304
bdg_dat [, `:=`(cov , M + U )]
305
305
bdg_dat [, `:=`(beta , M / cov )]
306
306
bdg_dat [, `:=`(strand , " +" )]
307
307
}
308
-
308
+
309
309
# data.table::set(bdg_dat, which(is.nan(bdg_dat[,beta])), 'beta', NA)
310
310
# If coverage is 0, convert corresponding beta as well as coverage
311
311
# values to NA
312
312
data.table :: set(bdg_dat , which(bdg_dat [, cov ] == 0 ), c(" cov" , " beta" ),
313
313
NA )
314
314
bdg_dat = bdg_dat [, .(chr , start , beta , cov , strand )]
315
-
315
+
316
316
bdg_genome_stat = bdg_dat [! is.na(beta ), .(mean_meth = mean(beta ),
317
317
median_meth = median(beta ),
318
318
mean_cov = mean(cov ),
@@ -322,7 +322,7 @@ read_bdg = function(bdg, col_list = NULL, genome = NULL, verbose = TRUE,
322
322
mean_cov = mean(cov ),
323
323
median_cov = median(cov )), .(chr )]
324
324
bdg_ncpg_stat = bdg_dat [! is.na(beta ), .N , .(chr )]
325
-
325
+
326
326
return (list (bdg = bdg_dat , genome_stat = bdg_genome_stat ,
327
327
chr_stat = bdg_chr_stat ,
328
328
ncpg = bdg_ncpg_stat ))
@@ -338,13 +338,13 @@ vect_code_batch <- function(files, col_idx, batch_size, col_data = NULL,
338
338
. <- NULL
339
339
batches <- split(files , ceiling(seq_along(files )/ batch_size ))
340
340
batches_samp_names <- split(rownames(col_data ), ceiling(seq_along(rownames(col_data ))/ batch_size ))
341
-
341
+
342
342
beta_mat_final <- data.table :: data.table()
343
343
cov_mat_final <- data.table :: data.table()
344
344
genome_stat_final <- data.table :: data.table()
345
345
chr_stat_final <- data.table :: data.table()
346
346
ncpg_final <- data.table :: data.table()
347
-
347
+
348
348
for (i in seq_along(batches )) {
349
349
# browser()
350
350
message(paste0(" -Batch: " , i , " /" , length(batches )))
@@ -364,14 +364,14 @@ vect_code_batch <- function(files, col_idx, batch_size, col_data = NULL,
364
364
file_uncovered = file_uncovered , zero_based = zero_based )
365
365
}
366
366
names(bdgs ) <- samp_names
367
-
367
+
368
368
if (i == 1 ) {
369
369
cov_mat_final <- data.frame (lapply(bdgs , function (x ) x $ bdg [,
370
370
.(cov )]), stringsAsFactors = FALSE )
371
371
beta_mat_final <- data.frame (lapply(bdgs , function (x ) x $ bdg [,
372
372
.(beta )]), stringsAsFactors = FALSE )
373
373
colnames(cov_mat_final ) <- colnames(beta_mat_final ) <- samp_names
374
-
374
+
375
375
genome_stat_final <- data.table :: rbindlist(lapply(bdgs , function (x ) x $ genome_stat ),
376
376
use.names = TRUE , fill = TRUE , idcol = " Sample_Name" )
377
377
chr_stat_final <- data.table :: rbindlist(lapply(bdgs , function (x ) x $ chr_stat ),
@@ -386,7 +386,7 @@ vect_code_batch <- function(files, col_idx, batch_size, col_data = NULL,
386
386
colnames(cov_mat ) <- colnames(beta_mat ) <- samp_names
387
387
cov_mat_final <- cbind(cov_mat_final , cov_mat )
388
388
beta_mat_final <- cbind(beta_mat_final , beta_mat )
389
-
389
+
390
390
genome_stat_final <- rbind(genome_stat_final , data.table :: rbindlist(lapply(bdgs ,
391
391
function (x ) x $ genome_stat ), use.names = TRUE , fill = TRUE ,
392
392
idcol = " Sample_Name" ))
@@ -395,7 +395,7 @@ vect_code_batch <- function(files, col_idx, batch_size, col_data = NULL,
395
395
idcol = " Sample_Name" ))
396
396
ncpg_final <- rbind(ncpg_final , data.table :: rbindlist(lapply(bdgs ,
397
397
function (x ) x $ ncpg ), use.names = TRUE , fill = TRUE , idcol = " Sample_Name" ))
398
-
398
+
399
399
rm(cov_mat )
400
400
rm(beta_mat )
401
401
gc()
@@ -404,7 +404,7 @@ vect_code_batch <- function(files, col_idx, batch_size, col_data = NULL,
404
404
gc()
405
405
ncpg_final <- data.table :: dcast(data = ncpg_final , chr ~ Sample_Name ,
406
406
value.var = " N" )
407
-
407
+
408
408
return (list (beta_matrix = data.table :: setDT(beta_mat_final ), cov_matrix = data.table :: setDT(cov_mat_final ),
409
409
genome_stat = genome_stat_final , chr_stat = chr_stat_final , ncpg = ncpg_final ))
410
410
}
@@ -417,14 +417,14 @@ vect_code_batch <- function(files, col_idx, batch_size, col_data = NULL,
417
417
non_vect_code <- function (files , col_idx , coldata , verbose = TRUE , genome = NULL ,
418
418
h5temp = NULL , h5 = FALSE , strand_collapse = FALSE , contigs = contigs ,
419
419
synced_coordinates , file_uncovered = NULL , zero_based = TRUE ) {
420
-
420
+
421
421
Sample_Name <- . <- chr <- NULL
422
422
if (strand_collapse ) {
423
423
dimension <- as.integer(nrow(genome )/ 2 )
424
424
} else {
425
425
dimension <- as.integer(nrow(genome ))
426
426
}
427
-
427
+
428
428
if (h5 ) {
429
429
if (is.null(h5temp )) {
430
430
h5temp <- tempdir()
@@ -433,11 +433,11 @@ non_vect_code <- function(files, col_idx, coldata, verbose = TRUE, genome = NULL
433
433
while (any(c(paste0(" M_sink_" , sink_counter , " .h5" ), paste0(" cov_sink_" ,
434
434
sink_counter , " .h5" )) %in% dir(h5temp ))) {
435
435
sink_counter <- sink_counter + 1
436
-
436
+
437
437
}
438
438
grid <- DelayedArray :: RegularArrayGrid(refdim = c(dimension , length(files )),
439
439
spacings = c(dimension , 1L ))
440
-
440
+
441
441
M_sink <- HDF5Array :: HDF5RealizationSink(dim = c(dimension , length(files )),
442
442
dimnames = NULL , type = " double" ,
443
443
filepath = file.path(h5temp , paste0(" M_sink_" , sink_counter , " .h5" )), name = " M" , level = 6 )
@@ -449,14 +449,14 @@ non_vect_code <- function(files, col_idx, coldata, verbose = TRUE, genome = NULL
449
449
beta_mat <- data.table :: data.table()
450
450
cov_mat <- data.table :: data.table()
451
451
}
452
-
452
+
453
453
if (h5 ) {
454
454
for (i in seq_along(files )) {
455
455
if (i == 1 ) {
456
456
b <- read_bdg(bdg = files [i ], col_list = col_idx , genome = genome ,
457
457
strand_collapse = strand_collapse , contigs = contigs , synced_coordinates = synced_coordinates ,
458
458
file_uncovered = file_uncovered , zero_based = zero_based )
459
-
459
+
460
460
DelayedArray :: write_block(block = as.matrix(b $ bdg [, .(beta )]),
461
461
viewport = grid [[i ]], sink = M_sink )
462
462
DelayedArray :: write_block(block = as.matrix(b $ bdg [, .(cov )]),
@@ -471,7 +471,7 @@ non_vect_code <- function(files, col_idx, coldata, verbose = TRUE, genome = NULL
471
471
b <- read_bdg(bdg = files [i ], col_list = col_idx , genome = genome ,
472
472
strand_collapse = strand_collapse , contigs = contigs , synced_coordinates = synced_coordinates ,
473
473
file_uncovered = file_uncovered , zero_based = zero_based )
474
-
474
+
475
475
DelayedArray :: write_block(block = as.matrix(b $ bdg [, .(beta )]),
476
476
viewport = grid [[i ]], sink = M_sink )
477
477
DelayedArray :: write_block(block = as.matrix(b $ bdg [, .(cov )]),
@@ -498,7 +498,7 @@ non_vect_code <- function(files, col_idx, coldata, verbose = TRUE, genome = NULL
498
498
strand_collapse = strand_collapse , contigs = contigs ,
499
499
synced_coordinates = synced_coordinates , file_uncovered = file_uncovered ,
500
500
zero_based = zero_based )
501
-
501
+
502
502
beta_mat <- b $ bdg [, .(chr , start , beta )]
503
503
cov_mat <- b $ bdg [, .(chr , start , cov )]
504
504
genome_stat_final <- b $ genome_stat [, `:=`(Sample_Name ,
@@ -510,7 +510,7 @@ non_vect_code <- function(files, col_idx, coldata, verbose = TRUE, genome = NULL
510
510
strand_collapse = strand_collapse , contigs = contigs ,
511
511
synced_coordinates = synced_coordinates , file_uncovered = file_uncovered ,
512
512
zero_based = zero_based )
513
-
513
+
514
514
beta_mat <- cbind(beta_mat , b $ bdg [, .(beta )])
515
515
cov_mat <- cbind(cov_mat , b $ bdg [, .(cov )])
516
516
genome_stat_final <- rbind(genome_stat_final , b $ genome_stat [,
@@ -523,7 +523,7 @@ non_vect_code <- function(files, col_idx, coldata, verbose = TRUE, genome = NULL
523
523
colnames(beta_mat )[ncol(beta_mat )] <- colnames(cov_mat )[ncol(cov_mat )] <- rownames(coldata )[i ]
524
524
}
525
525
526
-
526
+
527
527
ncpg_final <- data.table :: dcast(data = ncpg_final , chr ~ Sample_Name ,
528
528
value.var = " N" )
529
529
return (list (beta_matrix = beta_mat [, - (seq_len(2 ))], cov_matrix = cov_mat [, - (seq_len(2 ))],
@@ -560,7 +560,7 @@ cast_ranges <- function(regions, set.key = TRUE) {
560
560
} else {
561
561
stop(" Invalid input class for regions. Must be a data.table or GRanges object" )
562
562
}
563
-
563
+
564
564
target_regions
565
565
}
566
566
@@ -569,7 +569,7 @@ cast_ranges <- function(regions, set.key = TRUE) {
569
569
giveme_this <- function (mat , stat = " mean" , na_rm = TRUE , ish5 = FALSE ) {
570
570
stat <- match.arg(arg = stat , choices = c(" mean" , " median" , " min" ,
571
571
" max" , " sum" ))
572
-
572
+
573
573
if (ish5 ) {
574
574
if (stat == " mean" ) {
575
575
res <- DelayedMatrixStats :: colMeans2(mat , na.rm = na_rm )
@@ -595,25 +595,25 @@ giveme_this <- function(mat, stat = "mean", na_rm = TRUE, ish5 = FALSE) {
595
595
res <- matrixStats :: colSums2(mat , na.rm = na_rm )
596
596
}
597
597
}
598
-
598
+
599
599
res
600
600
}
601
601
602
602
603
603
# --------------------------------------------------------------------------------------------------------------------------
604
604
# Tiny script to get axis and limits
605
605
get_y_lims <- function (vec ) {
606
-
606
+
607
607
y_lims <- range(vec )
608
608
y_at <- pretty(y_lims )
609
-
609
+
610
610
if (y_at [1 ] > min(vec , na.rm = TRUE )) {
611
611
y_at [1 ] <- min(vec , na.rm = TRUE )
612
612
}
613
613
if (y_at [length(y_at )] < max(vec , na.rm = TRUE )) {
614
614
y_at [length(y_at )] <- max(vec , na.rm = TRUE )
615
615
}
616
616
y_lims <- range(y_at , na.rm = TRUE )
617
-
617
+
618
618
list (y_lims = y_lims , y_at = y_at )
619
619
}
0 commit comments