From 14eb43c3b8c7e718d6038eee9b96f3f2c34df025 Mon Sep 17 00:00:00 2001 From: tkike Date: Tue, 22 Dec 2020 01:45:00 +0100 Subject: [PATCH 1/4] DelayedArray write_block x back to sink --- R/accessory_funcs.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/accessory_funcs.R b/R/accessory_funcs.R index d1deb5b..8b4e9d7 100644 --- a/R/accessory_funcs.R +++ b/R/accessory_funcs.R @@ -448,9 +448,9 @@ non_vect_code <- function(files, col_idx, coldata, verbose = TRUE, genome = NULL file_uncovered = file_uncovered, zero_based = zero_based) DelayedArray::write_block(block = as.matrix(b$bdg[, .(beta)]), - viewport = grid[[i]], x = M_sink) + viewport = grid[[i]], sink = M_sink) DelayedArray::write_block(block = as.matrix(b$bdg[, .(cov)]), - viewport = grid[[i]], x = cov_sink) + viewport = grid[[i]], sink = cov_sink) genome_stat_final <- b$genome_stat[, `:=`(Sample_Name, rownames(coldata)[i])] chr_stat_final <- b$chr_stat[, `:=`(Sample_Name, rownames(coldata)[i])] @@ -463,9 +463,9 @@ non_vect_code <- function(files, col_idx, coldata, verbose = TRUE, genome = NULL file_uncovered = file_uncovered, zero_based = zero_based) DelayedArray::write_block(block = as.matrix(b$bdg[, .(beta)]), - viewport = grid[[i]], x = M_sink) + viewport = grid[[i]], sink = M_sink) DelayedArray::write_block(block = as.matrix(b$bdg[, .(cov)]), - viewport = grid[[i]], x = cov_sink) + viewport = grid[[i]], sink = cov_sink) genome_stat_final <- rbind(genome_stat_final, b$genome_stat[, `:=`(Sample_Name, rownames(coldata)[i])]) chr_stat_final <- rbind(chr_stat_final, b$chr_stat[, `:=`(Sample_Name, From 6cd7fcf8bbebe6f0f60a63ea4e91fd81021e79c8 Mon Sep 17 00:00:00 2001 From: tkike Date: Tue, 22 Dec 2020 01:45:44 +0100 Subject: [PATCH 2/4] better handle of chromosome names Add MT support to remove_snps() --- R/extract_CpGs.R | 6 +++--- R/read_bedgraphs.R | 16 ++++++++++------ R/snp_removal.R | 9 +++++---- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/R/extract_CpGs.R b/R/extract_CpGs.R index e4ac97b..8b0ac9c 100644 --- a/R/extract_CpGs.R +++ b/R/extract_CpGs.R @@ -49,9 +49,9 @@ extract_CPGs = function(ref_genome = NULL) { ref_build = NA } - chrom_sizes = data.table::data.table(contig = names(seqlengths(x = ref_genome)), - length = seqlengths(x = ref_genome)) - chrs = names(ref_genome) + chrom_sizes = data.table::data.table(contig = standardChromosomes(ref_genome), + length = seqlengths(x = ref_genome)[names(seqlengths(x = ref_genome)) %in% standardChromosomes(ref_genome)]) + chrs = standardChromosomes(ref_genome) message("-Extracting CpGs") diff --git a/R/read_bedgraphs.R b/R/read_bedgraphs.R index 175384c..976d498 100644 --- a/R/read_bedgraphs.R +++ b/R/read_bedgraphs.R @@ -101,24 +101,28 @@ read_bedgraphs <- function(files = NULL, pipeline = NULL, zero_based = TRUE, } } - if (is.null(contigs)) { - # Work with only main contigs (either with chr prefix - UCSC style, or - # ensemble style) - contigs <- c(paste0("chr", c(seq_len(22), "X", "Y", "M")), seq_len(22), - "X", "Y", "MT") - } + # Extract CpG's conig_lens <- NA if (is(ref_cpgs, "list") & all(names(ref_cpgs) == c("cpgs", "contig_lens", "release_name"))) { + if (is.null(contigs)) { + contigs <- ref_cpgs$contig_lens$contig + } conig_lens <- ref_cpgs$contig_lens[contig %in% contigs] genome <- data.table::copy(x = ref_cpgs$cpgs) } else if (any(is(ref_cpgs[1], "character"), is(ref_cpgs[1], "BSgenome"))) { genome <- extract_CPGs(ref_genome = ref_cpgs) + if (is.null(contigs)) { + contigs <- genome$contig_lens$contig + } conig_lens <- genome$contig_lens genome <- genome$cpgs } else if (is(ref_cpgs[1], "data.frame")) { + if (is.null(contigs)) { + contigs <- unique(genome$chr) + } genome <- data.table::copy(x = ref_cpgs) data.table::setkey(x = genome, chr, start) } else { diff --git a/R/snp_removal.R b/R/snp_removal.R index f5750af..d1dba01 100644 --- a/R/snp_removal.R +++ b/R/snp_removal.R @@ -62,6 +62,7 @@ remove_snps <- function(m, populations = NULL, maf_threshold = 0.01, reduce_filt } else { stop("Only hg19 and hg38 genomes are currently supported..\n") } + if (is.null(populations)) { populations <- GenomicScores::populations(mafdb) } else if (!all(populations %in% GenomicScores::populations(mafdb))) { @@ -70,14 +71,14 @@ remove_snps <- function(m, populations = NULL, maf_threshold = 0.01, reduce_filt } - regions <- gr.nochr(GenomicRanges::makeGRangesFromDataFrame(elementMetadata(m), start.field = "start", end.field = "start")) - + regions <- GenomicRanges::makeGRangesFromDataFrame(elementMetadata(m), start.field = "start", end.field = "start") + seqlevelsStyle(regions) <- "NCBI" if(n_cores==1) { - snp_rows<-which((rowSums(as.matrix(score(mafdb, regions, pop = populations))>= maf_threshold,na.rm=T)>0) | + snp_rows <- which((rowSums(as.matrix(score(mafdb, regions, pop = populations))>= maf_threshold,na.rm=T)>0) | (rowSums(as.matrix(score(mafdb, IRanges::shift(regions, 1), pop = populations))>= maf_threshold,na.rm=T)>0)) } else { - snp_rows<-which(unlist(mclapply(mc.cores=n_cores,1:n_chunks, function(i){ + snp_rows <- which(unlist(mclapply(mc.cores=n_cores,1:n_chunks, function(i){ sub_dat = regions[((i-1)*ceiling(length(regions)/n_chunks)+1):min(i*ceiling(length(regions)/n_chunks),length(regions))] (rowSums(as.matrix(score(mafdb, sub_dat, pop = populations))>= maf_threshold,na.rm=T)>0) | (rowSums(as.matrix(score(mafdb, IRanges::shift(sub_dat, 1), pop = populations))>= maf_threshold,na.rm=T)>0) From 4f4b4c36af189c4f371208690f7352bd303654c0 Mon Sep 17 00:00:00 2001 From: PoisonAlien Date: Mon, 8 Feb 2021 16:20:20 +0100 Subject: [PATCH 3/4] add citation --- DESCRIPTION | 2 +- README.md | 4 ++++ inst/CITATION | 22 ++++++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 inst/CITATION diff --git a/DESCRIPTION b/DESCRIPTION index e2747ca..88912c2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: methrix Title: Fast and efficient summarization of generic bedGraph files from Bisufite sequencing -Version: 1.3.25 +Version: 1.4.05 Authors@R: c( person("Anand","Mayakonda", role = c("aut", "cre"), email = "anand_mt@hotmail.com", comment = c(ORCID = "0000-0003-1162-687X")), person("Reka","Toth", role = "aut", email = "r.toth@dkfz.de", comment = c(ORCID = "0000-0002-6096-1052")), diff --git a/README.md b/README.md index 8fbd9be..be7ca30 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,10 @@ Bedgraph files generated by BS pipelines often come in various flavors. Critical * A exemplary complete data analysis with steps from reading in to annotation and differential methylation calling can be find in our [WGBS best practices workflow](https://compepigen.github.io/methrix_docs/articles/methrix.html). +### Citation + +**_Mayakonda A, Schönung M, Hey J, Batra RN, Feuerstein-Akgoz C, Köhler K, Lipka DB, Sotillo R, Plass C, Lutsik P, Toth R. Methrix: an R/bioconductor package for systematic aggregation and analysis of bisulfite sequencing data. Bioinformatics. 2020 Dec 21:btaa1048. doi: 10.1093/bioinformatics/btaa1048. Epub ahead of print. PMID: [33346800](https://pubmed.ncbi.nlm.nih.gov/33346800/)._** + ### Installation ```r diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 0000000..0836536 --- /dev/null +++ b/inst/CITATION @@ -0,0 +1,22 @@ +citHeader("methrix can be cited as:") + +citEntry( + entry="article", + title = "Methrix: an R/Bioconductor package for systematic aggregation and analysis of bisulfite sequencing data", + author = personList(as.person("Anand Mayakonda"), + as.person("Maximilian Schonung"), + as.person("Joschka Hey"), + as.person("Rajbir Nath Batra1"), + as.person("Clarissa Feuerstein-Akgoz"), + as.person("Kristin Kohler"), + as.person("Daniel B. Lipka"), + as.person("Rocio Sotillo"), + as.person("Christoph Plass"), + as.person("Pavlo Lutsik"), + as.person("Reka Toth"), + ), + journal = "Bioinformatics", + year = 2020, + doi = "https://doi.org/10.1093/bioinformatics/btaa1048", + textVersion = "Anand Mayakonda, Maximilian Schonung, Joschka Hey1, Rajbir Nath Batra, Clarissa Feuerstein-Akgoz1, Kristin Kohler, Daniel B. Lipka, Rocio Sotillo, Christoph Plass, Pavlo Lutsik and Reka Toth. 2020. Methrix: an R/Bioconductor package for systematic aggregation and analysis of bisulfite sequencing data. Bioinformatics. https://doi.org/10.1093/bioinformatics/btaa1048" +) \ No newline at end of file From 3f01422c1aca3b7268fae02c069895014c58999b Mon Sep 17 00:00:00 2001 From: PoisonAlien Date: Mon, 8 Feb 2021 17:10:44 +0100 Subject: [PATCH 4/4] update screeplot/report, version bump --- NEWS | 8 +++++++- R/methrix_plot.R | 20 ++++++++------------ inst/CITATION | 3 +-- inst/report/summarize_methrix.Rmd | 4 ++-- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/NEWS b/NEWS index 0af00cb..1004773 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,10 @@ -CHANGES IN VERSION 1.4.00 (For Bioconductor 3.12 release) +CHANGES IN VERSION 1.4.05 +------------------------- + o rotate x axis labels by 45 degrees in report + o update `methrix_pca()` screeplot + o added CITATION + +CHANGES IN VERSION 1.4.00 (Bioconductor version) ------------------------- o read_bedgraphs() supports bedgraphs files from "Bismark_cov", "MethylDackel", "MethylcTools", "BisSNP", "BSseeker2_CGmap" o write_bedgraphs() supports output to multiBed and metilene file formats diff --git a/R/methrix_plot.R b/R/methrix_plot.R index a38c2a7..523915e 100644 --- a/R/methrix_plot.R +++ b/R/methrix_plot.R @@ -219,18 +219,14 @@ methrix_pca <- function(m, var = "top", top_var = 1000, ranges = NULL, # Draw cumulative variance explained by PCs if (do_plot) { - par(bty = "n", mgp = c(2.5, 0.5, 0), mar = c(3, 4, 2, 2) + 0.1, - tcl = -0.25, las = 1) - plot(pc_vars, type = "h", col = "red", xlab = "", ylab = "variance Explained", - ylim = c(0, 1), yaxs = "i") - mtext(side = 1, "Principal component", line = 2) - cum_var <- cumsum(meth_pca$sdev^2)/sum(meth_pca$sdev^2) * meth_pca$sdev[1]^2/sum(meth_pca$sdev^2) - lines(cumsum(cum_var), type = "s") - axis(side = 4, at = pretty(c(0, 1)), labels = pretty(c(0, 1))) - legend("topright", col = c("red", "black"), lty = 1, c("Per PC", "Cumulative"), bty = "n") - #lines(x = c(length(meth_pca$sdev), n_pc, n_pc), y = c(cum_var[n_pc], cum_var[n_pc], 0), lty = 3) - title(main = paste0("Variance explained by ", n_pc, " PC: ", round(sum(c(meth_pca$sdev^2/sum(meth_pca$sdev^2))[seq_len(n_pc)]), - digits = 2)), adj = 0) + par(mar = c(3, 4, 1, 4)) + b = barplot(height = pc_vars, names.arg = NA, col = "#2c3e50", ylim = c(0, 1), las = 2, axes = FALSE, ylab = "Variance Explained") + points(x = b, y = cumsum(pc_vars), type = 'l', lty = 2, lwd = 1.2, xpd = TRUE, col = "#c0392b") + points(x = b, y = cumsum(pc_vars), type = 'p', pch = 19, xpd = TRUE, col = "#c0392b") + mtext(text = paste0("PC", 1:length(pc_vars)), side = 1, at = b, las = 2, line = 0.5, cex = 0.8) + axis(side = 2, at = seq(0, 1, 0.1), line = 0, las = 2, cex.axis = 0.8) + axis(side = 4, at = seq(0, 1, 0.1), line = 0, las = 2, cex.axis = 0.8) + legend(x = "topleft", legend = "Cumulative", col = "#c0392b", pch = 19, lwd = 1, cex = 0.75, bty = "n") } #----------------------------------------------------------------------------------------------------------------------- diff --git a/inst/CITATION b/inst/CITATION index 0836536..d771bf8 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -13,8 +13,7 @@ citEntry( as.person("Rocio Sotillo"), as.person("Christoph Plass"), as.person("Pavlo Lutsik"), - as.person("Reka Toth"), - ), + as.person("Reka Toth")), journal = "Bioinformatics", year = 2020, doi = "https://doi.org/10.1093/bioinformatics/btaa1048", diff --git a/inst/report/summarize_methrix.Rmd b/inst/report/summarize_methrix.Rmd index 7b9366d..9e2e471 100644 --- a/inst/report/summarize_methrix.Rmd +++ b/inst/report/summarize_methrix.Rmd @@ -44,13 +44,13 @@ contig_lens = data.table::fread(input = params$chr_lens) chr_order = contig_lens$contig[contig_lens$contig %in% unique(n_covered[,chr])] n_covered[, chr := factor(as.character(chr), levels = chr_order)] -p = ggplot(data = n_covered, aes(x = chr, y = n_covered, label = Sample_Name))+geom_boxplot()+geom_jitter(size = 0.6)+theme_minimal()+theme(axis.title.x = element_blank(), axis.title.y = element_blank()) +p = ggplot(data = n_covered, aes(x = chr, y = n_covered, label = Sample_Name))+geom_boxplot()+geom_jitter(size = 0.6)+theme_minimal()+theme(axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1)) plotly::ggplotly(p = p) ``` ### Fraction ```{r} -p = ggplot(data = n_covered, aes(x = chr, y = fraction, label = Sample_Name))+geom_boxplot()+geom_jitter(size = 0.6)+theme_minimal()+theme(axis.title.x = element_blank(), axis.title.y = element_blank()) +p = ggplot(data = n_covered, aes(x = chr, y = fraction, label = Sample_Name))+geom_boxplot()+geom_jitter(size = 0.6)+theme_minimal()+theme(axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1)) plotly::ggplotly(p = p) ``` Number of reference CpGs covered by each sample in each chromosome