-
Notifications
You must be signed in to change notification settings - Fork 12
/
snp_removal.R
179 lines (134 loc) · 7.82 KB
/
snp_removal.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#' Removes CpG sites from the object if they overlap with common SNPs
#' @details Takes \code{\link{methrix}} object and removes common SNPs. SNPs overlapping with a CpG site and have a minor allele frequency (MAF)
#' above a threshold in any of the populations used will be selected and the corresponding CpG sites will be removed from the \code{\link{methrix}} object.
#' With the reduce_filtering option, SNPs with MAP < 0.1 will be further evaluated. If they show low variance in the dataset, there is probably no genotype
#' variability in the population, therefore the corresponding CpG site won't be removed. Please keep in mind that variance thresholds are
#' @param m \code{\link{methrix}} object
#' @param populations Populations to use. Default is all.
#' @param maf_threshold The frequency threshold, above which the SNPs will be removed. Default is 0.01
#' @param reduce_filtering If TRUE, the SNPs with a MAF < 0.1 will be evaluated and only the highly variable ones will be removed.
#' Default FALSE.
#' @param forced the reduce_filtering is not recommended with less than 10 samples, but can be forced. Default is FALSE.
#' @param keep Do you want to keep the sites that were filtered out? In this case, the function will return with a list of wo methrix objects.
#' @param n_chunks Number of chunks to split the \code{\link{methrix}} object in case it is very large. Can only be used if input data is in HDF5 format. Default = 1.
#' @param n_cores Number of parallel instances. Can only be used if input data is in HDF5 format. \code{n_cores} should be less than or equal to \code{n_chunks}. If \code{n_chunks} is not specified, then \code{n_chunks} is initialized to be equal to \code{n_cores}. Default = 1.
#' @return methrix object or a list of methrix objects
#' @importFrom BSgenome score
#' @examples
#' data('methrix_data')
#' remove_snps(m = methrix_data, maf_threshold=0.01)
#' @export
remove_snps <- function(m, populations = NULL, maf_threshold = 0.01, reduce_filtering = FALSE, forced = FALSE, keep = FALSE, n_chunks=1, n_cores=1) {
start_proc_time <- proc.time()
genome <- S4Vectors::metadata(m)$genome
chr <- m2 <- NULL
if (!is_h5(m) & n_cores!=1){
stop("Parallel processing not supported for a non-HDF5 methrix object due to probable high memory usage. \nNumber of cores (n_cores) needs to be 1.")
}
if (!is_h5(m) & n_chunks!=1){
stop("Splitting into chunks not supported for a non-HDF5 methrix object. \nNumber of chunks (n_chunks) needs to be 1.")
}
if (n_cores>n_chunks){
n_chunks<-n_cores
message("n_cores should be set to be less than or equal to n_chunks.","\n","n_chunks has been set to be equal to n_cores = ",n_cores)
}
if (grepl("hg19|GRCh37|Hs37|hs37", genome)) {
if (requireNamespace("MafDb.1Kgenomes.phase3.hs37d5", quietly = TRUE) &
requireNamespace("GenomicScores", quietly = TRUE)) {
message("Used SNP database: MafDb.1Kgenomes.phase3.hs37d5. \n")
mafdb <- MafDb.1Kgenomes.phase3.hs37d5::MafDb.1Kgenomes.phase3.hs37d5
} else {
stop("Packages MafDb.1Kgenomes.phase3.hs37d5 and/or GenomicScores not found.
Please install them before proceed.")
}
} else if (grepl("hg38|GRCh38|Hs38|hs38", genome)) {
if (requireNamespace("MafDb.1Kgenomes.phase3.GRCh38", quietly = TRUE) &
requireNamespace("GenomicScores", quietly = TRUE)) {
message("Used SNP database: MafDb.1Kgenomes.phase3.hs38. \n")
mafdb <- MafDb.1Kgenomes.phase3.GRCh38::MafDb.1Kgenomes.phase3.GRCh38
} else {
stop("Packages MafDb.1Kgenomes.phase3.GRCh38 and/or GenomicScores not found.
Please install them before proceed.")
}
} else {
stop("Only hg19 and hg38 genomes are currently supported..\n")
}
if (is.null(populations)) {
populations <- GenomicScores::populations(mafdb)
} else if (!all(populations %in% GenomicScores::populations(mafdb))) {
stop("The population selected is not available. Please select from ",
paste(GenomicScores::populations(mafdb), collapse = ", "), ". \n")
}
regions <- GenomicRanges::makeGRangesFromDataFrame(elementMetadata(m), start.field = "start", end.field = "start")
GenomeInfoDb::seqlevelsStyle(regions) <- "NCBI"
if(n_cores==1) {
snp_rows <- which((rowSums(as.matrix(score(mafdb, regions, pop = populations))>= maf_threshold,na.rm=T)>0) |
(rowSums(as.matrix(score(mafdb, IRanges::shift(regions, 1), pop = populations))>= maf_threshold,na.rm=T)>0))
} else {
snp_rows <- which(unlist(mclapply(mc.cores=n_cores,1:n_chunks, function(i){
sub_dat = regions[((i-1)*ceiling(length(regions)/n_chunks)+1):min(i*ceiling(length(regions)/n_chunks),length(regions))]
(rowSums(as.matrix(score(mafdb, sub_dat, pop = populations))>= maf_threshold,na.rm=T)>0) |
(rowSums(as.matrix(score(mafdb, IRanges::shift(sub_dat, 1), pop = populations))>= maf_threshold,na.rm=T)>0)
})))
}
snp_rows <- snp_rows[order(snp_rows)]
if (reduce_filtering) {
message("Keep in mind that the filtering is in experimental state, the cut-off is arbitrary.")
if (ncol(m) < 10 & forced == FALSE) {
stop("The reduce filtering option is not recommended for sample number below 10.
Use the forced option if you still want to do it.")
} else if (ncol(m) < 10 & forced == TRUE) {
message("The reduce filtering option is not recommended for sample number below 10.")
}
if(n_chunks==1) {
snp_test<-which((rowSums(as.matrix(score(mafdb, regions, pop = populations))<=0.1,na.rm=T)>0) |
(rowSums(as.matrix(score(mafdb, IRanges::shift(regions, 1), pop = populations))<=0.1,na.rm=T)>0))
} else {
snp_test<-which(unlist(mclapply(mc.cores=n_cores,1:n_chunks, function(i){
sub_dat = regions[((i-1)*ceiling(length(regions)/n_chunks)+1):min(i*ceiling(length(regions)/n_chunks),length(regions))]
(rowSums(as.matrix(score(mafdb, sub_dat, pop = populations))<=0.1,na.rm=T)>0) |
(rowSums(as.matrix(score(mafdb, IRanges::shift(sub_dat, 1), pop = populations))<=0.1,na.rm=T)>0)
})))
}
snp_test <- snp_test[order(snp_test)]
snp_test <- intersect(snp_rows, snp_test)
snp_rows <- snp_rows[!(snp_rows %in% snp_test)]
m_mat <- get_matrix(m[snp_test, ])
if (is_h5(m)) {
if(n_chunks==1) {
vars <- DelayedMatrixStats::rowVars(m_mat, na.rm = TRUE)
} else {
vars<-unlist(mclapply(mc.cores=n_cores,1:n_chunks, function(i){
m_mat_dat = m_mat[((i-1)*ceiling(nrow(m_mat)/n_chunks)+1):min(i*ceiling(nrow(m_mat)/n_chunks),nrow(m_mat)),]
vars<-DelayedMatrixStats::rowVars(m_mat_dat, na.rm = TRUE)
}))
}
} else {
vars <- matrixStats::rowVars(m_mat, na.rm = TRUE)
}
snp_test <- snp_test[which(vars > quantile(vars[complete.cases(vars)],0.25))]
snp_rows <- c(snp_rows, snp_test)
}
removed_snps <- data.table::as.data.table(rowData(m))[snp_rows, ]
message("Number of SNPs removed:")
print(removed_snps[, .N, by = chr])
message("Sum: ")
print(removed_snps[, .N])
if (keep){
m2 <- m[snp_rows,]
}
m <- m[-snp_rows, ]
gc()
message("-Finished in: ", data.table::timetaken(start_proc_time))
if (keep){
return(list("snp_filtered" = m, "removed_snps" = m2))
} else {
return(m)
}
}
gr.nochr <- function(gr) {
if (grepl("^chr", GenomeInfoDb::seqlevels(gr)[1])) {
GenomeInfoDb::seqlevels(gr) <- gsub("^chr", "", GenomeInfoDb::seqlevels(gr))
}
return(gr)
}