Skip to content

Commit

Permalink
Fix tabix-indexing
Browse files Browse the repository at this point in the history
  • Loading branch information
bschilder committed Sep 16, 2022
1 parent 2f6eb37 commit c1a8e43
Show file tree
Hide file tree
Showing 15 changed files with 324 additions and 99 deletions.
5 changes: 2 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ Imports:
biocViews:
SNP, WholeGenome, Genetics, ComparativeGenomics,
GenomeWideAssociation, GenomicVariation, Preprocessing
RoxygenNote: 7.2.1
RoxygenNote: 7.2.1.9000
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
Suggests:
Expand All @@ -70,8 +70,7 @@ Suggests:
testthat (>= 3.0.0),
UpSetR,
BiocStyle,
covr,
seqminer,
covr,
Rsamtools,
MatrixGenerics,
badger,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ export(liftover)
export(list_sumstats)
export(load_snp_loc_data)
export(parse_logs)
export(read_header)
export(read_sumstats)
export(read_vcf)
export(standardise_header)
Expand Down
18 changes: 18 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,21 @@
## CHANGES IN VERSION 1.5.14

### New features

* `read_header`:
- Can now read entire files by setting `n=NULL`.
- Improved reading in of VCF files (can read *.vcf.bgz* now).
- Now exported.
- Added unit tests.
* Remove `seqminer` from all code (too buggy).
* Automatically remove residual .tsv files after tabix indexing.
* `import_sumstats`:
- Use `@inheritDotParams format_sumstats` for better documentation.

### Bug fixes

* `index_tabular`: Fixed by replacing `seqminer` with `Rsamtools`.

## CHANGES IN VERSION 1.5.13

### Bug fixes
Expand Down
2 changes: 1 addition & 1 deletion R/drop_duplicate_cols.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
drop_duplicate_cols <- function(dt){
dups <- which(duplicated(names(dt)))
if(length(dups)>0){
messager("Dropping",length(dups),"duplicate columns.")
messager("Dropping",length(dups),"duplicate column(s).")
dt[, which(duplicated(names(dt))):= NULL]
}
}
1 change: 1 addition & 0 deletions R/import_sumstats.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#' @param ... Additional arguments passed to
#' \link[MungeSumstats]{format_sumstats}.
#' @inheritParams format_sumstats
#' @inheritDotParams format_sumstats
#' @inheritParams downloader
#'
#' @return Either a named list of data objects or paths,
Expand Down
67 changes: 34 additions & 33 deletions R/index_tabular.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,69 +7,70 @@
#' echotabix}.
#'
#' @param path Path to GWAS summary statistics file.
#' @param remove_tmp Remove the temporary uncompressed version of the file
#' (\emph{.tsv}).
#' @param verbose Print messages.
#' @inheritParams liftover
#' @inheritParams Rsamtools::bgzip
#'
#' @return Path to tabix-indexed tabular file
#'
#' @family tabix
#' @export
#' @importFrom R.utils gunzip isGzipped
#' @examples
#' sumstats_dt <- MungeSumstats::formatted_example()
#' sumstats_dt <- MungeSumstats:::sort_coords(sumstats_dt = sumstats_dt)
#' sumstats_dt <- MungeSumstats::formatted_example()
#' path <- tempfile(fileext = ".tsv")
#' MungeSumstats::write_sumstats(sumstats_dt = sumstats_dt, save_path = path)
#'
#' indexed_file <- MungeSumstats::index_tabular(path = path)
#' @export
#' @importFrom R.utils gunzip isGzipped
index_tabular <- function(path,
chrom_col = "CHR",
start_col = "BP",
end_col = start_col,
overwrite = TRUE,
remove_tmp = TRUE,
verbose = TRUE) {
msg <- paste("Converting full summary stats file to",
"tabix format for fast querying...")
message(msg)
#### Read header and make dictionary ####
cdict <- column_dictionary(file_path = path)
#### Check column exist ####
if(!chrom_col %in% names(cdict)) stop("chrom_col not found in file.")
if(!start_col %in% names(cdict)) stop("start_col not found in file.")
if(!end_col %in% names(cdict)) stop("end_col not found in file.")
messager(msg,v=verbose)
#### Make sure input file isn't empty ####
if(!file.exists(path)){
stp <- paste("File does not exist: ",path)
stop(stp)
}
if (file.size(path) == 0) {
msg2 <- paste("Removing empty file:", path)
message(msg2)
messager(msg2,v=verbose)
out <- file.remove(path)
}
#### Ensure it's not already compressed ####
if(R.utils::isGzipped(path)){
path <- R.utils::gunzip(path, overwrite = TRUE)
}
#### Read header and make dictionary ####
cdict <- column_dictionary(file_path = path)
#### Check column exist ####
if(!chrom_col %in% names(cdict)) stop("chrom_col not found in file.")
if(!start_col %in% names(cdict)) stop("start_col not found in file.")
if(!end_col %in% names(cdict)) stop("end_col not found in file.")
### File MUST be bgzipped first
message("Ensuring file is bgzipped.")
messager("Ensuring file is bgzipped.",v=verbose)
bgz_file <- Rsamtools::bgzip(file = path,
dest = sprintf("%s.bgz",
sub("\\.gz$|\\.bgz$", "",
path)),
overwrite = TRUE)
overwrite = overwrite)
### Tabix-index file
message("Tabix-indexing file.")
# Rsamtools::indexTabix is not user-friendly,
# and is prone to errors without additional wrappers functions
#(ie seqminer::tabix.createIndex)
# tbx <- Rsamtools::indexTabix(file = bgz_file,
# seq = chrom_col,
# start = start_col,
# comment = "SNP",
# end = end_col)
seqminer::tabix.createIndex(
bgzipFile = bgz_file,
sequenceColumn = cdict[[chrom_col]],
startColumn = cdict[[start_col]],
endColumn = cdict[[end_col]],
## Just use the first columns name
metaChar = names(cdict)[1]
)
messager("Tabix-indexing file.",v=verbose)
tbi_file <- Rsamtools::indexTabix(file = bgz_file,
seq = cdict[[chrom_col]],
start = cdict[[start_col]],
end = cdict[[end_col]],
comment = names(cdict)[1])
if(isTRUE(remove_tmp) &&
file.exists(path)){
messager("Removing temproary .tsv file.",v=verbose)
out <- file.remove(path)
}
return(bgz_file)
}
}
58 changes: 32 additions & 26 deletions R/read_header.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,43 +5,49 @@
#' @inheritParams format_sumstats
#' @param n integer. The (maximal) number of lines to read. Negative values
#' indicate that one should read up to the end of input on the connection.
#' @param skip_vcf_metadata logical, should VCF metadata be ignored
# @inheritParams readLines
#' @param skip_vcf_metadata logical, should VCF metadata be ignored
#' @inheritParams format_sumstats
#'
#' @importFrom VariantAnnotation scanVcfHeader
#' @keywords internal
#' @export
#' @examples
#' path <- system.file("extdata", "eduAttainOkbay.txt",
#' package = "MungeSumstats")
#' header <- read_header(path = path)
read_header <- function(path,
n = 2,
skip_vcf_metadata = FALSE) {
message("Reading header.")
n = 2L,
skip_vcf_metadata = FALSE,
nThread = 1) {
if(is.null(n)) {
message("Reading entire file.")
n <- -2L
} else {
message("Reading header.")
}
vcf_suffixes <- supported_suffixes(tabular = FALSE,
tabular_compressed = FALSE)
if (startsWith(path, "https://gwas.mrcieu.ac.uk") |
any(endsWith(path, vcf_suffixes))) {
#### Read VCFs ####
if (skip_vcf_metadata) {
# gr <- GenomicRanges::GRanges(seqnames = 1, ranges = 1:10000)
# param <- VariantAnnotation::ScanVcfParam(save_path, which=gr)
# preview <- VariantAnnotation::readVcf(save_path, param = param)
# preview <- VariantAnnotation::scanVcf(save_path, param=param)
# header <- read_vcf(path = path, which = gr)
header <- readLines(path, n = 1000)
if(length(header)==0) stop("header has length zero.")
i <- which(startsWith(header, "#CHR"))
if(length(i)==0) stop("Cannot find row in VCF starting with #CHR.")
header <- data.table::fread(text = header[seq(i, i + n)],
nThread = 1)
if (isTRUE(skip_vcf_metadata)) {
if(endsWith(path,".bgz")){
header <- data.table::fread(text = readLines(path,
n = n+1000),
nrows = n,
skip = "#CHR",
nThread = nThread)
} else {
header <- data.table::fread(input = path,
nrows = n,
skip = "#CHR",
nThread = nThread)
}
} else {
#### Attempt to read in VCF header as well
header <- readLines(path, n = 1000)
}
} else if (endsWith(path,".bgz")){
#### Read tabix-indexed tabular ####
# data.table::fread currently can't handle bgzipped files by default
#
# header <- seqminer::tabix.read.header(tabixFile = path)$header
# header <- Rsamtools::headerTabix(file = path)$header
# header <- rep(header,n)
# header <- colnames(data.table::fread(text = header))
#### Read tabix-indexed tabular ####
header <- data.table::fread(text=readLines(con = path,
n = n+1L))
} else if(any(startsWith(path, c("https:","http:")))){
Expand All @@ -58,4 +64,4 @@ read_header <- function(path,
header <- data.table::fread(text = header)
}
return(header)
}
}
5 changes: 2 additions & 3 deletions R/validate_parameters.R
Original file line number Diff line number Diff line change
Expand Up @@ -355,10 +355,9 @@ validate_parameters <- function(path,
}

if(tabix_index &&
any(!requireNamespace("Rsamtools", quietly = TRUE),
!requireNamespace("seqminer", quietly = TRUE),
any(!requireNamespace("Rsamtools", quietly = TRUE),
!requireNamespace("MatrixGenerics", quietly = TRUE)) ){
pkgs <- c("Rsamtools","seqminer","MatrixGenerics")
pkgs <- c("Rsamtools","MatrixGenerics")
missing_pkgs <- pkgs[!pkgs %in% rownames(utils::installed.packages())]
tbx_msg <- paste0(
"The following packages must be installed when tabix_index=TRUE:\n",
Expand Down
12 changes: 7 additions & 5 deletions R/write_sumstats.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ write_sumstats <- function(sumstats_dt,
ref_genome,
sep = "\t",
write_vcf = FALSE,
save_format=NULL,
save_format = NULL,
tabix_index = FALSE,
nThread = 1,
return_path = FALSE,
Expand All @@ -51,14 +51,16 @@ write_sumstats <- function(sumstats_dt,
save_path <- check$save_path
}
#### Sort again just to be sure when tabix-indexing ####
if(isTRUE(tabix_index) | isTRUE(write_vcf)) {
if(isTRUE(tabix_index) |
isTRUE(write_vcf)) {
sumstats_dt <- sort_coords(sumstats_dt = sumstats_dt)
}
#### Select write format ####
if (isTRUE(write_vcf)) {
tmp_save_path <- gsub("\\.bgz|\\.gz","",save_path)
#convert to IEU OpenGWAS VCF format (column naming and RSID position)
if(!is.null(save_format) && tolower(save_format)=="opengwas"){
if(!is.null(save_format) &&
tolower(save_format)=="opengwas"){
#first check genome build - all of openGWAS is GRCh37 currently so
#warn user if their data isn't
gen_build_err <- paste0("Your sumstats has been built on the ",
Expand All @@ -75,7 +77,7 @@ write_sumstats <- function(sumstats_dt,
#SNP -> RSID in INFO col
if("SNP" %in% colnames(sumstats_dt)){
setnames(sumstats_dt,"SNP","RSID")
}else{
} else{
stop("SNP/RSID is required for IEU OpenGWAS format VCFs")
}
#remove any extra columns
Expand Down Expand Up @@ -177,5 +179,5 @@ write_sumstats <- function(sumstats_dt,
verbose = TRUE)
}
}
if(return_path) return(save_path)
if(isTRUE(return_path)) return(save_path)
}
Loading

0 comments on commit c1a8e43

Please sign in to comment.