Fix tabix-indexing

Al-Murphy · Sep 16, 2022 · c1a8e43 · c1a8e43
1 parent 2f6eb37
commit c1a8e43
Show file tree

Hide file tree

Showing 15 changed files with 324 additions and 99 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -52,7 +52,7 @@ Imports:
 biocViews: 
     SNP, WholeGenome, Genetics, ComparativeGenomics, 
     GenomeWideAssociation, GenomicVariation, Preprocessing 
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.1.9000
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 Suggests: 
@@ -70,8 +70,7 @@ Suggests:
     testthat (>= 3.0.0),
     UpSetR,
     BiocStyle,
-    covr,
-    seqminer,
+    covr, 
     Rsamtools,
     MatrixGenerics,
     badger,

diff --git a/NAMESPACE b/NAMESPACE
@@ -12,6 +12,7 @@ export(liftover)
 export(list_sumstats)
 export(load_snp_loc_data)
 export(parse_logs)
+export(read_header)
 export(read_sumstats)
 export(read_vcf)
 export(standardise_header)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,21 @@
+## CHANGES IN VERSION 1.5.14
+
+### New features
+
+* `read_header`: 
+    - Can now read entire files by setting `n=NULL`.
+    - Improved reading in of VCF files (can read *.vcf.bgz* now). 
+    - Now exported.
+    - Added unit tests. 
+* Remove `seqminer` from all code (too buggy). 
+* Automatically remove residual .tsv files after tabix indexing. 
+* `import_sumstats`:
+    - Use `@inheritDotParams format_sumstats` for better documentation. 
+
+### Bug fixes
+
+* `index_tabular`: Fixed by replacing `seqminer` with `Rsamtools`. 
+
 ## CHANGES IN VERSION 1.5.13
 
 ### Bug fixes

diff --git a/R/drop_duplicate_cols.R b/R/drop_duplicate_cols.R
@@ -7,7 +7,7 @@
 drop_duplicate_cols <- function(dt){
     dups <- which(duplicated(names(dt)))
     if(length(dups)>0){
-        messager("Dropping",length(dups),"duplicate columns.")
+        messager("Dropping",length(dups),"duplicate column(s).")
         dt[,  which(duplicated(names(dt))):= NULL] 
     } 
 }
diff --git a/R/import_sumstats.R b/R/import_sumstats.R
@@ -19,6 +19,7 @@
 #' @param ... Additional arguments passed to 
 #' \link[MungeSumstats]{format_sumstats}.
 #' @inheritParams format_sumstats
+#' @inheritDotParams format_sumstats
 #' @inheritParams downloader
 #'
 #' @return Either a named list of data objects or paths,

diff --git a/R/index_tabular.R b/R/index_tabular.R
@@ -7,69 +7,70 @@
 #' echotabix}.
 #' 
 #' @param path Path to GWAS summary statistics file.
+#' @param remove_tmp Remove the temporary uncompressed version of the file 
+#' (\emph{.tsv}). 
 #' @param verbose Print messages.
 #' @inheritParams liftover
+#' @inheritParams Rsamtools::bgzip
 #' 
 #' @return Path to tabix-indexed tabular file
 #'
 #' @family tabix
+#' @export
+#' @importFrom R.utils gunzip isGzipped
 #' @examples
-#' sumstats_dt <- MungeSumstats::formatted_example()
-#' sumstats_dt <- MungeSumstats:::sort_coords(sumstats_dt = sumstats_dt)
+#' sumstats_dt <- MungeSumstats::formatted_example() 
 #' path <- tempfile(fileext = ".tsv")
 #' MungeSumstats::write_sumstats(sumstats_dt = sumstats_dt, save_path = path)
-#' 
 #' indexed_file <- MungeSumstats::index_tabular(path = path)
-#' @export
-#' @importFrom R.utils gunzip isGzipped
 index_tabular <- function(path,
                           chrom_col = "CHR",
                           start_col = "BP",
                           end_col = start_col,
+                          overwrite = TRUE,
+                          remove_tmp = TRUE,
                           verbose = TRUE) {
     msg <- paste("Converting full summary stats file to",
                  "tabix format for fast querying...")
-    message(msg)
-    #### Read header and make dictionary ####
-    cdict <- column_dictionary(file_path = path)
-    #### Check column exist ####
-    if(!chrom_col %in% names(cdict)) stop("chrom_col not found in file.")
-    if(!start_col %in% names(cdict)) stop("start_col not found in file.")
-    if(!end_col %in% names(cdict)) stop("end_col not found in file.")
+    messager(msg,v=verbose) 
     #### Make sure input file isn't empty ####
+    if(!file.exists(path)){
+        stp <- paste("File does not exist: ",path)
+        stop(stp)
+    }
     if (file.size(path) == 0) {
         msg2 <- paste("Removing empty file:", path)
-        message(msg2)
+        messager(msg2,v=verbose)
         out <- file.remove(path)
     }
     #### Ensure it's not already compressed ####
     if(R.utils::isGzipped(path)){
         path <- R.utils::gunzip(path, overwrite = TRUE)
     }
+    #### Read header and make dictionary ####
+    cdict <- column_dictionary(file_path = path)
+    #### Check column exist ####
+    if(!chrom_col %in% names(cdict)) stop("chrom_col not found in file.")
+    if(!start_col %in% names(cdict)) stop("start_col not found in file.")
+    if(!end_col %in% names(cdict)) stop("end_col not found in file.")
     ### File MUST be bgzipped first
-    message("Ensuring file is bgzipped.")
+    messager("Ensuring file is bgzipped.",v=verbose)
     bgz_file <- Rsamtools::bgzip(file = path,
                                  dest = sprintf("%s.bgz",
                                                 sub("\\.gz$|\\.bgz$", "",
                                                     path)),
-                                 overwrite = TRUE)
+                                 overwrite = overwrite)
     ### Tabix-index file
-    message("Tabix-indexing file.") 
-    # Rsamtools::indexTabix is not user-friendly, 
-    # and is prone to errors without additional wrappers functions
-    #(ie seqminer::tabix.createIndex)
-    # tbx <- Rsamtools::indexTabix(file = bgz_file,
-    #                               seq = chrom_col,
-    #                               start = start_col,
-    #                               comment = "SNP", 
-    #                               end = end_col)
-    seqminer::tabix.createIndex(
-        bgzipFile = bgz_file,
-        sequenceColumn = cdict[[chrom_col]],
-        startColumn = cdict[[start_col]],
-        endColumn = cdict[[end_col]],
-        ## Just use the first columns name
-        metaChar = names(cdict)[1]
-    )
+    messager("Tabix-indexing file.",v=verbose)  
+    tbi_file <- Rsamtools::indexTabix(file = bgz_file, 
+                                      seq = cdict[[chrom_col]],
+                                      start = cdict[[start_col]],
+                                      end = cdict[[end_col]],
+                                      comment = names(cdict)[1]) 
+    if(isTRUE(remove_tmp) && 
+       file.exists(path)){
+        messager("Removing temproary .tsv file.",v=verbose)  
+        out <- file.remove(path)
+    }
     return(bgz_file)
-}
+}
diff --git a/R/read_header.R b/R/read_header.R
@@ -5,43 +5,49 @@
 #' @inheritParams format_sumstats
 #' @param n integer. The (maximal) number of lines to read. Negative values
 #' indicate that one should read up to the end of input on the connection.
-#' @param skip_vcf_metadata logical, should VCF metadata be ignored
-# @inheritParams readLines
+#' @param skip_vcf_metadata logical, should VCF metadata be ignored 
+#' @inheritParams format_sumstats
+#' 
 #' @importFrom VariantAnnotation scanVcfHeader
-#' @keywords internal
+#' @export
+#' @examples
+#' path <- system.file("extdata", "eduAttainOkbay.txt", 
+#'                     package = "MungeSumstats") 
+#' header <- read_header(path = path)                    
 read_header <- function(path,
-                        n = 2,
-                        skip_vcf_metadata = FALSE) {
-    message("Reading header.")
+                        n = 2L,
+                        skip_vcf_metadata = FALSE,
+                        nThread = 1) {
+    if(is.null(n)) {
+        message("Reading entire file.")
+        n <- -2L
+    } else {
+        message("Reading header.")
+    } 
     vcf_suffixes <- supported_suffixes(tabular = FALSE,
                                        tabular_compressed = FALSE)
     if (startsWith(path, "https://gwas.mrcieu.ac.uk") | 
         any(endsWith(path, vcf_suffixes))) {
         #### Read VCFs ####
-        if (skip_vcf_metadata) {
-            # gr <- GenomicRanges::GRanges(seqnames = 1, ranges = 1:10000)
-            # param <- VariantAnnotation::ScanVcfParam(save_path, which=gr)
-            # preview <- VariantAnnotation::readVcf(save_path, param = param)
-            # preview <- VariantAnnotation::scanVcf(save_path, param=param) 
-            # header <- read_vcf(path = path, which = gr)
-            header <- readLines(path, n = 1000)
-            if(length(header)==0) stop("header has length zero.")
-            i <- which(startsWith(header, "#CHR"))
-            if(length(i)==0) stop("Cannot find row in VCF starting with #CHR.")
-            header <- data.table::fread(text = header[seq(i, i + n)],
-                                        nThread = 1)
+        if (isTRUE(skip_vcf_metadata)) {  
+            if(endsWith(path,".bgz")){
+                header <- data.table::fread(text = readLines(path,
+                                                              n = n+1000),
+                                            nrows = n,
+                                            skip = "#CHR",
+                                            nThread = nThread)
+            } else {
+                header <- data.table::fread(input = path,
+                                            nrows = n,
+                                            skip = "#CHR",
+                                            nThread = nThread)
+            } 
         } else {
             #### Attempt to read in VCF header as well
             header <- readLines(path, n = 1000)
         }
     } else if (endsWith(path,".bgz")){
-        #### Read tabix-indexed tabular #### 
-        # data.table::fread currently can't handle bgzipped files by default
-        # 
-        # header <- seqminer::tabix.read.header(tabixFile = path)$header 
-        # header <- Rsamtools::headerTabix(file = path)$header
-        # header <- rep(header,n)
-        # header <- colnames(data.table::fread(text = header))  
+        #### Read tabix-indexed tabular ####  
         header <- data.table::fread(text=readLines(con = path,
                                                    n = n+1L)) 
     } else if(any(startsWith(path, c("https:","http:")))){
@@ -58,4 +64,4 @@ read_header <- function(path,
         header <- data.table::fread(text = header)
     }
     return(header)
-}
+}
diff --git a/R/validate_parameters.R b/R/validate_parameters.R
@@ -355,10 +355,9 @@ validate_parameters <- function(path,
     }
 
     if(tabix_index && 
-       any(!requireNamespace("Rsamtools", quietly = TRUE),
-           !requireNamespace("seqminer", quietly = TRUE),
+       any(!requireNamespace("Rsamtools", quietly = TRUE), 
            !requireNamespace("MatrixGenerics", quietly = TRUE)) ){
-        pkgs <- c("Rsamtools","seqminer","MatrixGenerics")
+        pkgs <- c("Rsamtools","MatrixGenerics")
         missing_pkgs <- pkgs[!pkgs %in% rownames(utils::installed.packages())]
         tbx_msg <- paste0(
             "The following packages must be installed when tabix_index=TRUE:\n",

diff --git a/R/write_sumstats.R b/R/write_sumstats.R
@@ -35,7 +35,7 @@ write_sumstats <- function(sumstats_dt,
                            ref_genome,
                            sep = "\t",
                            write_vcf = FALSE,
-                           save_format=NULL,
+                           save_format = NULL,
                            tabix_index = FALSE,
                            nThread = 1,
                            return_path = FALSE,
@@ -51,14 +51,16 @@ write_sumstats <- function(sumstats_dt,
         save_path <- check$save_path
     } 
     #### Sort again just to be sure when tabix-indexing ####
-    if(isTRUE(tabix_index) | isTRUE(write_vcf)) {
+    if(isTRUE(tabix_index) |
+       isTRUE(write_vcf)) {
       sumstats_dt <- sort_coords(sumstats_dt = sumstats_dt)
     }
     #### Select write format ####
     if (isTRUE(write_vcf)) { 
         tmp_save_path <- gsub("\\.bgz|\\.gz","",save_path)
         #convert to IEU OpenGWAS VCF format (column naming and RSID position)
-        if(!is.null(save_format) && tolower(save_format)=="opengwas"){
+        if(!is.null(save_format) && 
+           tolower(save_format)=="opengwas"){
           #first check genome build - all of openGWAS is GRCh37 currently so 
           #warn user if their data isn't
           gen_build_err <- paste0("Your sumstats has been built on the ",
@@ -75,7 +77,7 @@ write_sumstats <- function(sumstats_dt,
           #SNP -> RSID in INFO col
           if("SNP" %in% colnames(sumstats_dt)){
             setnames(sumstats_dt,"SNP","RSID")
-          }else{
+          } else{
             stop("SNP/RSID is required for IEU OpenGWAS format VCFs")
           }
           #remove any extra columns
@@ -177,5 +179,5 @@ write_sumstats <- function(sumstats_dt,
                                        verbose = TRUE)
         } 
     }
-    if(return_path) return(save_path)
+    if(isTRUE(return_path)) return(save_path)
 }