Skip to content

Commit

Permalink
variant_count in seqVCF2GDS()
Browse files Browse the repository at this point in the history
  • Loading branch information
zhengxwen committed Jan 10, 2025
1 parent fa70842 commit 4cb50f9
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 29 deletions.
4 changes: 4 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ UTILITIES
`seqVCF2GDS()` is faster when obtaining the number of variants for
splitting files.

o new 'variant_count' in `seqVCF2GDS()` to specify the number of variants
in the VCF file when it is known; it is only applicable when multiple
cores are used.


CHANGES IN VERSION 1.46.0
-------------------------
Expand Down
50 changes: 23 additions & 27 deletions R/ConvVCF2GDS.R
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,8 @@ seqVCF2GDS <- function(vcf.fn, out.fn, header=NULL,
storage.option="LZMA_RA", info.import=NULL, fmt.import=NULL,
genotype.var.name="GT", ignore.chr.prefix="chr",
scenario=c("general", "imputation"), reference=NULL, start=1L, count=-1L,
optimize=TRUE, raise.error=TRUE, digest=TRUE, parallel=FALSE, verbose=TRUE)
variant_count=NA_integer_, optimize=TRUE, raise.error=TRUE, digest=TRUE,
parallel=FALSE, verbose=TRUE)
{
# check
if (!inherits(vcf.fn, "connection"))
Expand Down Expand Up @@ -592,20 +593,15 @@ seqVCF2GDS <- function(vcf.fn, out.fn, header=NULL,
{
if (pnum > 1L)
stop("No parallel support when the input is a connection object.")
}

if (is.character(vcf.fn))
variant_count <- attr(vcf.fn, "variant_count")
else
variant_count <- NULL
if (!is.null(variant_count))
if (length(variant_count)!=1L || !is.na(variant_count))
warning("'variant_count' is not used in seqVCF2GDS() when 'vcf.fn' is a connection object.")
} else if (!identical(variant_count, NA_integer_))
{
if (!is.numeric(variant_count))
stop("the attribute 'variant_count' of 'vcf.fn' should be a numeric vector.")
stop("'variant_count' should be a numeric vector.")
if (length(variant_count) != length(vcf.fn))
stop("the attribute 'variant_count' of 'vcf.fn' should be as the same length as 'vcf.fn'.")
stop("'variant_count' and 'vcf.fn' should have the same length.")
}

if (verbose) cat(date(), "\n", sep="")

genotype.storage <- "bit2"
Expand Down Expand Up @@ -785,12 +781,18 @@ seqVCF2GDS <- function(vcf.fn, out.fn, header=NULL,
}

# get the number of variants in each VCF file
num_array <- vapply(vcf.fn, function(fn)
for (i in seq_along(vcf.fn))
{
v <- seqVCF_Header(fn, getnum=TRUE, parallel=parallel, verbose=FALSE)
v$num.variant
}, 0L)
num_var <- sum(num_array)
v <- variant_count[i]
if (is.na(v) || (v < 0L))
{
fn <- vcf.fn[i]
variant_count[i] <- seqVCF_Header(fn, getnum=TRUE,
parallel=parallel, verbose=FALSE)$num.variant
}
}
num_var <- sum(variant_count)
if (anyNA(num_var)) stop("Getting invalid # of variants.")

if (start < 1L)
stop("'start' should be a positive integer if conversion in parallel.")
Expand Down Expand Up @@ -822,31 +824,25 @@ seqVCF2GDS <- function(vcf.fn, out.fn, header=NULL,
seqParallel(parallel, NULL, FUN = function(
vcf.fn, header, storage.option, info.import, fmt.import,
genotype.var.name, ignore.chr.prefix, scenario, optim,
raise.err, ptmpfn, psplit, num_array)
raise.err, ptmpfn, psplit, variant_count)
{
# load package
library(SeqArray, quietly=TRUE, verbose=FALSE)

attr(vcf.fn, "variant_count") <- num_array
i <- process_index # the process id, starting from one

seqVCF2GDS(vcf.fn, ptmpfn[i], header=oldheader,
SeqArray::seqVCF2GDS(vcf.fn, ptmpfn[i], header=oldheader,
storage.option=storage.option, info.import=info.import,
fmt.import=fmt.import, genotype.var.name=genotype.var.name,
ignore.chr.prefix=ignore.chr.prefix,
start = psplit[[1L]][i], count = psplit[[2L]][i],
variant_count=variant_count,
optimize=optim, scenario=scenario, raise.error=raise.err,
digest=FALSE, parallel=FALSE, verbose=FALSE)

invisible()

invisible() # return
}, split="none",
vcf.fn=vcf.fn, header=header, storage.option=storage.option,
info.import=info.import, fmt.import=fmt.import,
genotype.var.name=genotype.var.name,
ignore.chr.prefix=ignore.chr.prefix, scenario=scenario,
optim=optimize, raise.err=raise.error,
ptmpfn=ptmpfn, psplit=psplit, num_array=num_array)
ptmpfn=ptmpfn, psplit=psplit, variant_count=variant_count)

if (verbose)
cat(" >>> Done (", date(), ") <<<\n", sep="")
Expand Down
10 changes: 8 additions & 2 deletions man/seqVCF2GDS.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
seqVCF2GDS(vcf.fn, out.fn, header=NULL, storage.option="LZMA_RA",
info.import=NULL, fmt.import=NULL, genotype.var.name="GT",
ignore.chr.prefix="chr", scenario=c("general", "imputation"),
reference=NULL, start=1L, count=-1L, optimize=TRUE, raise.error=TRUE,
digest=TRUE, parallel=FALSE, verbose=TRUE)
reference=NULL, start=1L, count=-1L, variant_count=NA_integer_,
optimize=TRUE, raise.error=TRUE, digest=TRUE, parallel=FALSE,
verbose=TRUE)
seqBCF2GDS(bcf.fn, out.fn, header=NULL, storage.option="LZMA_RA",
info.import=NULL, fmt.import=NULL, genotype.var.name="GT",
ignore.chr.prefix="chr", scenario=c("general", "imputation"),
Expand Down Expand Up @@ -50,6 +51,11 @@ seqBCF2GDS(bcf.fn, out.fn, header=NULL, storage.option="LZMA_RA",
\item{start}{the starting variant if importing part of VCF files}
\item{count}{the maximum count of variant if importing part of VCF files,
-1 indicates importing to the end}
\item{variant_count}{\code{NA_integer_} (default) or a numeric vector
specifying the numbers of variants in the VCF file(s) in \code{vcf.fn};
only applicable when multiple cores are used; if the number of variants
is known, the conversion can skip counting the variants before
splitting the file(s); \code{variant_count} could be an approximate}
\item{optimize}{if \code{TRUE}, optimize the access efficiency by calling
\code{\link{cleanup.gds}}}
\item{raise.error}{\code{TRUE}: throw an error if numeric conversion fails;
Expand Down

0 comments on commit 4cb50f9

Please sign in to comment.