Skip to content
Merged

Dev #18

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions randPedPCA/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: randPedPCA
Type: Package
Title: Fast PCA for large pedigrees
Version: 0.9.4
Version: 0.9.6
Authors@R: c(
person("Hanbin", "Lee", email = "hblee@umich.edu", role = "aut"),
person("Hannes", "Becher", email = "h.becher@ed.ac.uk", role = "cre"),
Expand All @@ -23,7 +23,8 @@ Depends:
Suggests:
knitr,
rmarkdown,
testthat (>= 3.0.0)
testthat (>= 3.0.0),
rgl
RoxygenNote: 7.3.2
Config/testthat/edition: 3
VignetteBuilder: knitr
2 changes: 2 additions & 0 deletions randPedPCA/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ S3method(plot,rppca)
S3method(rppca,pedigree)
S3method(rppca,spam)
S3method(summary,rppca)
export(dspc)
export(hutchpp)
export(importLinv)
export(plot3D)
export(randRangeFinder)
export(randSVD)
export(rppca)
Expand Down
56 changes: 56 additions & 0 deletions randPedPCA/R/downsample.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@



#' Add downsampling index to rppca object
#'
#' This index is used by plot.rppca to downsample the col (colour) values. It
#' is stored in the rppca object's ds slot.
#'
#' @param pc an object of class rppca
#' @param to The down-sampling parameter. A numeric > 0 or a vector or NA. Interpreted
#' as a proportion or integer or a index vector, see details.
#'
#' @details
#' The parameter `to` is used to specify and possibly which individuals are sampled.
#' If NA, all individuals are retained. If `to` is of length one and is between 0 and 1,
#' then it is interpreted as a proportion. If it is greater than 1, it is taken to be
#' the number of individuals to be sampled (possibly rounded by sample.int). If
#' `to` is a logical or an integer vector, it is used for logical or integer indexing, respectively.
#' The integer indices of the sample individuals are written to the `ds` slot.
#' If `ds` exists, it is overwritten with a warning.
#'
#' @return An (invisible) object of class `rppca` with a slot `ds` added.
#'
#' @export
dspc <- function(pc, to=10000){
stopifnot(inherits(pc, "rppca"))

nr <- nrow(pc$x)


if(length(to)==1){ # If 'to' is a scalar
if(is.na(to)){ # "sample" all
pc$ds <- 1:nr
return(invisible(pc))
}

stopifnot(to > 0)
if(to <1) n <- ceiling(nr * to) else n <- to
if( n < nr){
n <- min(c(n, nr))
message(paste0("Downsampling to ", n, " individuals."))
ind <- sample.int(nr, n)
} else {
ind <- 1:nr
}
if(!is.null(pc$ds)) warning("The existing downsampling slot was overwritten.")
pc$ds <- ind
return(invisible(pc))
} else { # If 'to' is a vector
pc$ds <- (1:nr)[to]
message(paste0("Downsampling to ", length(pc$ds), " individuals."))
return(invisible(pc))
}

stop("Should never reach this point.")
}
26 changes: 21 additions & 5 deletions randPedPCA/R/plot.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,34 @@

#' @method plot rppca
#' @export
plot.rppca <- function(x, dims=c(1,2), ...){
ss <- summary(x)$importance
if(dim(ss)[1] == 3){
plot.rppca <- function(x, dims=c(1,2), to=10000, col=NULL, ...){
ss <- summary(x)$importance # get variance components

if(is.null(x$ds)) x <- dspc(x, to) # add index for downsampling (if not present)

if(!is.null(col)) {
if(length(col)>1){
# warn if col has different length to number of individuals
if(length(col) != nrow(x$x)) warning(paste0("The length of col was ", length(col), " but there were ", nrow(x$x), " individuals."))

message("Downsampling colours.")
cols <- col[x$ds] # downsample colours if any
} else {
cols <- col
}
}
if(dim(ss)[1] == 3){ # add var proportions if any
plot(
x$x[,dims],
x$x[x$ds,dims],
xlab=paste0("PC", dims[1], " (", signif(100*ss[2,dims[1]], 3), "%)"),
ylab=paste0("PC", dims[2], " (", signif(100*ss[2,dims[2]], 3), "%)"),
col = cols,
...
)
} else {
plot(
x$x[,dims],
x$x[x$ds,dims],
col = cols,
...
)
}
Expand Down
44 changes: 44 additions & 0 deletions randPedPCA/R/plot3D.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@




#' 3D plot using rgl
#'
#' A simple wrapper around rgl's pot3d function.
#'
#' @param x, an rppca object
#'
#' @param dims, vector of length 3 - indices of the PCs to plot
#' @param ... additional arguments passed to rgl::plot3d
#'
#' @details
#' Note, different to `plot.rppca`, which is relatively slow, `plot3D` does
#' not down-sample the principal components and it ignores the `ds` slot of an
#' `rppca` object if present.
#'
#'
#' @export
plot3D <- function(x, dims=c(1,2,3), ...) {
if (!requireNamespace("rgl", quietly = TRUE)) {
stop(
"Package \"rgl\" must be installed to use this function.",
call. = FALSE
)
} else {
ss <- summary(x)$importance
if(dim(ss)[1] == 3){
rgl::plot3d(
x$x[,dims],
xlab=paste0("PC", dims[1], " (", signif(100*ss[2,dims[1]], 3), "%)"),
ylab=paste0("PC", dims[2], " (", signif(100*ss[2,dims[2]], 3), "%)"),
zlab=paste0("PC", dims[3], " (", signif(100*ss[2,dims[3]], 3), "%)"),
...
)
} else {
plot3d(
x$x[,dims],
...
)
}
}
}
30 changes: 30 additions & 0 deletions randPedPCA/man/dspc.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions randPedPCA/man/plot3D.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file added randPedPCA/tests/testthat/Rplots.pdf
Binary file not shown.
21 changes: 21 additions & 0 deletions randPedPCA/tests/testthat/test_rppca.R
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,29 @@ test_that("Comparing Hutch++ estimate to inbreeding-based vals (with centring)",



# Subsampling -------------------------------------------------------------

test_that("Sub-sampling", {
pc <- rppca(pedLInv)

expect_error(dspc(1)) # error, input must inherit 'rppca'

# default val of 'to' is 10k, greateer then number of individuals, no downsampling and no message
expect_no_condition(pcd <- dspc(pc))

expect_warning(pcd <- dspc(pcd)) # warning about overwriting existing index
expect_message(dspc(pc, c(T, F))) # message about to which number of individuals we downsampled
expect_message(dspc(pc, c(1,3))) # same here

# though 100000 is greater than the number of individuals in this PCA
# there is no error/warning. This is handled by R. To high index results in NA.
expect_message(dspc(pc, c(1,3, 100000)))
})


# Plotting ----------------------------------------------------------------
test_that("Plotting",{
pc <- rppca(pedLInv, center=T, totVar=2694.038)
# length of 'col' is different from number of individuals in the PCA
expect_warning(plot(pc, col=c("yellow", "green"), to = 0.5))
})
20 changes: 20 additions & 0 deletions randPedPCA/vignettes/pedigree-pca.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,26 @@ plot(pc06, col=pedMeta$population, main="My pedigree PCA")
plot(pc06, dims=c(1,3), col=pedMeta$population, main="My pedigree PCA,\ncustom PCs shown")
```

## Down-sampling
Because `plot.rppca` relies on R's relatively slow base plotting function, it
is useful to down-sample the individuals to be plotted. Thus, `plot.rppca` will
automatically down-sample to 10,000 individuals if there are more in the dataset.
Technically there is no down-sampling and all the data are retained. Instead, an
integer vector is added in the `ds` slot of the `rppca` object.
That vector is automatically used by `plot.rppca` for individuals and any
colours specified by the `col` parameter. The value of `col` should thus have
the same length as there were individuals in the full dataset.
To adjust the down-sampling, one can use the parameter `to` of `plot.rppca`.
The value of `to` is passed to the function `dspc`, which carries out the down-sampling/vector
generation. See `?dspc` for details. To plot wihthout any down-sampling, do `plot(myPc, to=NA)`.

One potential issue is that every time `plot.rppca` is run, a new set of
individuals is sampled, so that repeated plots of the same dataset will look different.
To retain a specific down-sampling, one can run `dspc` outside of `plot.rppca`,
like `pcDown <- dspc(pc)`. Calling `plot(pcDown)` repeatedly will generate the same plot every time.




# Using one's own data
Users will mainly want to analyse their own empirical or simulated datasets. This
Expand Down