Skip to content

Commit

Permalink
Updating package to include source DropletQC code
Browse files Browse the repository at this point in the history
  • Loading branch information
AlicenJoyHenning committed Aug 13, 2024
1 parent 51838ec commit 0b4f6fd
Show file tree
Hide file tree
Showing 10 changed files with 522 additions and 16 deletions.
1 change: 0 additions & 1 deletion .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ jobs:
- name: Install GitHub dependencies
run: |
R -e "devtools::install_github('powellgenomicslab/DropletQC')"
R -e "devtools::install_deps(dependencies = TRUE)"
- uses: r-lib/actions/check-r-package@v2
Expand Down
13 changes: 7 additions & 6 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
Package: limiric
Title: ScRNA-seq Quality Control by Damaged Cell Detection
Title: Damaged Cell Detection for Single Cell RNA Sequencing Quality Control
Version: 1.0.0
Authors@R:
person("Alicen", "Henning", email = "alicen.jhb@gmail.com", role = c("aut", "cre"))
person("Alicen", "Henning", email = "alicen.jhb@gmail.com", role = c("aut", "cre", "cph"))
Description:
Quality control package for high precision, sample-specific damaged cell detection through low dimension mitochondrial and ribosomal cluster selection.
A quality control tool for single cell RNA sequencing that enables
high precision, sample-specific damaged cell detection through
low-dimension mitochondrial and ribosomal cluster selection.
License: GPL (>= 3)
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
Expand All @@ -15,15 +17,14 @@ LazyData: true
Imports:
cowplot,
dplyr,
DropletQC,
ggplot2,
ks,
magick,
mclust,
Matrix,
png,
Seurat,
SoupX
Remotes:
powellgenomicslab/DropletQC@main
Suggests:
knitr,
devtools,
Expand Down
6 changes: 5 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Generated by roxygen2: do not edit by hand

export(identify_damage_cells)
export(identify_empty_droplets)
export(limiric)
export(limiricCore)
import(DropletQC)
import(Matrix)
import(Seurat)
import(SoupX)
Expand All @@ -18,6 +19,9 @@ importFrom(dplyr,mutate)
importFrom(dplyr,pull)
importFrom(dplyr,slice)
importFrom(dplyr,summarise)
importFrom(ks,hpi)
importFrom(ks,kde)
importFrom(mclust,mclustBIC)
importFrom(stats,setNames)
importFrom(utils,data)
importFrom(utils,write.csv)
238 changes: 238 additions & 0 deletions R/identify_damage_cells.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
#' Assess EM model results and assign cells as damaged or not
#'
#' @name assess_EM
#'
#' @description powellgenomicslab/DropletQC function accepts the EM results for a particular cell type,
#' as well as the UMI and nuclear fraction thresholds, and assigns cells as
#' "cell" or "damaged_cell" This is a helper function called by
#' `identify_damaged_cells` and was not intended for more general use.
#'
#' @param em Mclust, result of running EM on the log10(UMI counts) and the
#' nuclear fraction to asses whether there is likely to be two cell
#' populations; cells and damaged cells, or just cells
#' @param umi_thresh numeric, percentage of the cell disitribution UMI counts
#' which the damaged_cell distribution must be below in order to be accepted
#' as damaged cells. For example if the mean UMI count of the distribution fit
#' to the cell population is 1,000 and `umi_thresh` is set to 30, the
#' damaged_cell distribution mean must be less than 700 to be classified as
#' damaged_cells, otherwise all barcodes will be returned as cells
#' @param nf_thresh numeric, the minimum difference in the nuclear fraction
#' score required between the two distributions (cells and damaged_cells) for
#' the damaged_cell distribution to be accepted as real
#'
#' @return data frame, with three columns containing the log10(UMI counts),
#' nuclear fraction score and the assigned cell status; "cell" or
#' "damaged_cell"
#'
#' @keywords internal
assess_EM <- function(em, umi_thresh, nf_thresh){

# Separately for each cell type, assign a barcode as "cell" or "damaged_cell"
# based on the EM results using the following sequential procedure:

# 1. If the EM model selected contained only one distribution, score all
# barcodes as "cell".
check_1 <- em$G==2

# 2. If two distirbutions were fit, check the distribution with the higher
# nuclear_fraction mean also has a lower umi mean. If this criteria is
# satisfied, we consider the population with the lower umi count and higher
# nuclear_fraction scores the damaged_cell population and move on to step 3.
# If this is not the case we score all barcodes as "cell".
if(check_1){
nf_means <- em$parameters$mean["nf",]
umi_means <- em$parameters$mean["umi",]
check_2 <- umi_means[which.max(nf_means)] < umi_means[which.min(nf_means)]
} else {
check_2 <- FALSE
}

# 3. If 2. was satisfied, we check the damaged_cell nuclear_fraction
# distribution mean is at least nf_thresh greater than the cell mean. Also
# check the damaged_cell umi distribution mean is at least umi_thresh
# percent lower than the cell umi distribution mean. If these two criteria
# are satisfied we assign the damaged cells, otherwise all barcodes are
# labelled "cell".
if(check_2){
# Check nuclear fraction threshold is satisfied
nf_check <- nf_means[which.max(nf_means)] - nf_means[which.min(nf_means)] > nf_thresh
# Check umi threshold is satisfied
damaged_cell_umi <- 10^umi_means[which.max(nf_means)]
cell_umi <- 10^umi_means[which.min(nf_means)]
umi_check <- damaged_cell_umi < (cell_umi - cell_umi*(umi_thresh/100))

if(all(nf_check, umi_check)){ check_3 <- TRUE } else { check_3 <- FALSE }
} else {
check_3 <- FALSE
}

# Assign barcodes
em_classification <- data.frame(nf = em$data[,"nf"],
umi = em$data[,"umi"],
classification= em$classification)
row.names(em_classification) <- row.names(em$data)

if(all(check_1, check_2, check_3)){
damaged_cells <- em_classification$classification==which.max(em$parameters$mean["nf",])
em_classification$classification[damaged_cells] <- "damaged_cell"
em_classification$classification[!damaged_cells] <- "cell"
} else {
em_classification$classification <- "cell"
}

return(em_classification)
}


#' Identify damage cells
#'
#' @name identify_damage_cells
#'
#' @description powellgenomicslab/DropletQC function uses a combination of the cell UMI counts and the
#' nuclear fraction score to assign each cell one of two values; "cell" or
#' "damaged_cell". This is based on the idea that damaged cells have a lower
#' UMI count and higher nuclear fraction than whole cells. The expected input
#' is a data frame with four columns. The first three columns should contain;
#' the nuclear fraction score, total UMIs and a character vector describing
#' each cell as "cell" or "empty_droplet". This is the format output by the
#' `identify_empty_drops` function. The fourth column should be a character
#' vector with user-assigned cell types. Internally, the provided data frame
#' is split by cell type and a Gaussian mixture model with a maximum of two
#' components is fit to the umi counts and nuclear fraction scores. The
#' parameters of the model are estimated using expectation maximisation (EM)
#' with the `mclust` package. The best model is selected using the Bayesian
#' Information Criterion (BIC). The two populations (cells and damaged cells)
#' are assumed to have equal variance (mclust model name "EEI").
#'
#' @param nf_umi_ed_ct data frame, with four columns. The first three columns
#' should match the output from the `identify_empty_drops` function. The
#' fourth column should contain cell type names.
#' @param nf_sep numeric, the minimum separation of the nuclear fraction score
#' required between the cell and damage cell populations
#' @param umi_sep_perc numeric, this is the minimum percentage of UMIs which the
#' damaged cell population is required to have compared to the cell
#' population. For example, if the mean UMI of the distribution fit to the
#' whole cell population is 10,000 UMIs, the mean of the distribution fit to
#' the damaged cell population must be at less than 7,000 UMIs if the umi_sep
#' parameter is 30 (%)
#' @param verbose logical, whether to print updates and progress while fitting
#' with EM
#'
#' @return list, of length two. The first element in the list contains a data
#' frame with the same dimensions input to the `nf_umi_ed_ct` argument, with
#' "damaged_cell" now recorded in the third column.
#'
#' @export
#'
#' @importFrom ks kde hpi
#' @importFrom mclust mclustBIC
#'
#' @examples
#' \dontrun{
#' data("qc_examples")
#' gbm <- qc_examples[qc_examples$sample=="MB",]
#' gbm.ed <- gbm[,c("nuclear_fraction_droplet_qc","umi_count")]
#' gbm.ed <- identify_empty_drops(nf_umi = gbm.ed)
#' gbm.ed$cell_type <- gbm$cell_type
#' gbm.ed.dc <- identify_damaged_cells(gbm.ed, verbose=FALSE)
#' gbm.ed.dc <- gbm.ed.dc[[1]]
#' head(gbm.ed.dc)
#' table(gbm.ed.dc$cell_status)
#' }
#'

utils::globalVariables(c("nf_umi"))

identify_damage_cells <- function(nf_umi_ed_ct,
nf_sep=0.15,
umi_sep_perc=50, # UMI counts percentage less than cell
verbose=TRUE){

# Check nf_umi_ed_ct argument
if (any(class(nf_umi_ed_ct) == "data.frame")) {

# Check four columns exist
if(ncol(nf_umi_ed_ct)!=4){
stop(paste0("nf_umi_ed_ct should be a data frame with four columns, see function arguments"), call.=FALSE)
}

# Assume nuclear fraction is in the first column
nf <- unlist(nf_umi_ed_ct[, 1], use.names = FALSE)
# Assume UMI counts are in the second column
umi <- unlist(nf_umi_ed_ct[, 2], use.names = FALSE)
# Assume third column contains "cell" or "empty_droplet"
ed <- unlist(nf_umi_ed_ct[, 3], use.names = FALSE)
# Assume fourth column contains cell types
ct <- unlist(nf_umi_ed_ct[, 4], use.names = FALSE)


# Check values are reasonable
if(any(c(max(nf)>1, min(nf)<0))){
warning(paste0("The nuclear fraction values provided in the first column of 'nf_umi_ed_ct' should be between 0 and 1, but values outside this range were identified : minimum = ",min(nf),", maximum = ",max(nf)), call.=FALSE)
}

if(!all(umi == floor(umi))){
non_integer_examples <- which(umi != floor(umi))
if(length(non_integer_examples)>5){
non_integer_examples <- non_integer_examples[1:5]
}
non_integer_examples <- paste(umi[non_integer_examples], collapse = ",")
warning(paste0("Non-integer values detected in the second column of 'nf_umi_ed_ct' (e.g. ",non_integer_examples,") where umi counts were expected"), call.=FALSE)
}

if(max(umi)<100){
warning(paste0("The total umi counts provided in the second column of 'nf_umi_ed_ct' appear to be quite low (max = ",max(umi),"), are these the total UMI counts per cell?"), call.=FALSE)
}

if(!all(unique(ed)%in%c("cell", "empty_droplet"))){
ed_output <- unique(ed)
if(length(ed_output)>5){
ed_output <- ed_output[1:5]
ed_output <- paste(ed_output, collapse = ",")
}
warning(paste0("The third column of 'nf_umi_ed_ct' was expected to contain either 'cell' or 'empty_droplet' but contains; ",ed_output), call.=FALSE)
}

if(verbose){
ct <- unique(ct)
ct <- paste(ct, collapse = ",")
print(paste0("The following cell types were provided; ", ct))
}

} else {
stop(paste0("A data frame should be supplied to the nf_umi_ed_ct argument, but an object of class ",paste(class(nf_umi), collapse = "/")," was provided"), call.=FALSE)
}

# Extract data for EM
em.data <- data.frame(nf = unlist(nf_umi_ed_ct[,1], use.names = FALSE),
umi = log10(unlist(nf_umi_ed_ct[,2], use.names = FALSE)),
ct = unlist(nf_umi_ed_ct[,4], use.names = FALSE))
row.names(em.data) <- 1:nrow(em.data)

# Filter out any empty droplets
em.data <- em.data[nf_umi_ed_ct[,3]=="cell",]

# Split by cell type
em.data.ct <-split(em.data, em.data$ct)

# Run EM for all cell types
if(verbose){ print("Fitting models with EM") }
em_mods <- lapply(em.data.ct, function(x) mclust::Mclust(data = x[,1:2], G = 1:2, modelNames = "EEI", verbose = verbose))

# Assign barcodes as "cell" or "damaged_cell" using the `assess_EM` function
em_mods_assessed <- lapply(em_mods,
assess_EM,
nf_thresh = nf_sep,
umi_thresh = umi_sep_perc)


# Update the input data frame "nf_umi_ed_ct" (which still contains empty
# droplets) any with damaged_cell info
names(em_mods_assessed) <- NULL
em_mods_assessed <- do.call(rbind, em_mods_assessed)
nf_umi_ed_ct$cell_status[as.integer(row.names(em_mods_assessed))] <- em_mods_assessed$classification

# Return results as a data frame
return(list(df=nf_umi_ed_ct, plots=NULL))

}
Loading

0 comments on commit 0b4f6fd

Please sign in to comment.