This repository has been archived by the owner on Aug 11, 2023. It is now read-only.
forked from davismcc/archive-scater
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcalculate-expression.R
175 lines (156 loc) · 6.36 KB
/
calculate-expression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
## A set of functions for calculating and summarising expression values
#' Calculate which features are expressed in which cells using a threshold on
#' observed counts, transcripts-per-million, counts-per-million, FPKM, or
#' defined expression levels.
#'
#' @param object an SCESet object with expression and/or count data.
#' @param lowerDetectionLimit numeric scalar giving the minimum expression level
#' for an expression observation in a cell for it to qualify as expressed.
#' @param exprs_data character scalar indicating whether the count data
#' (\code{"counts"}), the transformed expression data (\code{"exprs"}),
#' transcript-per-million (\code{"tpm"}), counts-per-million (\code{"cpm"}) or
#' FPKM (\code{"fpkm"}) should be used to define if an observation is expressed
#' or not.
#' @return a logical matrix indicating whether or not a feature in a particular
#' cell is expressed.
#' @export
#' @examples
#' data("sc_example_counts")
#' data("sc_example_cell_info")
#' example_sceset <- newSCESet(countData=sc_example_counts)
#' is_exprs(example_sceset) <- calcIsExprs(example_sceset, lowerDetectionLimit = 1,
#' exprs_data = "exprs")
calcIsExprs <- function(object, lowerDetectionLimit = NULL, exprs_data = "counts")
{
if ( !is(object, "SCESet") )
stop("Object must be an SCESet.")
## Check that args are appropriate
exprs_data <- match.arg(exprs_data, c("counts", "exprs", "tpm", "cpm", "fpkm"))
dat_matrix <- switch(exprs_data,
counts = counts(object),
exprs = exprs(object),
tpm = tpm(object),
cpm = cpm(object),
fpkm = fpkm(object))
if ( is.null(dat_matrix) )
stop(paste0("Tried to use ", exprs_data, " as expression data, but ",
exprs_data, "(object) is null."))
#
# if ( exprs_data == "counts" ) {
# dat_matrix <- counts(object)
# }
# else {
# if ( exprs_data == "exprs" )
# dat_matrix <- exprs(object)
# else
#
# }
## Extract lowerDetectionLimit if not provided
if ( is.null(lowerDetectionLimit) )
lowerDetectionLimit <- object@lowerDetectionLimit
## Decide which observations are above detection limit and return matrix
isexprs <- dat_matrix > lowerDetectionLimit
rownames(isexprs) <- rownames(dat_matrix)
colnames(isexprs) <- colnames(dat_matrix)
isexprs
}
#' Calculate transcripts-per-million (TPM)
#'
#' Calculate transcripts-per-million (TPM) values for expression from counts for
#' a set of features.
#'
#' @param object an \code{SCESet} object
#' @param effective_length vector of class \code{"numeric"} providing the
#' effective length for each feature in the \code{SCESet} object
#' @param calc_from character string indicating whether to compute TPM from
#' \code{"counts"}, \code{"norm_counts"}, \code{"fpkm"} or \code{"norm_fpkm"}.
#' Default is to use \code{"counts"}, in which case the \code{effective_length}
#' argument must be supplied.
#'
#' @return Matrix of TPM values.
#' @export
#' @examples
#' data("sc_example_counts")
#' data("sc_example_cell_info")
#' pd <- new("AnnotatedDataFrame", data = sc_example_cell_info)
#' example_sceset <- newSCESet(countData = sc_example_counts, phenoData = pd)
#' effective_length <- rep(1000, 2000)
#' tpm(example_sceset) <- calculateTPM(example_sceset, effective_length,
#' calc_from = "counts")
#'
calculateTPM <- function(object, effective_length = NULL, calc_from = "counts") {
if ( !is(object, "SCESet"))
stop("object must be an SCESet")
## Check that arguments are correct
calc_from <- match.arg(calc_from, c("counts", "norm_counts", "fpkm",
"norm_fpkm"), several.ok = FALSE)
if ( calc_from == "counts" || calc_from == "norm_counts" ) {
if ( is.null(effective_length) )
stop("effective_length argument is required if computing TPM from counts")
}
## Compute values to return
tpm_to_add <- switch(calc_from,
counts = .countToTpm(counts(object), effective_length),
norm_counts = .countToTpm(norm_counts(object),
effective_length),
fpkm = .fpkmToTpm(fpkm(object)),
norm_fpkm = .fpkmToTpm(norm_fpkm(object)))
## Return TPM values
tpm_to_add
}
#' Calculate fragments per kilobase of exon per million reads mapped (FPKM)
#'
#' Calculate fragments per kilobase of exon per million reads mapped (FPKM)
#' values for expression from counts for a set of features.
#'
#' @param object an \code{SCESet} object
#' @param effective_length vector of class \code{"numeric"} providing the
#' effective length for each feature in the \code{SCESet} object
#'
#' @return Matrix of FPKM values.
#' @export
#' @examples
#' data("sc_example_counts")
#' data("sc_example_cell_info")
#' example_sceset <- newSCESet(countData = sc_example_counts)
#' effective_length <- rep(1000, 2000)
#' fpkm(example_sceset) <- calculateFPKM(example_sceset, effective_length)
#'
calculateFPKM <- function(object, effective_length) {
if ( !is(object, "SCESet"))
stop("object must be an SCESet")
## Compute values to add
fpkm_to_add <- .countToFpkm(counts(object), effective_length)
## Return matrix of FPKM values
fpkm_to_add
}
.countToTpm <- function(counts, eff_len) {
## Expecting a count matrix of nfeatures x ncells
## can't have any zero counts, so expect to apply offset
counts0 <- counts
counts0[counts == 0] <- NA
rate <- log(counts0) - log(eff_len)
denom <- log(colSums(exp(rate)))
out <- exp( t(t(rate) - denom) + log(1e6) )
out[is.na(out)] <- 0
out
}
.countToFpkm <- function(counts, eff_len) {
## Expecting a count matrix of nfeatures x ncells
## Need to be careful with zero counts
counts0 <- counts
counts0[counts == 0] <- NA
N <- colSums(counts)
logfpkm <- log(counts0) + log(1e9) - log(eff_len)
logfpkm <- t(t(logfpkm) - log(N))
out <- exp( logfpkm )
out[is.na(out)] <- 0
out
}
.fpkmToTpm <- function(fpkm) {
## Expecting an FPKM matrix of nfeatures x ncells
exp( t(t(log(fpkm)) - log(colSums(fpkm))) + log(1e6) )
}
.countToEffCounts <- function(counts, len, eff_len) {
counts * (len / eff_len)
}