Skip to content

Commit

Permalink
refactor(PAB): moved functions definitions to a separate script
Browse files Browse the repository at this point in the history
  • Loading branch information
hdescobarh committed May 28, 2024
1 parent 59e563b commit bc46ee6
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 90 deletions.
101 changes: 101 additions & 0 deletions PAB_Lab13_diff_expr/Code/DE_customized_functions.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# author: hdescobarh

missing_packages <- character()
for (package in c(
"edgeR"
)) {
if (!require(package, quietly = TRUE, character.only = TRUE)) {
missing_packages <- append(missing_packages, package)
}
}

if (length(missing_packages) > 0) {
stop(
sprintf(
"Missing the following packages: %s",
paste(missing_packages, collapse = ", ")
),
call. = FALSE
)
}

normalization_and_dispersion <- function(
non_normalized_data, normalization_method, design, output_directory_path) {
# Calculate scaling factors for transforming
# library sizes to effective library sizes
data_normalized <- edgeR::normLibSizes(
non_normalized_data,
method = normalization_method
)

edgeR::plotMDS(
data_normalized,
col = as.numeric(data_normalized$samples$treat)
)
# Estimates Common, Trended and Tagwise NB dispersions
data_normalized <- edgeR::estimateDisp(data_normalized, design)
edgeR::plotBCV(data_normalized)
edgeR::plotMeanVar(data_normalized, show.tagwise.vars = TRUE, NBline = TRUE)
data_normalized
}

model_fit <- function(
data_normalized, normalization_method, contrasts, output_directory_path) {
# The QL F-test's statistic reflects the uncertainty in the dispersion
# estimation for each gene and gives more control over type I error rate.

fit <- edgeR::glmQLFit(data_normalized)
edgeR::plotQLDisp(fit)

# Heat map
# The documentation suggest to use "moderated log-counts-per-million"
# for drawing a head map of individual RNA-seq samples

# get cpm log2 values using the fitted coefficients
log_cpm <- edgeR::cpm(fit, log = TRUE)
heatmap(log_cpm, scale = "row", col = heat.colors(256))

fit
}

test_models <- function(
fit, normalization_method, contrasts, output_directory_path) {
for (current_contrast in colnames(contrasts)) {
# Performs genewise quasi F-tests for the given contrast
qlf_test <- edgeR::glmQLFTest(
fit,
contrast = contrasts[, current_contrast]
)

# Table rows =min(n, number of genes with
# adjusted p-value <= cutoff p-value), the default p-value cutoff is 1!!!
top <- edgeR::topTags(qlf_test, n = Inf)
# Save top table
top_table_file_name <- sprintf(
"%s/top_DE_%s.tsv",
output_directory_path,
current_contrast
)
write.table(
top,
file = top_table_file_name, append = FALSE, quote = FALSE, sep = "\t"
)

# Make a mean-difference plot with smearing of points with very low counts
# Highlight top FDR < 0.05
fdr_threshold <- 0.05
top_selected_ids <- top$table[top$table$FDR < fdr_threshold, 1]

smear_file_name <- sprintf(
"%s/smear_MDplot_%s.pdf",
output_directory_path,
current_contrast
)
pdf(file = smear_file_name)
edgeR::plotSmear(qlf_test, de.tags = top_selected_ids)
abline(
h = c(-0.5, 0, 0.5), col = c("#0459ad", "red", "#0459ad"), lty = 2
)
dev.off()
}
}
94 changes: 4 additions & 90 deletions PAB_Lab13_diff_expr/Code/do_DE_analysis.r
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ if (!dir.exists(output_directory_path)) {
dir.create(output_directory_path)
}

# Load customized routines
source("DE_customized_functions.r")

########################## LOAD AND VALIDATE DATA ##########################

Expand Down Expand Up @@ -58,7 +60,7 @@ data <- DGEList(counts = data, samples = samples)
########################## ANALYSIS ##########################


############ 1. Set experiment design and contrasts
############ Set experiment design and contrasts

design <- model.matrix(~ 0 + group, data = data$samples)
colnames(design) <- levels(data$samples$group)
Expand Down Expand Up @@ -93,98 +95,10 @@ my_contrasts <- makeContrasts(
levels = design
)

############ 2. Filtering
############ Filtering

# keeps genes with CPM >= CPM.cutoff in MinSampleSize samples
# such that CPM.cutoff = min.count/median(lib.size)*1e6

keep <- filterByExpr(data, group = group, min.count = 200)
data <- data[keep, , keep.lib.sizes = FALSE]

############ 3. Data Normalization & Dispersion estimation


normalization_and_dispersion <- function(
non_normalized_data, normalization_method, design) {
# Calculate scaling factors for transforming
# library sizes to effective library sizes
data_normalized <- edgeR::normLibSizes(
non_normalized_data,
method = normalization_method
)

edgeR::plotMDS(
data_normalized,
col = as.numeric(data_normalized$samples$treat)
)
# Estimates Common, Trended and Tagwise NB dispersions
data_normalized <- edgeR::estimateDisp(data_normalized, design)
edgeR::plotBCV(data_normalized)
edgeR::plotMeanVar(data_normalized, show.tagwise.vars = TRUE, NBline = TRUE)
data_normalized
}


############ 4. Model fit



model_fit <- function(data_normalized, contrasts) {
# The QL F-test's statistic reflects the uncertainty in the dispersion
# estimation for each gene and gives more control over type I error rate.

fit <- edgeR::glmQLFit(data_normalized)
edgeR::plotQLDisp(fit)

# Heat map
# The documentation suggest to use "moderated log-counts-per-million"
# for drawing a head map of individual RNA-seq samples

# get cpm log2 values using the fitted coefficients
log_cpm <- edgeR::cpm(fit, log = TRUE)
heatmap(log_cpm, scale = "row", col = heat.colors(256))

fit
}
############ 6. differential expression tests

test_models <- function(fit, contrasts) {
for (current_contrast in colnames(my_contrasts)) {
# Performs genewise quasi F-tests for the given contrast
qlf_test <- edgeR::glmQLFTest(
fit,
contrast = my_contrasts[, current_contrast]
)

# Table rows =min(n, number of genes with
# adjusted p-value <= cutoff p-value), the default p-value cutoff is 1!!!
top <- edgeR::topTags(qlf_test, n = Inf)
# Save top table
top_table_file_name <- sprintf(
"%s/top_DE_%s.tsv",
output_directory_path,
current_contrast
)
write.table(
top,
file = top_table_file_name, append = FALSE, quote = FALSE, sep = "\t"
)

# Make a mean-difference plot with smearing of points with very low counts
# Highlight top FDR < 0.05
fdr_threshold <- 0.05
top_selected_ids <- top$table[top$table$FDR < fdr_threshold, 1]

smear_file_name <- sprintf(
"%s/smear_MDplot_%s.pdf",
output_directory_path,
current_contrast
)
pdf(file = smear_file_name)
edgeR::plotSmear(qlf_test, de.tags = top_selected_ids)
abline(
h = c(-0.5, 0, 0.5), col = c("#0459ad", "red", "#0459ad"), lty = 2
)
dev.off()
}
}

0 comments on commit bc46ee6

Please sign in to comment.