refactor(PAB): moved functions definitions to a separate script

hdescobarh · May 28, 2024 · bc46ee6 · bc46ee6
1 parent 59e563b
commit bc46ee6
Show file tree

Hide file tree

Showing 2 changed files with 105 additions and 90 deletions.
diff --git a/PAB_Lab13_diff_expr/Code/DE_customized_functions.r b/PAB_Lab13_diff_expr/Code/DE_customized_functions.r
@@ -0,0 +1,101 @@
+# author: hdescobarh
+
+missing_packages <- character()
+for (package in c(
+  "edgeR"
+)) {
+  if (!require(package, quietly = TRUE, character.only = TRUE)) {
+    missing_packages <- append(missing_packages, package)
+  }
+}
+
+if (length(missing_packages) > 0) {
+  stop(
+    sprintf(
+      "Missing the following packages: %s",
+      paste(missing_packages, collapse = ", ")
+    ),
+    call. = FALSE
+  )
+}
+
+normalization_and_dispersion <- function(
+    non_normalized_data, normalization_method, design, output_directory_path) {
+  # Calculate scaling factors for transforming
+  #   library sizes to effective library sizes
+  data_normalized <- edgeR::normLibSizes(
+    non_normalized_data,
+    method = normalization_method
+  )
+
+  edgeR::plotMDS(
+    data_normalized,
+    col = as.numeric(data_normalized$samples$treat)
+  )
+  # Estimates Common, Trended and Tagwise NB dispersions
+  data_normalized <- edgeR::estimateDisp(data_normalized, design)
+  edgeR::plotBCV(data_normalized)
+  edgeR::plotMeanVar(data_normalized, show.tagwise.vars = TRUE, NBline = TRUE)
+  data_normalized
+}
+
+model_fit <- function(
+    data_normalized, normalization_method, contrasts, output_directory_path) {
+  # The QL F-test's statistic reflects the uncertainty in the dispersion
+  #   estimation for each gene and gives more control over type I error rate.
+
+  fit <- edgeR::glmQLFit(data_normalized)
+  edgeR::plotQLDisp(fit)
+
+  # Heat map
+  # The documentation suggest to use "moderated log-counts-per-million"
+  # for drawing a head map of individual RNA-seq samples
+
+  # get cpm log2 values using the fitted coefficients
+  log_cpm <- edgeR::cpm(fit, log = TRUE)
+  heatmap(log_cpm, scale = "row", col = heat.colors(256))
+
+  fit
+}
+
+test_models <- function(
+    fit, normalization_method, contrasts, output_directory_path) {
+  for (current_contrast in colnames(contrasts)) {
+    # Performs genewise quasi F-tests for the given contrast
+    qlf_test <- edgeR::glmQLFTest(
+      fit,
+      contrast = contrasts[, current_contrast]
+    )
+
+    # Table rows =min(n, number of genes with
+    #   adjusted p-value <= cutoff p-value), the default p-value cutoff is 1!!!
+    top <- edgeR::topTags(qlf_test, n = Inf)
+    # Save top table
+    top_table_file_name <- sprintf(
+      "%s/top_DE_%s.tsv",
+      output_directory_path,
+      current_contrast
+    )
+    write.table(
+      top,
+      file = top_table_file_name, append = FALSE, quote = FALSE, sep = "\t"
+    )
+
+    # Make a mean-difference plot with smearing of points with very low counts
+    # Highlight top FDR < 0.05
+    fdr_threshold <- 0.05
+    top_selected_ids <- top$table[top$table$FDR < fdr_threshold, 1]
+
+    smear_file_name <- sprintf(
+      "%s/smear_MDplot_%s.pdf",
+      output_directory_path,
+      current_contrast
+    )
+    pdf(file = smear_file_name)
+    edgeR::plotSmear(qlf_test, de.tags = top_selected_ids)
+    abline(
+      h = c(-0.5, 0, 0.5), col = c("#0459ad", "red", "#0459ad"), lty = 2
+    )
+    dev.off()
+  }
+}
diff --git a/PAB_Lab13_diff_expr/Code/do_DE_analysis.r b/PAB_Lab13_diff_expr/Code/do_DE_analysis.r
@@ -25,6 +25,8 @@ if (!dir.exists(output_directory_path)) {
   dir.create(output_directory_path)
 }
 
+# Load customized routines
+source("DE_customized_functions.r")
 
 ########################## LOAD AND VALIDATE DATA ##########################
 
@@ -58,7 +60,7 @@ data <- DGEList(counts = data, samples = samples)
 ########################## ANALYSIS ##########################
 
 
-############ 1. Set experiment design and contrasts
+############ Set experiment design and contrasts
 
 design <- model.matrix(~ 0 + group, data = data$samples)
 colnames(design) <- levels(data$samples$group)
@@ -93,98 +95,10 @@ my_contrasts <- makeContrasts(
   levels = design
 )
 
-############ 2. Filtering
+############ Filtering
 
 # keeps genes with CPM >= CPM.cutoff in MinSampleSize samples
 #   such that CPM.cutoff = min.count/median(lib.size)*1e6
 
 keep <- filterByExpr(data, group = group, min.count = 200)
 data <- data[keep, , keep.lib.sizes = FALSE]
-
-############ 3. Data Normalization & Dispersion estimation
-
-
-normalization_and_dispersion <- function(
-    non_normalized_data, normalization_method, design) {
-  # Calculate scaling factors for transforming
-  #   library sizes to effective library sizes
-  data_normalized <- edgeR::normLibSizes(
-    non_normalized_data,
-    method = normalization_method
-  )
-
-  edgeR::plotMDS(
-    data_normalized,
-    col = as.numeric(data_normalized$samples$treat)
-  )
-  # Estimates Common, Trended and Tagwise NB dispersions
-  data_normalized <- edgeR::estimateDisp(data_normalized, design)
-  edgeR::plotBCV(data_normalized)
-  edgeR::plotMeanVar(data_normalized, show.tagwise.vars = TRUE, NBline = TRUE)
-  data_normalized
-}
-
-
-############ 4. Model fit
-
-
-
-model_fit <- function(data_normalized, contrasts) {
-  # The QL F-test's statistic reflects the uncertainty in the dispersion
-  #   estimation for each gene and gives more control over type I error rate.
-
-  fit <- edgeR::glmQLFit(data_normalized)
-  edgeR::plotQLDisp(fit)
-
-  # Heat map
-  # The documentation suggest to use "moderated log-counts-per-million"
-  # for drawing a head map of individual RNA-seq samples
-
-  # get cpm log2 values using the fitted coefficients
-  log_cpm <- edgeR::cpm(fit, log = TRUE)
-  heatmap(log_cpm, scale = "row", col = heat.colors(256))
-
-  fit
-}
-############ 6. differential expression tests
-
-test_models <- function(fit, contrasts) {
-  for (current_contrast in colnames(my_contrasts)) {
-    # Performs genewise quasi F-tests for the given contrast
-    qlf_test <- edgeR::glmQLFTest(
-      fit,
-      contrast = my_contrasts[, current_contrast]
-    )
-
-    # Table rows =min(n, number of genes with
-    #   adjusted p-value <= cutoff p-value), the default p-value cutoff is 1!!!
-    top <- edgeR::topTags(qlf_test, n = Inf)
-    # Save top table
-    top_table_file_name <- sprintf(
-      "%s/top_DE_%s.tsv",
-      output_directory_path,
-      current_contrast
-    )
-    write.table(
-      top,
-      file = top_table_file_name, append = FALSE, quote = FALSE, sep = "\t"
-    )
-
-    # Make a mean-difference plot with smearing of points with very low counts
-    # Highlight top FDR < 0.05
-    fdr_threshold <- 0.05
-    top_selected_ids <- top$table[top$table$FDR < fdr_threshold, 1]
-
-    smear_file_name <- sprintf(
-      "%s/smear_MDplot_%s.pdf",
-      output_directory_path,
-      current_contrast
-    )
-    pdf(file = smear_file_name)
-    edgeR::plotSmear(qlf_test, de.tags = top_selected_ids)
-    abline(
-      h = c(-0.5, 0, 0.5), col = c("#0459ad", "red", "#0459ad"), lty = 2
-    )
-    dev.off()
-  }
-}