feat(PAB): added enviroment set up, data preparation and model fitting

hdescobarh · May 28, 2024 · 6d42213 · 6d42213
1 parent 7280dff
commit 6d42213
Showing 1 changed file with 93 additions and 11 deletions.
diff --git a/PAB_Lab13_diff_expr/Code/do_DE_analysis.r b/PAB_Lab13_diff_expr/Code/do_DE_analysis.r
@@ -48,39 +48,121 @@ if (length(missing_packages) > 0) {
 
 
 data <- read.table(
-  file = "Example_Data/Segemehl_filTotal3.txt",
-  header = TRUE, row.names = 1
+  file = input_file_path, header = TRUE, row.names = 1
 )
 
 # Table columns names verification
 
+pattern <- "[DM]\\d+R\\d+"
+
 if (!all(
-  grepl("[DM]\\d+R\\d+", colnames(data))
+  grepl(pattern, colnames(data))
 )) {
   stop(sprintf(
-    "Invalid columns names. They must follow the format '[DM]\\d+R\\d+'",
-    paste("", collapse = ", ")
+    "Invalid columns names. They must follow the format %s",
+    pattern
   ))
 }
 
+# Set samples data frame
+
+treat <- factor(substr(names(data), 1, 1))
+time <- factor(substr(names(data), 2, 3))
+group <- factor(paste(treat, time, sep = "_"))
+samples <- data.frame(row.names = names(data), treat, time, group)
+
+data <- DGEList(counts = data, samples = samples)
+
 ########################## ANALYSIS ##########################
 
 
+############ 1. Filtering
+
+# filtering keeps genes that have CPM >= CPM.cutoff in MinSampleSize samples,
+#   where CPM.cutoff = min.count/median(lib.size)*1e6 and MinSampleSize is the
+#   smallest group sample size or, more generally, the minimum inverse leverage
+#   computed from the design matrix.
+
+keep <- filterByExpr(data, group = group, min.count = 15)
+data <- data[keep, , keep.lib.sizes = FALSE]
+
+
+############ 2. Data Normalization
+
+# Calculate scaling factors to convert the raw library sizes for a
+#   set of sequenced samples into normalized effective library sizes.
 
+# method TMMwsp: This is a variant of TMM that is intended to have more stable
+#   performance when the counts have a high proportion of zeros.
+
+data_normalized <- normLibSizes(data, method = "TMMwsp")
+plotMDS(data_normalized, col = as.numeric(data_normalized$samples$treat))
+
+
+############ 3. Set experiment design and contrasts
+
+# define a coefficient for the expression level of each group
+
+design <- model.matrix(~ 0 + group, data = data_normalized$samples)
+colnames(design) <- levels(data_normalized$samples$group)
+
+
+my_contrasts <- makeContrasts(
+  # Inside D
+  DD_12vs03 = D_12 - D_03,
+  DD_24vs12 = D_24 - D_12,
+  DD_48vs24 = D_48 - D_24,
+  DD_24vs03 = D_24 - D_03,
+  DD_48vs12 = D_48 - D_12,
+  DD_48vs03 = D_48 - D_03,
+  # Inside M
+  MM_12vs03 = M_12 - M_03,
+  MM_24vs12 = M_24 - M_12,
+  MM_48vs24 = M_48 - M_24,
+  MM_24vs03 = M_24 - M_03,
+  MM_48vs12 = M_48 - M_12,
+  MM_48vs03 = M_48 - M_03,
+  # Between D and M - same time
+  DM_same_03 = D_03 - M_03,
+  DM_same_12 = D_12 - M_12,
+  DM_same_24 = D_24 - M_24,
+  DM_same_48 = D_48 - M_48,
+  # Between D and M - differences between t and t+1
+  DM_delta_12vs03 = (D_12 - D_03) - (M_12 - M_03),
+  DM_delta_24vs12 = (D_24 - D_12) - (M_24 - M_12),
+  DM_delta_48vs24 = (D_48 - D_24) - (M_48 - M_24),
+  DM_delta_24vs03 = (D_24 - D_03) - (M_24 - M_03),
+  DM_delta_48vs03 = (D_48 - D_03) - (M_48 - M_03),
+  DM_delta_48vs12 = (D_48 - D_12) - (M_48 - M_12),
+  levels = design
+)
 
 
-############ 2. Filtering
+############ 4. Dispersion estimation
 
+# Estimate Common, Trended and Tagwise Negative Binomial dispersions
+#   by weighted likelihood empirical Bayes
+data_normalized <- estimateDisp(data_normalized, design)
 
-############ 3. Data Normalization
+cat("Common dispersion: ", data_normalized$common.dispersion, "\n")
+plotBCV(data_normalized)
+plotMeanVar(data_normalized, show.tagwise.vars = TRUE, NBline = TRUE)
 
+############ 5. Model fit
 
-############ 4. Set experiment design
+# The QL F-test statistic reflects the uncertainty in the dispersion estimation
+#   for each gene and gives more control over type I error rate.
 
+fit <- glmQLFit(data_normalized)
+plotQLDisp(fit)
 
-############ 5. Dispersion estimation
+# Heat map
+# The documentation suggest to use "moderated log-counts-per-million"
+# for drawing a head map of individual RNA-seq samples
 
 
-############ 6. Model fit
+# get cpm log2 values using the fitted coefficients
+log_cpm <- cpm(fit, log = TRUE)
+heatmap(log_cpm, scale = "row", col = heat.colors(256))
 
-############ 7. differential expression tests
+############ 6. differential expression tests