Skip to content

Commit

Permalink
feat(PAB): added enviroment set up, data preparation and model fitting
Browse files Browse the repository at this point in the history
  • Loading branch information
hdescobarh committed May 28, 2024
1 parent 7280dff commit 6d42213
Showing 1 changed file with 93 additions and 11 deletions.
104 changes: 93 additions & 11 deletions PAB_Lab13_diff_expr/Code/do_DE_analysis.r
Original file line number Diff line number Diff line change
Expand Up @@ -48,39 +48,121 @@ if (length(missing_packages) > 0) {


data <- read.table(
file = "Example_Data/Segemehl_filTotal3.txt",
header = TRUE, row.names = 1
file = input_file_path, header = TRUE, row.names = 1
)

# Table columns names verification

pattern <- "[DM]\\d+R\\d+"

if (!all(
grepl("[DM]\\d+R\\d+", colnames(data))
grepl(pattern, colnames(data))
)) {
stop(sprintf(
"Invalid columns names. They must follow the format '[DM]\\d+R\\d+'",
paste("", collapse = ", ")
"Invalid columns names. They must follow the format %s",
pattern
))
}

# Set samples data frame

treat <- factor(substr(names(data), 1, 1))
time <- factor(substr(names(data), 2, 3))
group <- factor(paste(treat, time, sep = "_"))
samples <- data.frame(row.names = names(data), treat, time, group)

data <- DGEList(counts = data, samples = samples)

########################## ANALYSIS ##########################


############ 1. Filtering

# filtering keeps genes that have CPM >= CPM.cutoff in MinSampleSize samples,
# where CPM.cutoff = min.count/median(lib.size)*1e6 and MinSampleSize is the
# smallest group sample size or, more generally, the minimum inverse leverage
# computed from the design matrix.

keep <- filterByExpr(data, group = group, min.count = 15)
data <- data[keep, , keep.lib.sizes = FALSE]


############ 2. Data Normalization

# Calculate scaling factors to convert the raw library sizes for a
# set of sequenced samples into normalized effective library sizes.

# method TMMwsp: This is a variant of TMM that is intended to have more stable
# performance when the counts have a high proportion of zeros.

data_normalized <- normLibSizes(data, method = "TMMwsp")
plotMDS(data_normalized, col = as.numeric(data_normalized$samples$treat))


############ 3. Set experiment design and contrasts

# define a coefficient for the expression level of each group

design <- model.matrix(~ 0 + group, data = data_normalized$samples)
colnames(design) <- levels(data_normalized$samples$group)


my_contrasts <- makeContrasts(
# Inside D
DD_12vs03 = D_12 - D_03,
DD_24vs12 = D_24 - D_12,
DD_48vs24 = D_48 - D_24,
DD_24vs03 = D_24 - D_03,
DD_48vs12 = D_48 - D_12,
DD_48vs03 = D_48 - D_03,
# Inside M
MM_12vs03 = M_12 - M_03,
MM_24vs12 = M_24 - M_12,
MM_48vs24 = M_48 - M_24,
MM_24vs03 = M_24 - M_03,
MM_48vs12 = M_48 - M_12,
MM_48vs03 = M_48 - M_03,
# Between D and M - same time
DM_same_03 = D_03 - M_03,
DM_same_12 = D_12 - M_12,
DM_same_24 = D_24 - M_24,
DM_same_48 = D_48 - M_48,
# Between D and M - differences between t and t+1
DM_delta_12vs03 = (D_12 - D_03) - (M_12 - M_03),
DM_delta_24vs12 = (D_24 - D_12) - (M_24 - M_12),
DM_delta_48vs24 = (D_48 - D_24) - (M_48 - M_24),
DM_delta_24vs03 = (D_24 - D_03) - (M_24 - M_03),
DM_delta_48vs03 = (D_48 - D_03) - (M_48 - M_03),
DM_delta_48vs12 = (D_48 - D_12) - (M_48 - M_12),
levels = design
)


############ 2. Filtering
############ 4. Dispersion estimation

# Estimate Common, Trended and Tagwise Negative Binomial dispersions
# by weighted likelihood empirical Bayes
data_normalized <- estimateDisp(data_normalized, design)

############ 3. Data Normalization
cat("Common dispersion: ", data_normalized$common.dispersion, "\n")
plotBCV(data_normalized)
plotMeanVar(data_normalized, show.tagwise.vars = TRUE, NBline = TRUE)

############ 5. Model fit

############ 4. Set experiment design
# The QL F-test statistic reflects the uncertainty in the dispersion estimation
# for each gene and gives more control over type I error rate.

fit <- glmQLFit(data_normalized)
plotQLDisp(fit)

############ 5. Dispersion estimation
# Heat map
# The documentation suggest to use "moderated log-counts-per-million"
# for drawing a head map of individual RNA-seq samples


############ 6. Model fit
# get cpm log2 values using the fitted coefficients
log_cpm <- cpm(fit, log = TRUE)
heatmap(log_cpm, scale = "row", col = heat.colors(256))

############ 7. differential expression tests
############ 6. differential expression tests

0 comments on commit 6d42213

Please sign in to comment.