From e20e477fbddb9d063a9906422d70d28243c2df7a Mon Sep 17 00:00:00 2001 From: Tania Date: Tue, 12 Jul 2022 11:01:54 +0200 Subject: [PATCH] add_discussion_day3 --- scripts/discussion/discussion_day3.R | 578 +++++++++++++++++++++++++++ 1 file changed, 578 insertions(+) create mode 100644 scripts/discussion/discussion_day3.R diff --git a/scripts/discussion/discussion_day3.R b/scripts/discussion/discussion_day3.R new file mode 100644 index 0000000..158115c --- /dev/null +++ b/scripts/discussion/discussion_day3.R @@ -0,0 +1,578 @@ +# -------- Day 3 + +library(Seurat) +# BiocManager::install("edgeR") +library(edgeR) +library(limma) +library(dittoSeq) +library(dplyr) + +# Differential gene expression +setwd("/export/scratch/twyss/SIB_scRNAseq_course/July2022/data/") +seu_int <- readRDS("seu_int_day2_part2.rds") + +# Use wilcoxon test from Seurat (make sure the default cell identity +# is set to what you want it to be) +Idents(seu_int) # res 0.3 with clusters 0-12 +# make sure the Default assay is set to RNA: +DefaultAssay(seu_int) +?FindAllMarkers + +# this takes a while: +# de_genes <- Seurat::FindAllMarkers(seu_int, min.pct = 0.25, +# only.pos = TRUE) +head(de_genes) + +# sometimes p>0.05 are included, remove non-significant genes +range(de_genes$p_val_adj) # [1] 0 1 +de_genes<-subset(de_genes, de_genes$p_val_adj<0.05) + +# write to csv file if you want to store (and to avoid loosing time recalculating) +write.csv(de_genes, "de_genes_FindAllMarkers.csv", row.names = F, quote = F) + +# create dotplot with top 3 markers per cluster: +library(dplyr) +# use of %>% pipe of magrittr package +# What the function does is to pass the left hand side of the operator +# to the first argument of the right hand side of the operator. +top_specific_markers <- de_genes %>% + group_by(cluster) %>% + top_n(3, avg_log2FC) # select top 3 rows by value + +View(top_specific_markers) +dittoSeq::dittoDotPlot(seu_int, vars = unique(top_specific_markers$gene), + group.by = "integrated_snn_res.0.3") +dittoSeq::dittoDotPlot(seu_int, vars = unique(top_specific_markers$gene), + group.by = "SingleR_annot") + +# check if T cell genes are within DE genes +tcell_genes <- c("IL7R", "LTB", "TRAC", "CD3D") +de_genes[de_genes$gene %in% tcell_genes,] + +# pairwise DE between CD4+ and CD8+ T cells +seu_int <- Seurat::SetIdent(seu_int, value = "SingleR_annot") + +deg_cd8_cd4 <- Seurat::FindMarkers(seu_int, + ident.1 = "CD8+ T cells", + ident.2 = "CD4+ T cells", + group.by = seu_int$SingleR_annot, + test.use = "wilcox") +deg_cd8_cd4<-subset(deg_cd8_cd4, deg_cd8_cd4$p_val_adj<0.05) + +View(deg_cd8_cd4) +# to have info on the output, see "Value" section in help: +?FindMarkers +deg_cd8_cd4[c("CD4", "CD8A", "CD8B"),] + +# Plotting the T cell genes only in T cells: +Seurat::VlnPlot(seu_int, + features = c("CD4", "CD8A", "CD8B"), + idents = c("CD8+ T cells", "CD4+ T cells"), + group.by = "SingleR_annot") +# note on surface markers, eg CD4, which is often low at the mRNA level +# CITE-seq for surface marker identification at the protein level + + + + + + + +# Pseudo-bulk DGE analysis with limma, summing counts per cell: +##New script for generating pseudobulk +##This script follows the vignette on this page +#http://bioconductor.org/books/3.14/OSCA.multisample/multi-sample-comparisons.html +#here 591-596 lines are the same as done above +#taking the proB data +proB <- readRDS("course_data/proB.rds") + +DimPlot(proB, group.by = "orig.ident") + +table(proB@meta.data$type) +# ETV6-RUNX1 PBMMC +# 2000 1021 + +head(proB@meta.data) + +Seurat::DefaultAssay(proB) <- "RNA" +Seurat::Idents(proB) <- proB$orig.ident + +## add the patient id also for paired DGE +proB$patient.id<-gsub("ETV6-RUNX1", "ETV6_RUNX1", proB$orig.ident) +proB$patient.id<-sapply(strsplit(proB$patient.id, "-"), '[', 2) + +## here it is new, to allow to perform pseudo-bulk: +##first a mandatory column of sample needs to be added to the meta data that is the grouping factor, should be the samples +proB$sample <- factor(proB$orig.ident) + +##first an sce object is needed +sce_proB <- as.SingleCellExperiment(proB) + +#The needed package has to be installed if not already done: +if (!require("BiocManager", quietly = TRUE)) + install.packages("BiocManager") +BiocManager::install("scuttle") + +library(scuttle) + +##aggregateAcrossCells here it is only aggregated by sample, one could imagine +##to aggregate by sample and by celltype for instance +summed <- aggregateAcrossCells(sce_proB, + id=colData(sce_proB)[,c("sample")]) + +##have a look at the counts +counts(summed)[1:3,] + +#have a look at the colData of our new object summed, can you see type and +#patient.id are there +head(colData(summed)) + +#As in the standard limma analysis generate a DGE object + +y <- DGEList(counts(summed), samples=colData(summed)$sample) + +##filter lowly expressed (recommanded for limma) +keep <- filterByExpr(y, group=summed$type) +y <- y[keep,] + +##see how many genes were kept +summary(keep) + +## Create the design matrix and include the technology as a covariate: +design <- model.matrix(~0 + summed$type + summed$patient.id) + +# Have a look +design + +# change column/rownames names to more simple group names: +colnames(design) <- make.names(c("ETV6-RUNX1", "PBMMC","patient2","patient3")) +rownames(design)<-colData(summed)$sample + +# Create contrasts, i.e. specify which groups we want to compare, here we want +# to find genes differentially expressed between cluster 1 and cluster 2. +contrast.mat <- limma::makeContrasts(ETV6.RUNX1 - PBMMC, + levels = design) + + +dge <- edgeR::calcNormFactors(y) + +#Do limma +vm <- limma::voom(dge, design = design, plot = TRUE) +fit <- limma::lmFit(vm, design = design) +fit.contrasts <- limma::contrasts.fit(fit, contrast.mat) +fit.contrasts <- limma::eBayes(fit.contrasts) + +# Show the top differentially expressed genes: +limma::topTable(fit.contrasts, number = 10, sort.by = "P") +limma_de <- limma::topTable(fit.contrasts, number = Inf, sort.by = "P") +length(which(limma_de$adj.P.Val<0.05)) + +# +tum_vs_norm <- Seurat::FindMarkers(proB, + ident.1 = "ETV6-RUNX1", + ident.2 = "PBMMC", + group.by = "type") +tum_vs_norm <- subset(tum_vs_norm, tum_vs_norm$p_val_adj<0.05) + +merge_limma_FindMarkers <- merge(tum_vs_norm, limma_de, by="row.names", + all.x=T) + + +par(mar=c(4,4,4,4)) +plot(merge_limma_FindMarkers$avg_log2FC, + merge_limma_FindMarkers$logFC, + xlab="log2FC Wilcoxon", ylab="log2FC limma", + pch=15, cex=0.5) +abline(a=0, b=1, col="red") + + + + +# ---- Enrichment analysis +library(clusterProfiler) +library(enrichplot) +# Need a gene annotation package for Human: +# BiocManager::install("org.Hs.eg.db", update = FALSE) +library(org.Hs.eg.db) +# check gene label types allowed, we can use SYMBOL: +AnnotationDbi::keytypes(org.Hs.eg.db) + +# select genes down-regulated in tumor: +tum_down <- subset(tum_vs_norm, + tum_vs_norm$avg_log2FC < -1 & + tum_vs_norm$p_val_adj < 0.05) +tum_down_genes <- rownames(tum_down) # 62 genes + +# over-representation analysis (Fisher test) for down-reg genes: +?enrichGO +tum_vs_norm_go <- clusterProfiler::enrichGO(gene = tum_down_genes, + OrgDb = "org.Hs.eg.db", + keyType = "SYMBOL", + ont = "BP", + minGSSize = 50) +View(tum_vs_norm_go@result) +# remove redundant gene sets: +enr_go <- clusterProfiler::simplify(tum_vs_norm_go) +View(enr_go@result) + +# check if significant gene sets share overlapping genes: +enrichplot::emapplot(enrichplot::pairwise_termsim(enr_go), + showCategory = 30, cex_label_category = 0.5) + +# Test for hallmark gene set over-representation: +?msigdbr +gmt <- msigdbr::msigdbr(species = "human", category = "H") +?clusterProfiler::read.gmt + +tum_vs_norm_enrich <- clusterProfiler::enricher(gene = tum_down_genes, + universe = rownames(proB), + pAdjustMethod = "BH", + TERM2GENE = gmt[,c("gs_name", "gene_symbol")]) +View(tum_vs_norm_enrich@result[which(tum_vs_norm_enrich@result$p.adjust<0.05),]) + + +# GSEA example using gseGO(), with t-statistic of limma output: +# create ranked list of t-statistics: +gene.list<-limma_de$t +names(gene.list)<-rownames(limma_de) +gene.list<-sort(gene.list, decreasing = T) +head(gene.list) +# RPS4Y2 SDC2 CTGF AP005530.2 GNG11 HLA-DQA1 +# 15.39674 15.33434 14.89301 14.27130 13.70183 13.39202 + +set.seed(1234) +ego<-gseGO(geneList = gene.list, + ont="BP", + OrgDb = "org.Hs.eg.db", + keyType = "SYMBOL", + minGSSize = 60, + eps=1e-60, + seed=T) +ego <- clusterProfiler::simplify(ego) +head(ego@result[,c(2:7)]) +# Description setSize enrichmentScore NES pvalue p.adjust +# GO:0000280 nuclear division 292 -0.5139084 -2.583266 3.544230e-23 4.377997e-20 +# GO:0007059 chromosome segregation 267 -0.5233834 -2.608855 6.808704e-23 4.377997e-20 +# GO:0140014 mitotic nuclear division 225 -0.5404579 -2.661770 2.584499e-21 8.309163e-19 +# GO:0098813 nuclear chromosome segregation 208 -0.5369762 -2.606928 2.340434e-19 4.299712e-17 +# GO:0051301 cell division 420 -0.4367175 -2.300783 3.118075e-19 5.012305e-17 +# GO:0010564 regulation of cell cycle process 443 -0.4125029 -2.183002 2.049784e-17 2.928913e-15 + +# barcode plot of the top GO term: +gseaplot(ego, geneSetID = "GO:0000280", title="GO:0000280: nuclear division") + +# Trajectory analysis +library(Seurat) +library(SingleCellExperiment) +library(scater) +library(slingshot) +library(ggplot2) +library(ggbeeswarm) +library(batchelor) +# sudo apt-get update +# sudo apt-get install libcairo2-dev libxt-dev +# devtools::install_github('cole-trapnell-lab/monocle3') +# devtools::install_github('cole-trapnell-lab/monocle3', ref="develop") +library(monocle3) + +# ---- Trajectory with slingshot: + +# Download the dataset from github within Terminal tab: +# cd course_data/ +# wget https://github.com/hemberg-lab/nrg-paper-figures/blob/master/deng-reads.rds?raw=true +# mv deng-reads.rds\?raw\=true deng-reads.rds + +# Import SingleCellExperiment object: +deng_SCE <- readRDS("deng-reads.rds") + +# Change levels as developmental order: +deng_SCE$cell_type2 <- factor(deng_SCE$cell_type2, + levels = c("zy", + "early2cell", + "mid2cell", + "late2cell", + "4cell", + "8cell", + "16cell", + "earlyblast", + "midblast", + "lateblast")) + +# Run PCA using scater: +deng_SCE <- scater::runPCA(deng_SCE, ncomponents = 50) + +# Use the reducedDim function to access the PCA and store the results. +pca <- SingleCellExperiment::reducedDim(deng_SCE, "PCA") + +# Describe how the PCA is stored in a matrix. Why does it have this structure? +head(pca) + +# Add 2 first principal components to SCE object: +deng_SCE$PC1 <- pca[, 1] +deng_SCE$PC2 <- pca[, 2] + +# PCA plot with cells colored according to developmental stage: +ggplot(as.data.frame(colData(deng_SCE)), aes(x = PC1, y = PC2, color = cell_type2)) + + geom_point(size=2, shape=20) + + theme_classic() + + xlab("PC1") + ylab("PC2") + ggtitle("PC biplot") + +# Plot PC1 vs cell_type2. +deng_SCE$pseudotime_PC1 <- rank(deng_SCE$PC1) # rank cells by their PC1 score + +# Create a jitter plot of developmental stage against PC1: +ggplot(as.data.frame(colData(deng_SCE)), aes(x = pseudotime_PC1, y = cell_type2, + colour = cell_type2)) + + ggbeeswarm::geom_quasirandom(groupOnX = FALSE) + + theme_classic() + + xlab("PC1") + ylab("Timepoint") + + ggtitle("Cells ordered by first principal component") + +# Run Slingshot +?slingshot::slingshot +sce <- slingshot::slingshot(deng_SCE, reducedDim = 'PCA') + + +# custom function to plot the PCA based on a slingshot object: + PCAplot_slingshot <- function(sce, draw_lines = TRUE, variable = NULL, legend = FALSE, ...){ + # set palette for factorial variables + palf <- colorRampPalette(RColorBrewer::brewer.pal(8, "Set2")) + # set palette for numeric variables + paln <- colorRampPalette(RColorBrewer::brewer.pal(9, "Blues")) + # extract pca from SingleCellExperiment object + pca <- SingleCellExperiment::reducedDims(sce)$PCA + + if(is.null(variable)){ + col <- "black" + } + if(is.character(variable)){ + variable <- as.factor(variable) + } + if(is.factor(variable)){ + colpal <- palf(length(levels(variable))) + colors <- colpal[variable] + } + if(is.numeric(variable)){ + colpal <- paln(50) + colors <- colpal[cut(variable,breaks=50)] + } + + # draw the plot + plot(pca, bg = colors, pch = 21) + # draw lines + if(draw_lines){ + lines(slingshot::SlingshotDataSet(sce), lwd = 2, ... ) + } + # add legend + if(legend & is.factor(variable)){ + legend("bottomright", pt.bg = colpal,legend = levels(variable),pch=21) + + } +} + +# Plot PC1 versus PC2 with slingshot pseudotime added as line and coloring the cells: +PCAplot_slingshot(sce, variable = sce$slingPseudotime_1, draw_lines = TRUE) + +# Plot developmental stage against pseudotime: + ggplot(as.data.frame(colData(deng_SCE)), aes(x = sce$slingPseudotime_1, + y = cell_type2, + colour = cell_type2)) + + ggbeeswarm::geom_quasirandom(groupOnX = FALSE) + + theme_classic() + + xlab("Slingshot pseudotime") + ylab("Timepoint") + + ggtitle("Cells ordered by Slingshot pseudotime") + + +# Generate unsupervised clustering of the cells using seurat: +gcdata <- Seurat::CreateSeuratObject(counts = SingleCellExperiment::counts(deng_SCE), + project = "slingshot") + +gcdata <- Seurat::NormalizeData(object = gcdata, + normalization.method = "LogNormalize", + scale.factor = 10000) + +gcdata <- Seurat::FindVariableFeatures(object = gcdata, + mean.function = ExpMean, + dispersion.function = LogVMR) + +gcdata <- Seurat::ScaleData(object = gcdata, + do.center = T, + do.scale = F) + +gcdata <- Seurat::RunPCA(object = gcdata, + pc.genes = gcdata@var.genes) + +gcdata <- Seurat::FindNeighbors(gcdata, + reduction = "pca", + dims = 1:5) + +# clustering with resolution of 0.6 +gcdata <- Seurat::FindClusters(object = gcdata, + resolution = 0.6) + +# Now we can add these clusters to the SCE object and re-run slingshot providing cluster labels : +deng_SCE$Seurat_clusters <- as.character(Idents(gcdata)) # go from factor to character + +sce <- slingshot::slingshot(deng_SCE, + clusterLabels = 'Seurat_clusters', + reducedDim = 'PCA', + start.clus = "2") + +# Check how the slingshot object has evolved, it now contains 2 curves: +SlingshotDataSet(sce) + +# Plot PC1 versus PC2 colored by slingshot pseudotime: +PCAplot_slingshot(sce, variable = sce$slingPseudotime_2) + +# Plot Slingshot pseudotime vs cell stage. +ggplot(data.frame(cell_type2 = deng_SCE$cell_type2, + slingPseudotime_1 = sce$slingPseudotime_1), + aes(x = slingPseudotime_1, y = cell_type2, + colour = cell_type2)) + + ggbeeswarm::geom_quasirandom(groupOnX = FALSE) + + theme_classic() + + xlab("Slingshot pseudotime") + ylab("Timepoint") + + ggtitle("Cells ordered by Slingshot pseudotime") + +ggplot(data.frame(cell_type2 = deng_SCE$cell_type2, + slingPseudotime_2 = sce$slingPseudotime_2), + aes(x = slingPseudotime_2, y = cell_type2, + colour = cell_type2)) + + ggbeeswarm::geom_quasirandom(groupOnX = FALSE) + + theme_classic() + + xlab("Slingshot pseudotime") + ylab("Timepoint") + + ggtitle("Cells ordered by Slingshot pseudotime") + +# Particularly the later stages, separation seems to improve. Since we have included the Seurat clustering, we can plot the PCA, with colors according to these clusters: +PCAplot_slingshot(sce, + variable = deng_SCE$Seurat_clusters, + type = 'lineages', + col = 'black', + legend = TRUE) + +PCAplot_slingshot(sce, + variable = deng_SCE$cell_type2, + type = 'lineages', + col = 'black', + legend = TRUE) + +# Slingshot analysis with end clusters indicated: +sce <- slingshot::slingshot(deng_SCE, + clusterLabels = 'Seurat_clusters', + reducedDim = 'PCA', + end.clus = c("0", "3", "5")) ## check which would be the best according to bio + +# PCA plot with curves ending at 3 clusters: +PCAplot_slingshot(sce, + variable = deng_SCE$Seurat_clusters, + type = 'lineages', + col = 'black', + legend = F) + + +# ---- Trajectory with Monocle3 +# Generate a monocle3 object (with class cell_data_set) from our Seurat object: + +# get matrix and filter for minimum number of cells and +# features (the latter is a fix for backward compatibility) +mat_tmp <- seu_int@assays$RNA@counts +seu_tmp <- Seurat::CreateSeuratObject(mat_tmp, min.cells = 3, + min.features = 100) + +feature_names <- as.data.frame(rownames(seu_tmp)) +rownames(feature_names) <- rownames(seu_tmp) +colnames(feature_names) <- "gene_short_name" + +seu_int_monocl <- monocle3::new_cell_data_set(seu_tmp@assays$RNA@counts, + cell_metadata = seu_int@meta.data, + gene_metadata = feature_names) + +# # Preprocess the dataset: +?preprocess_cds +# already ran: (takes a while) +seu_int_monocl <- monocle3::preprocess_cds(seu_int_monocl) + +# elbow plot: +monocle3::plot_pc_variance_explained(seu_int_monocl) + +# Perform UMAP using the implementation in the monocle3 package and its default parameters: +seu_int_monocl <- monocle3::reduce_dimension(seu_int_monocl, reduction_method = "UMAP") + +# Plot the monocle3 UMAP coloring cells according to the cluster ID ran with Seurat: +monocle3::plot_cells(seu_int_monocl, + color_cells_by = "integrated_snn_res.0.3", + cell_size = 1, + show_trajectory_graph = FALSE) +# Slightly different UMAP between Seurat and Monocle3: +p1<-DimPlot(seu_int, group.by = "integrated_snn_res.0.3") +p2<-monocle3::plot_cells(seu_int_monocl, + color_cells_by = "integrated_snn_res.0.3", + cell_size = 0.7, + show_trajectory_graph = FALSE, + label_cell_groups = F) +cowplot::plot_grid(p1,p2, ncol = 2) + +# plot B cell marker: +monocle3::plot_cells(seu_int_monocl, genes = "CD79A", + show_trajectory_graph = FALSE, + cell_size = 1) +# Cluster cells using monocle3‘s clustering function: +?cluster_cells +seu_int_monocl <- monocle3::cluster_cells(seu_int_monocl, resolution=0.00025) +monocle3::plot_cells(seu_int_monocl, label_cell_groups = F) + +# learn graph (i.e. identify trajectory) using monocle3 UMAP and clustering: +seu_int_monocl <- monocle3::learn_graph(seu_int_monocl) +monocle3::plot_cells(seu_int_monocl) + +# Find the CD34+ B-cell cluster in the monocle UMAP. This cluster has a high expressession of CD79A and expresses CD34. +monocle3::plot_cells(seu_int_monocl, genes = c("CD79A", "CD34"), + show_trajectory_graph = FALSE, + cell_size = 0.7, group_label_size = 4) + +# Select the “initial” cells in the B-cell cluster to calculate pseudotime. +# The initial cells in this case are the CD34+ B-cells we have just identified. +# A pop up window will open and you need to click on the “initial” cells (one node per trajectory), then click “Done”. +seu_int_monocl<-monocle3::order_cells(seu_int_monocl) + +monocle3::plot_cells(seu_int_monocl, + color_cells_by = "pseudotime", + label_cell_groups=F, + label_leaves=F, + label_branch_points=FALSE, + graph_label_size=1.5, cell_size = 1) + +# In order to find genes which expression is affected by pseudotime, +# we first have to isolate the B-cell cluster. Therefore, extract +# all cells in the B-cell cluster with the interactive choose_cells function: +seuB <- choose_cells(seu_int_monocl) +class(seuB) + +# Check whether you have selected the right cells: +plot_cells(seuB, show_trajectory_graph = FALSE, cell_size = 1) +# Now we can use the cells in this trajectory to test which genes are affected by the trajectory: +?graph_test # Moran's I test +pr_test <- graph_test(seuB, + cores=4, + neighbor_graph = "principal_graph") +# order by test statistic +pr_test <- pr_test[order(pr_test$morans_test_statistic, + decreasing = TRUE),] +View(pr_test) +# There are some interesting genes in there, for example related to cell cycling (MKI67, CKS2), +# related to B-cell development +# (CD34, MS4A1) and immunoglobulins (IGLL1 and IGLL5). We can plot those in the UMAP: +goi <- c("CD34", "MS4A1", "IGLL1", "IGLL5", + "MKI67", "CKS2") +plot_cells(seuB, label_cell_groups=FALSE, genes = goi, + show_trajectory_graph=FALSE, cell_size = 1) + +# But also against pseudotime: +?monocle3::clusters +table(clusters(seuB)) +# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 +# 0 770 0 0 0 0 464 0 0 0 0 0 0 0 0 0 +seuB@colData$monocle_cluster <- clusters(seuB) +plot_genes_in_pseudotime(subset(seuB, + rowData(seuB)$gene_short_name %in% goi), + min_expr=0.5, color_cells_by = "monocle_cluster")