Description
#Example 1: With Regression
pbmc_data <- Seurat::Read10X(data.dir = FilePath)
pbmc <- CreateSeuratObject(counts = pbmc_data)
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
pbmc <- NormalizeData(pbmc)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst")
s.genes <- cc.genes.updated.2019$s.genes #Cell cycle markers loaded from Seurat
s.genes <- sapply(s.genes, str_to_title)
g2m.genes <- cc.genes.updated.2019$g2m.genes #Separating into S and G2M markers
g2m.genes <- sapply(g2m.genes, str_to_title)
pbmc <- CellCycleScoring(pbmc, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
pbmc <- PercentageFeatureSet(pbmc, pattern = "^mt-", col.name = "percent.mt")
pbmc <- SCTransform(pbmc, vars.to.regress = c("percent.mt", "S.Score", "G2M.Score"), vst.flavor = "v2", method = "glmGamPoi", verbose = T)
pbmc <- RunPCA(pbmc, verbose = T)
pbmc <- RunUMAP(pbmc, dims = 1:30, verbose = T)
pbmc <- FindNeighbors(pbmc, dims = 1:30, verbose = T)
pbmc <- FindClusters(pbmc, verbose = T)
#Example 2: Without Regression
pbmc2 <- CreateSeuratObject(counts = pbmc_data)
pbmc2 <- NormalizeData(pbmc2)
pbmc2 <- FindVariableFeatures(pbmc2, selection.method = "vst")
pbmc2 <- CellCycleScoring(pbmc2, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
pbmc2 <- PercentageFeatureSet(pbmc2, pattern = "^mt-", col.name = "percent.mt")
pbmc2 <- SCTransform(pbmc2, vst.flavor = "v2", method = "glmGamPoi", verbose = T)
pbmc2 <- RunPCA(pbmc2, verbose = T)
pbmc2 <- RunUMAP(pbmc2, dims = 1:30, verbose = T)
pbmc2 <- FindNeighbors(pbmc2, dims = 1:30, verbose = T)
pbmc2 <- FindClusters(pbmc2, verbose = T)
Plot1 <- DimPlot(pbmc, label = TRUE) + ggtitle("With MT and CC Regression")
Plot2 <- DimPlot(pbmc2, label = TRUE) + ggtitle("Without MT and CC Regression")
Plot1 + Plot2
identical(pbmc@meta.data, pbmc2@meta.data)
pbmc_data <- Seurat::Read10X(data.dir = FilePath)
pbmc <- CreateSeuratObject(counts = pbmc_data)
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
pbmc <- NormalizeData(pbmc)
Normalizing layer: counts
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst")
Finding variable features for layer counts
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
s.genes <- cc.genes.updated.2019$s.genes #Cell cycle markers loaded from Seurat
s.genes <- sapply(s.genes, str_to_title)
g2m.genes <- cc.genes.updated.2019$g2m.genes #Separating into S and G2M markers
g2m.genes <- sapply(g2m.genes, str_to_title)
pbmc <- CellCycleScoring(pbmc, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
pbmc <- PercentageFeatureSet(pbmc, pattern = "^mt-", col.name = "percent.mt")
pbmc <- SCTransform(pbmc, vars.to.regress = c("percent.mt", "S.Score", "G2M.Score"), vst.flavor = "v2", method = "glmGamPoi", verbose = T)
Running SCTransform on assay: RNA
Running SCTransform on layer: counts
vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
Variance stabilizing transformation of count matrix of size 22452 by 8924
Model formula is y ~ log_umi
Get Negative Binomial regression parameters per gene
Using 2000 genes, 5000 cells
Found 93 outliers - those will be ignored in fitting/regularization step
Second step: Get residuals using fitted parameters for 22452 genes
Computing corrected count matrix for 22452 genes
Calculating gene attributes
Wall clock passed: Time difference of 23.15094 secs
Determine variable features
Regressing out percent.mt, S.Score, G2M.Score
|=================================================================================================================================================================================================================================================================================| 100%
Centering data matrix
|=================================================================================================================================================================================================================================================================================| 100%
Getting residuals for block 1(of 2) for counts dataset
Getting residuals for block 2(of 2) for counts dataset
Centering data matrix
|=================================================================================================================================================================================================================================================================================| 100%
Finished calculating residuals for counts
Set default assay to SCT
pbmc <- RunPCA(pbmc, verbose = T)
PC_ 1
Positive: Col1a1, Col1a2, Col3a1, Tnfaip6, Dlk1, Cxcl1, Cxcl2, Meg3, Ccl2, Sparc
Itm2a, Tnc, Cdkn1c, Mfap4, Lum, Gm32647, Fstl1, Aspn, Dcn, Mest
Col6a1, Col5a2, Dlc1, Thbs1, Ebf1, Ccl7, Nrk, Igf2, Pcolce, Ccn1
Negative: Krt14, Krt15, Krt5, Krt1, Ly6d, Krt10, Lgals7, Cstdc5, Stfa3, Krt17
Krtdap, Perp, Dmkn, Stfa1, Sfn, Dsp, Srgn, Sox6, Grip1, Cpa3
Hdc, Cma1, Lgals3, Tpsb2, mt-Cytb, Calm4, mt-Atp6, Fcer1g, Bmpr1b, S100a14
PC_ 2
Positive: Krt14, Krt15, Krt1, Krt5, Ly6d, Krt10, Lgals7, Stfa3, Cstdc5, Krt17
Krtdap, Perp, Dmkn, Sfn, Dsp, Grip1, Fabp5, Sox6, Stfa1, Calm4
Cxcl14, S100a14, Bmpr1b, Fras1, Fgfbp1, Ptma, Tspear, Trp63, Dsc3, Serpinb5
Negative: Srgn, Fcer1g, Hdc, Cpa3, Cma1, Tpsb2, Serpinb1a, Tyrobp, Plek, Slc6a4
Csf2rb, Cxcl2, Gata2, Fxyd5, Mcpt4, C1qb, Calca, Slco2b1, Mrc1, Cd14
Cyp11a1, Pf4, C1qc, Slc18a2, C1qa, F13a1, Ccl7, Otulinl, Ccl4, Rac2
PC_ 3
Positive: C1qb, Pf4, Mrc1, C1qc, C1qa, F13a1, Cxcl2, Cd14, Ccl12, Cd86
Ccl4, C5ar1, Il1b, Csf1r, Apoe, Ccl7, Ccl3, Slfn2, Lyz2, Fcrls
Lyve1, Selenop, Aoah, Ptprj, Ifi207, Nlrp3, Cd83, Hmox1, Ms4a6c, Ms4a7
Negative: Cpa3, Cma1, Hdc, Tpsb2, Serpinb1a, Slc6a4, Gata2, Srgn, Csf2rb, Calca
Mcpt4, Cyp11a1, Slc18a2, Hs6st2, Rab27b, Itk, Slco2b1, Hs3st1, Fdx1, Il13
Otulinl, St6galnac3, Il4, Il1rl1, Fxyd5, Ctla2a, Mrgprb1, Csf2, Edil3, Adora3
PC_ 4
Positive: Krt1, Krt10, Ly6d, Stfa3, Cstdc5, Krtdap, Col1a1, Col1a2, Stfa1, Lgals7
Tnfaip6, Krt15, Dlk1, Perp, Fabp5, Dmkn, Lgals3, Krt14, Col3a1, Cxcl2
Krt5, Npl, Sfn, Itm2a, Cxcl1, Mfap4, Dsp, Dsg1a, Calm4, Krt16
Negative: Dct, Pmel, Ptgds, Tyr, Cpne4, Trps1, Pax3, Chsy3, Fmn1, Mlana
Pde3a, Ednrb, Col23a1, Rarb, Robo2, Acta2, Ebf1, Aff3, Rgs5, Fstl4
Synpr, Ptprj, Prex2, Sobp, Ctnna3, Myh11, Myl9, Syt4, Tagln, Mcoln3
PC_ 5
Positive: Dlk1, Dct, Pmel, Ptgds, Tyr, Itm2a, Mest, Cpne4, Chsy3, Mfap4
Pax3, Nrk, Cdkn1c, Clec3b, Aspn, Postn, Mfap5, Fmn1, Acta2, Tmeff2
Mlana, Igfbp7, Dpysl3, Col14a1, Ednrb, Krt1, Rgs5, Igf2, Krt10, Myh11
Negative: Gm32647, Robo2, Trps1, Col3a1, Col1a1, Egfl6, Cntn4, Cxcl2, Col1a2, Lum
Wif1, Slit3, Twist2, Ccl2, Grem2, Tmem132c, Tnc, Col23a1, Zfp536, Lsamp
Twist1, Cped1, Rspo3, Lgals1, St6galnac3, Cntn5, Itih5, Pth1r, Hecw2, Crabp1
pbmc <- RunUMAP(pbmc, dims = 1:30, verbose = T)
16:41:19 UMAP embedding parameters a = 0.9922 b = 1.112
Found more than one class "dist" in cache; using the first, from namespace 'spam'
Also defined by ‘BiocGenerics’
16:41:19 Read 8924 rows and found 30 numeric columns
16:41:19 Using Annoy for neighbor search, n_neighbors = 30
Found more than one class "dist" in cache; using the first, from namespace 'spam'
Also defined by ‘BiocGenerics’
16:41:19 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:41:19 Writing NN index file to temp file /var/folders/r7/psn_5h9s3ql986wjn3b1sxj90yv6n8/T//RtmpQyNO0H/file147d22f35981e
16:41:19 Searching Annoy index using 1 thread, search_k = 3000
16:41:21 Annoy recall = 100%
16:41:21 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
16:41:22 Initializing from normalized Laplacian + noise (using RSpectra)
16:41:22 Commencing optimization for 500 epochs, with 351934 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:41:30 Optimization finished
pbmc <- FindNeighbors(pbmc, dims = 1:30, verbose = T)
Computing nearest neighbor graph
Computing SNN
pbmc <- FindClusters(pbmc, verbose = T)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 8924
Number of edges: 274391
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9218
Number of communities: 25
Elapsed time: 0 seconds
pbmc2 <- CreateSeuratObject(counts = pbmc_data)
pbmc2 <- NormalizeData(pbmc2)
Normalizing layer: counts
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
pbmc2 <- FindVariableFeatures(pbmc2, selection.method = "vst")
Finding variable features for layer counts
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
pbmc2 <- CellCycleScoring(pbmc2, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
pbmc2 <- PercentageFeatureSet(pbmc2, pattern = "^MT-", col.name = "percent.mt")
pbmc2 <- SCTransform(pbmc2, vst.flavor = "v2", method = "glmGamPoi", verbose = T)
Running SCTransform on assay: RNA
Running SCTransform on layer: counts
vst.flavor='v2' set. Using model with fixed slope and excluding poisson genes.
Variance stabilizing transformation of count matrix of size 22452 by 8924
Model formula is y ~ log_umi
Get Negative Binomial regression parameters per gene
Using 2000 genes, 5000 cells
Found 93 outliers - those will be ignored in fitting/regularization step
Second step: Get residuals using fitted parameters for 22452 genes
Computing corrected count matrix for 22452 genes
Calculating gene attributes
Wall clock passed: Time difference of 22.24379 secs
Determine variable features
Centering data matrix
|=================================================================================================================================================================================================================================================================================| 100%
Getting residuals for block 1(of 2) for counts dataset
Getting residuals for block 2(of 2) for counts dataset
Centering data matrix
|=================================================================================================================================================================================================================================================================================| 100%
Finished calculating residuals for counts
Set default assay to SCT
pbmc2 <- RunPCA(pbmc2, verbose = T)
PC_ 1
Positive: Col1a1, Col1a2, Col3a1, Tnfaip6, Dlk1, Cxcl1, Cxcl2, Meg3, Ccl2, Sparc
Itm2a, Tnc, Cdkn1c, Mfap4, Lum, Gm32647, Fstl1, Aspn, Dcn, Mest
Col6a1, Col5a2, Dlc1, Thbs1, Ebf1, Ccl7, Nrk, Igf2, Pcolce, Ccn1
Negative: Krt14, Krt15, Krt5, Krt1, Ly6d, Krt10, Lgals7, Cstdc5, Stfa3, Krt17
Krtdap, Perp, Dmkn, Stfa1, Sfn, Dsp, Srgn, Sox6, Grip1, Cpa3
Hdc, Cma1, Lgals3, Tpsb2, mt-Cytb, Calm4, mt-Atp6, Fcer1g, Bmpr1b, S100a14
PC_ 2
Positive: Krt14, Krt15, Krt1, Krt5, Ly6d, Krt10, Lgals7, Stfa3, Cstdc5, Krt17
Krtdap, Perp, Dmkn, Sfn, Dsp, Grip1, Fabp5, Sox6, Stfa1, Calm4
Cxcl14, S100a14, Bmpr1b, Fras1, Fgfbp1, Ptma, Tspear, Trp63, Dsc3, Serpinb5
Negative: Srgn, Fcer1g, Hdc, Cpa3, Cma1, Tpsb2, Serpinb1a, Tyrobp, Plek, Slc6a4
Csf2rb, Cxcl2, Gata2, Fxyd5, Mcpt4, C1qb, Calca, Slco2b1, Mrc1, Cd14
Cyp11a1, Pf4, C1qc, Slc18a2, C1qa, F13a1, Ccl7, Otulinl, Ccl4, Rac2
PC_ 3
Positive: C1qb, Pf4, Mrc1, C1qc, C1qa, F13a1, Cxcl2, Cd14, Ccl12, Cd86
Ccl4, C5ar1, Il1b, Csf1r, Apoe, Ccl7, Ccl3, Slfn2, Lyz2, Fcrls
Lyve1, Selenop, Aoah, Ptprj, Ifi207, Nlrp3, Cd83, Hmox1, Ms4a6c, Ms4a7
Negative: Cpa3, Cma1, Hdc, Tpsb2, Serpinb1a, Slc6a4, Gata2, Srgn, Csf2rb, Calca
Mcpt4, Cyp11a1, Slc18a2, Hs6st2, Rab27b, Itk, Slco2b1, Hs3st1, Fdx1, Il13
Otulinl, St6galnac3, Il4, Il1rl1, Fxyd5, Ctla2a, Mrgprb1, Csf2, Edil3, Adora3
PC_ 4
Positive: Krt1, Krt10, Ly6d, Stfa3, Cstdc5, Krtdap, Col1a1, Col1a2, Stfa1, Lgals7
Tnfaip6, Krt15, Dlk1, Perp, Fabp5, Dmkn, Lgals3, Krt14, Col3a1, Cxcl2
Krt5, Npl, Sfn, Itm2a, Cxcl1, Mfap4, Dsp, Dsg1a, Calm4, Krt16
Negative: Dct, Pmel, Ptgds, Tyr, Cpne4, Trps1, Pax3, Chsy3, Fmn1, Mlana
Pde3a, Ednrb, Col23a1, Rarb, Robo2, Acta2, Ebf1, Aff3, Rgs5, Fstl4
Synpr, Ptprj, Prex2, Sobp, Ctnna3, Myh11, Myl9, Syt4, Tagln, Mcoln3
PC_ 5
Positive: Dlk1, Dct, Pmel, Ptgds, Tyr, Itm2a, Mest, Cpne4, Chsy3, Mfap4
Pax3, Nrk, Cdkn1c, Clec3b, Aspn, Postn, Mfap5, Fmn1, Acta2, Tmeff2
Mlana, Igfbp7, Dpysl3, Col14a1, Ednrb, Krt1, Rgs5, Igf2, Krt10, Myh11
Negative: Gm32647, Robo2, Trps1, Col3a1, Col1a1, Egfl6, Cntn4, Cxcl2, Col1a2, Lum
Wif1, Slit3, Twist2, Ccl2, Grem2, Tmem132c, Tnc, Col23a1, Zfp536, Lsamp
Twist1, Cped1, Rspo3, Lgals1, St6galnac3, Cntn5, Itih5, Pth1r, Hecw2, Crabp1
pbmc2 <- RunUMAP(pbmc2, dims = 1:30, verbose = T)
16:42:27 UMAP embedding parameters a = 0.9922 b = 1.112
Found more than one class "dist" in cache; using the first, from namespace 'spam'
Also defined by ‘BiocGenerics’
16:42:27 Read 8924 rows and found 30 numeric columns
16:42:27 Using Annoy for neighbor search, n_neighbors = 30
Found more than one class "dist" in cache; using the first, from namespace 'spam'
Also defined by ‘BiocGenerics’
16:42:27 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:42:27 Writing NN index file to temp file /var/folders/r7/psn_5h9s3ql986wjn3b1sxj90yv6n8/T//RtmpQyNO0H/file147d26795f1c8
16:42:27 Searching Annoy index using 1 thread, search_k = 3000
16:42:29 Annoy recall = 100%
16:42:29 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
16:42:30 Initializing from normalized Laplacian + noise (using RSpectra)
16:42:30 Commencing optimization for 500 epochs, with 351934 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:42:38 Optimization finished
pbmc2 <- FindNeighbors(pbmc2, dims = 1:30, verbose = T)
Computing nearest neighbor graph
Computing SNN
pbmc2 <- FindClusters(pbmc2, verbose = T)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 8924
Number of edges: 274391
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9218
Number of communities: 25
Elapsed time: 0 seconds
Plot1 <- DimPlot(pbmc, label = TRUE) + ggtitle("With MT and CC Regression")
Plot2 <- DimPlot(pbmc2, label = TRUE) + ggtitle("Without MT and CC Regression")
Plot1 + Plot2
identical(pbmc@meta.data, pbmc2@meta.data)
[1] TRUE
# insert reproducible example here