diff --git a/MAESTRO/Snakemake/scATAC/rules/sc_atac_annotation.smk b/MAESTRO/Snakemake/scATAC/rules/sc_atac_annotation.smk index 7ef3b7f..09275fe 100644 --- a/MAESTRO/Snakemake/scATAC/rules/sc_atac_annotation.smk +++ b/MAESTRO/Snakemake/scATAC/rules/sc_atac_annotation.smk @@ -19,7 +19,7 @@ rule scatac_analysis: genescore = "{sample}_gene_score.h5", outpre = "{sample}", counts = "../../QC/{sample}/{sample}_filtered_peak_count.h5", - fraggz = "../../mapping/{sample}/fragments_corrected_dedup_count.tsv.gz", + fraggz = "../../Mapping/{sample}/fragments_corrected_dedup_count.tsv.gz", giggleannotation = config["giggleannotation"], species = config["species"], signature = config["signature"], diff --git a/MAESTRO/scATAC_FragmentReshape.py b/MAESTRO/scATAC_FragmentReshape.py index 0b2fdd7..48249fa 100644 --- a/MAESTRO/scATAC_FragmentReshape.py +++ b/MAESTRO/scATAC_FragmentReshape.py @@ -34,7 +34,5 @@ def CommandLineParser(): items = line.strip().split("\t") if str(items[0]) in chr_list : frag_out.write("\t".join(items[0:]) + "\n") - else: - print("Region in" + items[0] + "is excluded." ) end_time = time.time() print("End:", end_time-start_time) diff --git a/MAESTRO/scATAC_HTMLReport_multi.py b/MAESTRO/scATAC_HTMLReport_multi.py index 16179f4..658590e 100644 --- a/MAESTRO/scATAC_HTMLReport_multi.py +++ b/MAESTRO/scATAC_HTMLReport_multi.py @@ -25,6 +25,9 @@ def CommandLineParser(): help = "The format of input files. DEFAULT: fastq.") group_input.add_argument("--input-path", dest = "input_path", type = str, default = "", help = "Directory where input files are stored") + group_input.add_argument("--mapping", dest = "mapping", type = str, default = "chromap", + choices = ["chromap", "minimap2"], + help = "Mapping Tools of scATAC-seq. DEFAULT: chromap.") group_input.add_argument("--species", dest = "species", default = "GRCh38", choices = ["GRCh38", "GRCm38"], type = str, help = "Species (GRCh38 for human and GRCm38 for mouse). DEFAULT: GRCh38.") @@ -49,7 +52,7 @@ def main(): species = myparser.species platform = myparser.platform input_format = myparser.input_format - + mapping = myparser.mapping try: os.makedirs(directory) except OSError: @@ -99,7 +102,7 @@ def main(): td_list.append(items_str) td_str = "\n".join(td_list) - if input_format != "fragments": + if input_format != "fragments" and mapping != "chromap": if input_format == "fastq": inputformat = "FASTQ Path" else: diff --git a/R/ATACRunSeurat.R b/R/ATACRunSeurat.R index 84e25bc..a912941 100644 --- a/R/ATACRunSeurat.R +++ b/R/ATACRunSeurat.R @@ -1,26 +1,26 @@ #' Clustering analysis for scATAC-seq data using Seurat #' -#' Clustering analysis for scATAC-seq dataset using Seurat (version >=4.0.0) and Signac (version >= 1.1.1). Including normalization, LSI/PCA dimension reduction, clustering and UMAP visualization. To run UMAP analysis, you must first install the umap-learn python package (e.g. via \code{pip install umap-learn}). +#' Clustering analysis for scATAC-seq dataset using Seurat (version >=4.0.0) and Signac (version >= 1.1.1). Including normalization, LSI/PCA dimension reduction, clustering and UMAP visualization. To run UMAP analysis, you must first install the umap-learn python package (e.g. via \code{pip install umap-learn}). #' #' @docType methods #' @name ATACRunSeurat #' @rdname ATACRunSeurat #' #' @param inputMat Input unnormalized count matrix, with peaks as rows and cells as columns. Rownames should be like "chromosome_peakstart_peakend", -#' for example "chr10_100020591_100020841". +#' for example "chr10_100020591_100020841". #' @param type Type of the input matrix. Default is "matrix". Set to "object" if the input is Seurat object. #' @param project Output project name. Default is "MAESTRO.scATAC.Seurat". #' @param orign.ident Orginal identity for the input cells. If supplied, should keep the same order with the column name of the peak x cell matrix. #' @param method Methods for dimension reduction, available options are LSI and PCA. Default is "LSI". -#' @param min.c Minimum number of cells required for a peak. Will exclude the peaks from input matrix if they only identified in +#' @param min.c Minimum number of cells required for a peak. Will exclude the peaks from input matrix if they only identified in #' less than \code{min.c} cells. Default is 10. See \code{\link{CreateSeuratObject}} for details. #' @param min.p Minimum number of peaks required for a cell. Will exclude the cells from input matrix if less than \code{min.p} #' peaks are deteced in one cell. Default is 100. See \code{\link{CreateSeuratObject}} for details. #' @param dims.use Number of dimensions used for UMAP analysis. Default is 1:30, use the first 30 PCs. -#' @param cluster.res Value of the clustering resolution parameter. Please use a value above (below) 1.0 +#' @param cluster.res Value of the clustering resolution parameter. Please use a value above (below) 1.0 #' if users want to obtain a larger (smaller) number of communities. Default is 0.6. #' @param only.pos If seting true, only positive peaks will be output. Default is False. -#' @param peaks.test.use Denotes which test to use to identify differnetial peaks. Default is "presto", a fast version of Wilcoxon Rank Sum test. +#' @param peaks.test.use Denotes which test to use to identify differnetial peaks. Default is "presto", a fast version of Wilcoxon Rank Sum test. #' Available options are "wilcox" and "t". See \code{\link{FindAllMarkersMAESTRO}} for details. #' @param peaks.cutoff Identify differential peaks with adjusted p.value less than \code{peaks.cutoff} as cluster specific peaks #' @param peaks.pct Only test peaks that are detected in a minimum fraction of min.pct cells in either of the two populations. Meant to speed up the function by not testing peaks that are very infrequently detected Default is 0.1 @@ -49,23 +49,23 @@ #' @importFrom Gmisc fastDoCall #' @export -ATACRunSeurat <- function(inputMat, type = "matrix", project = "MAESTRO.scATAC.Seurat", orign.ident = NULL, - min.c = 10, min.p = 100, method = "LSI", dims.use = 1:30, - cluster.res = 0.6, only.pos = FALSE, peaks.test.use = "presto", - peaks.cutoff = 1E-5, peaks.pct = 0.1, peaks.logfc = 0.2, - runtfidf.args = list(), runsvd.args = list(), runpca.args = list(), - findneighbors.args = list(), findclusters.args = list(),...) +ATACRunSeurat <- function(inputMat, type = "matrix", project = "MAESTRO.scATAC.Seurat", orign.ident = NULL, + min.c = 10, min.p = 100, method = "LSI", dims.use = 1:30, + cluster.res = 0.6, only.pos = FALSE, peaks.test.use = "presto", + peaks.cutoff = 1E-5, peaks.pct = 0.1, peaks.logfc = 0.2, + runtfidf.args = list(), runsvd.args = list(), runpca.args = list(), + findneighbors.args = list(), findclusters.args = list(),outdir = ".", ...) { if(type == "matrix"){SeuratObj <- CreateSeuratObject(inputMat, project = project, min.cells = min.c, min.features = min.p, assay = "ATAC")} if(type == "object"){SeuratObj <- inputMat} - if(method == "LSI"){ + if(method == "LSI"){ #============ LSI ============ message("LSI analysis ...") - SeuratObj <- fastDoCall("RunTFIDF", c(object = SeuratObj, runtfidf.args)) + SeuratObj <- fastDoCall("RunTFIDF", c(object = SeuratObj, runtfidf.args)) SeuratObj <- FindTopFeatures(SeuratObj, min.cutoff = 'q0') - SeuratObj <- fastDoCall("RunSVD", c(object = SeuratObj, runsvd.args)) - + SeuratObj <- fastDoCall("RunSVD", c(object = SeuratObj, runsvd.args)) + #============ UMAP ============ message("UMAP analysis ...") #SeuratObj <- fastDoCall("RunUMAP", c(object = SeuratObj, dims = dims.use, reduction = "lsi", runumap.args)) @@ -75,8 +75,8 @@ ATACRunSeurat <- function(inputMat, type = "matrix", project = "MAESTRO.scATAC.S # SeuratObj <- FindNeighbors(object = SeuratObj, reduction = "lsi", dims = dims.use) # SeuratObj <- FindClusters(object = SeuratObj, resolution = cluster.res) p1 <- DimPlot(object = SeuratObj, pt.size = 0.5, label = TRUE) - ggsave(file.path(paste0(project, "_cluster.png")), p1, width=5, height=4) - + ggsave(file.path(outdir, paste0(project, "_cluster.png")), p1, width=5, height=4) + #============ DE analysis ============ message("Identify cluster specific peaks ...") SeuratObj <- NormalizeData(SeuratObj, normalization.method = "LogNormalize", scale.factor = 10000) @@ -84,29 +84,29 @@ ATACRunSeurat <- function(inputMat, type = "matrix", project = "MAESTRO.scATAC.S cluster.peaks <- FindAllMarkersMAESTRO(object = SeuratObj, min.pct = peaks.pct, logfc.threshold = peaks.logfc, test.use = peaks.test.use, only.pos = only.pos) cluster.peaks <- cluster.peaks[cluster.peaks$p_val_adj