diff --git a/DESCRIPTION b/DESCRIPTION index f88a9ac..271e34e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RNAModR Type: Package Title: Functional Analysis of RNA Modifications -Version: 0.1.1 +Version: 0.2.0 Author: Maurits Evers Maintainer: Maurits Evers Description: Transcriptome-wide analysis of RNA modifications. @@ -16,6 +16,7 @@ Imports: methods, RSQLite, rtracklayer, + S4Vectors, stats, utils Depends: diff --git a/NAMESPACE b/NAMESPACE index a90aa5c..5e8745f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,7 +2,6 @@ export(AddAlpha) export(BuildTx) -export(BuildTxTest) export(CheckClass) export(CheckClassTxLocConsistency) export(CheckClassTxLocRef) @@ -38,15 +37,12 @@ export(PlotSpatialRatio) export(PlotTxSecLength) export(PlotTxSecLength.bean) export(ReadBED) -export(ReadDBN) export(SmartMap) -export(SmartMap.ToTx) export(SubsampleTxLoc) export(TxLoc2GRangesList) export(Unfactor) -export(WriteFeatToBED) -export(WriteTxLocToBED) -export(WriteTxLocToCSV) +export(WriteBED) +export(WriteCSV) exportClasses(txLoc) exportMethods(GetId) exportMethods(GetLoci) @@ -63,6 +59,7 @@ import(IRanges) import(RSQLite) import(methods) importFrom(Biostrings,vmatchPattern) +importFrom(S4Vectors,DataFrame) importFrom(beanplot,beanplot) importFrom(gplots,venn) importFrom(grDevices,col2rgb) @@ -82,7 +79,6 @@ importFrom(graphics,plot) importFrom(graphics,points) importFrom(graphics,polygon) importFrom(graphics,text) -importFrom(rtracklayer,export) importFrom(stats,binomial) importFrom(stats,confint.default) importFrom(stats,fisher.test) diff --git a/R/IO.R b/R/IO.R index 04ee3bf..59ec356 100644 --- a/R/IO.R +++ b/R/IO.R @@ -33,7 +33,7 @@ ReadBED <- function(file, collapseRange = TRUE) { if (!file.exists(file)) { stop(sprintf("File %s doesn't exist.", file)) } - bed <- read.table(file, header = FALSE) + bed <- read.table(file, header = FALSE, stringsAsFactors = FALSE) if (ncol(bed) != 6) { stop("[ERROR] File must be a 6 column BED file.") } @@ -67,56 +67,26 @@ ReadBED <- function(file, collapseRange = TRUE) { seqnames = bed$chr, ranges = IRanges(bed$start + 1, bed$end), strand = bed$strand, - score = bed$score, + score = as.numeric(bed$score), id = bed$id) } -#' Write a \code{list} of \code{GRangesList} to a BED file. -#' -#' Write a \code{list} of \code{GRangesList} to a BED file. -#' One file per transcript region is generated. -#' -#' @param txFeatures A \code{list} of \code{GRangesList} objects. -#' -#' @author Maurits Evers, \email{maurits.evers@@anu.edu.au} -#' @keywords internal -#' -#' @importFrom rtracklayer export -#' -#' @export -WriteFeatToBED <- function(txFeatures) { - # Write list of GRangesList transcript features to BED file. - # - # Args: - # txFeatures: List of GRangesList transcript features. - # - # Returns: - # NULL - for (i in 1:length(txFeatures)) { - rtracklayer::export(txFeatures[[i]], sprintf("%s.bed", names(txFeatures)[i])) - } -} - - #' Write \code{txLoc} object to a BED file. #' #' Write \code{txLoc} object to a BED file. See 'Details'. #' -#' The function writes entries from a \code{txLoc} object to a 6-column -#' BED file (BED6). Note that this process is not "splice-aware", i.e. -#' if an entry spans an intron the BED entry gives the left and right-most -#' genomic coordinate of the feature. If \code{file = NULL}, entries will -#' be written to \code{sites.bed}. If \code{noChrName = TRUE}, chromosome -#' names in column 1 of the BED file will be written without "chr". +#' The function writes entries from a \code{txLoc} object to a 6-column BED +#' file (BED6). Note that this process is not "splice-aware", i.e. if an entry +#' spans an intron the BED entry gives the left and right-most genomic +#' coordinate of the feature. If \code{file = NULL}, entries will be written to +#' \code{sites.bed}. #' -#' @param locus A \code{txLoc} object. +#' @param txLoc A \code{txLoc} object. #' @param file A character string; specifies the filename of the output #' BED file. If \code{NULL}, then \code{file = "sites.bed"}; default is #' \code{NULL}. -#' @param noChrName A logical scalar; if \code{TRUE}, chromosome names -#' will be written without "chr"; default is \code{FALSE}. #' #' @author Maurits Evers, \email{maurits.evers@@anu.edu.au} #' @@ -124,113 +94,114 @@ WriteFeatToBED <- function(txFeatures) { #' \dontrun{ #' bedFile <- system.file("extdata", #' "miCLIP_m6A_Linder2015_hg38.bed", -#' package = "RNAModR"); -#' sites <- ReadBED(bedFile); -#' txSites <- SmartMap(sites, id = "m6A", refGenome = "hg38"); -#' WriteTxLocToBED(txSites); +#' package = "RNAModR") +#' sites <- ReadBED(bedFile) +#' txSites <- SmartMap(sites, id = "m6A", refGenome = "hg38") +#' WriteBED(txSites) #' } #' #' @importFrom utils write.table #' #' @export -WriteTxLocToBED <- function(locus, - file = NULL, - noChrName = FALSE) { - # Save transcript features in BED file. - # - # Args: - # locus: List of dataframes with mapped features aross different - # transcript regions - # file: Filename. If file is NULL then output is written to site.bed. - # Default is NULL. - # formatChr: Format of chr column in BED file. Default is "noChrName". - # - # Returns: - # NULL - CheckClass(locus, "txLoc") - id <- GetId(locus) - locus <- GetLoci(locus) - BED <- vector() - for (i in 1:length(locus)) { - feat <- locus[[i]][, c("CHR", "START", "STOP", "ID", "SCORE", "STRAND")] - if (is.numeric(feat[, 2]) && is.numeric(feat[, 3])) { - if (noChrName == TRUE) { - feat[, 1] <- gsub("chr", "", feat[ ,1]) - } - feat[, 2] <- feat[ ,2] - 1 - feat[, 4] <- sprintf("%s|%s|%s", - feat[, 4], - locus[[i]][, c("GENE_ENSEMBL")], - names(locus)[i]) - BED <- rbind(BED, feat) - } else { - ss <- sprintf("Skipping %s.\n", names(locus)[i]) - warning(ss) - } - } - BED <- BED[order(BED[, 1], BED[, 2]), ] +WriteBED <- function(txLoc, file = NULL) { + + # Sanity checks + CheckClass(txLoc, "txLoc") + + # Quieten R CMD check concerns regarding "no visible binding for global + # variable ..." + locus_in_genome <- id <- tx_region <- tx_refseq <- NULL + + # Get loci + loci <- GetLoci(txLoc) + + # Extract BED6 entries from txLoc object and row-bind into `data.frame` + lst <- lapply(loci, function(locus) + with(locus, data.frame( + seqnames = seqnames(locus_in_genome), + start = start(locus_in_genome) - 1L, + end = end(locus_in_genome), + id = paste(GetId(txLoc), id, tx_region, tx_refseq, sep = "|"), + score = score, + strand = strand(locus_in_genome)))) + df <- do.call(rbind, lst) + + # Sort entries + df <- df[order(df[, 1], df[, 2]), ] + + # Output file name if (is.null(file)) { - if (nchar(id) > 0) { - file <- sprintf("sites_%s.bed", id) + if (nchar(GetId(txLoc)) > 0) { + file <- sprintf("sites_%s.bed", GetId(txLoc)) } else { file <- "sites.bed" } } - write.table(BED, file = file, - sep = "\t", quote = FALSE, - col.names = FALSE, row.names = FALSE) + + # Write to file + write.table( + df, + file = file, + sep = "\t", + quote = FALSE, + col.names = FALSE, + row.names = FALSE) + cat(sprintf("Output in file %s.\n", file)) + } #' Write txLoc object to CSV file. #' -#' Write a txLoc object to a/multiple comma-separated-values file(s). +#' Write a \code{txLoc} object to a comma-separated-values file. #' -#' @param locus A \code{txLoc} object. +#' @param txLoc A \code{txLoc} object. #' @param file Filename of output CSV file. If NULL then file = "sites.csv". #' @param withSeq If TRUE then include full sequence. Default is FALSE. -#' @param withGC If TRUE then include GC content. Default is FALSE. +#' @param withGC If TRUE then include GC content. Default is FALSE. Unused. #' #' @author Maurits Evers, \email{maurits.evers@@anu.edu.au} -#' @keywords internal #' #' @importFrom utils write.csv #' #' @export -WriteTxLocToCSV <- function(locus, - file = NULL, - withSeq = FALSE, - withGC = FALSE) { - # Save list of dataframes with mapped features across different - # transcript regions to csv file - # - # Args: - # locus: List of dataframes with mapped features across different - # transcript regions - # file: Filename. If file is NULL then output is written to site.csv. - # Default is NULL. - # - # Returns: - # NULL - CheckClass(locus, "txLoc") - locus <- GetLoci(locus) - CSV <- vector() - selCol <- c("GENE_REGION", "REGION_TXWIDTH", "GENE_REFSEQ", - "GENE_ENTREZ", "GENE_SYMBOL", "GENE_ENSEMBL", - "GENE_UNIGENE", "ID", "CHR", "START", "STOP", "STRAND") - if (withSeq) { - selCol <- c(selCol, "REGION_SEQ") - } - for (i in 1:length(locus)) { - data <- locus[[i]][, selCol] - CSV <- rbind(CSV, data) - } - CSV <- CSV[order(CSV[, 8], CSV[, 9]), ] +WriteCSV <- function(txLoc, + file = NULL, + withSeq = FALSE, + withGC = FALSE) { + + # Sanity checks + CheckClass(txLoc, "txLoc") + + # Get loci + loci <- GetLoci(txLoc) + + # Convert entries in columns to `data.frame` + lst <- lapply(loci, function(locus) { + if (!withSeq) + as.data.frame(locus[, -match("tx_region_sequence", names(locus))]) + else + as.data.frame(locus) + }) + df <- do.call(rbind, lst) + + # Output file name if (is.null(file)) { - file <- "sites.csv" + if (nchar(GetId(txLoc)) > 0) { + file <- sprintf("sites_%s.csv", GetId(txLoc)) + } else { + file <- "sites.csv" + } } - write.csv(CSV, file = file, - row.names = FALSE, quote = FALSE) + + # Write to file + write.csv( + df, + file = file, + row.names = FALSE, + quote = FALSE) + cat(sprintf("Output in file %s.\n", file)) + } @@ -251,8 +222,6 @@ WriteTxLocToCSV <- function(locus, #' @return A \code{dataframe} object. See 'Details'. #' #' @author Maurits Evers, \email{maurits.evers@@anu.edu.au} -#' -#' @export ReadDBN <- function(file) { if (!file.exists(file)) { ss <- sprintf("Could not open %s.", file) diff --git a/R/buildTxMethods.R b/R/buildTxMethods.R index a61dfac..ad1a490 100644 --- a/R/buildTxMethods.R +++ b/R/buildTxMethods.R @@ -210,24 +210,23 @@ GetTxDb <- function(genomeVersion = "hg38", #' This is a low-level function that is being called from #' \code{BuildTx}. #' -#' The function extracts the genome assembly version from -#' the \code{TxDb} object, and loads the suitable genome wide -#' annotation package. For example, if \code{TxDb} is based -#' on \code{"hg38"}, the function loads \code{org.Hs.eg.db}. -#' Various IDs (Ensembl, Unigene, gene symbols) are extracted -#' from the annotation package by mapping RefSeq IDs from -#' \code{TxDb} via \code{mapIds} from the \code{AnnotationDbi} -#' package. Please note that column entries are not necessarily -#' unique: For example, two different Ensembl IDs might have the -#' same Entrez ID. The return object is a \code{data.frame} with -#' the following columns: +#' The function extracts the genome assembly version from the \code{TxDb} +#' object, and loads the suitable genome wide annotation package. For example, +#' if \code{TxDb} is based on \code{"hg38"}, the function loads +#' \code{org.Hs.eg.db}. +#' Various IDs (Ensembl, gene symbols) are extracted from the annotation +#' package by mapping RefSeq IDs from \code{TxDb} via \code{mapIds} from +#' the \code{AnnotationDbi} package. Please note that column entries are not +#' necessarily unique: For example, two different Ensembl IDs might have the +#' same Entrez ID. The return object is a \code{data.frame} with the following +#' columns: #' \itemize{ -#' \item REFSEQ: RefSeq Accession number; these entries are unique +#' \item tx_refseq: RefSeq Accession number; these entries are unique #' and refer to the mRNA transcript -#' \item ENTREZID: Entrez gene ID -#' \item SYMBOL: Gene symbol -#' \item ENSEMBL: Ensembl gene ID -#' \item GENENAME: Gene name +#' \item gene_entrez: Entrez gene ID +#' \item gene_symbol: Gene symbol +#' \item gene_ensembl: Ensembl gene ID +#' \item gene_name: Gene name #' } #' Note that mapping between gene IDs is a many-to-many mapping process. #' For example, different transcript RefSeq IDs can belong to the same @@ -239,7 +238,7 @@ GetTxDb <- function(genomeVersion = "hg38", #' #' @param txdb A \code{TxDb} object. #' -#' @return A \code{data.frame}. See 'Details'. +#' @return A \code{DataFrame}. See 'Details'. #' #' @author Maurits Evers, \email{maurits.evers@@anu.edu.au} #' @keywords internal @@ -295,12 +294,11 @@ GetGeneIds <- function(txdb) { stop(sprintf("Unknown genome %s.", genomeVersion)) } - # Get transcripts from TxDb database; RefSeq transcript ID and Entrez gene ID - # as metadata columns - # Store REFSEQ and ENTREZID in DataFrame geneXID - tx <- transcripts(txdb, c("TXNAME", "GENEID")) - geneXID <- setNames(mcols(tx), c("REFSEQ", "ENTREZID")) - geneXID$ENTREZID <- as.character(geneXID$ENTREZID) + # Get transcripts from TxDb database; RefSeq transcript ID and Entrez + # gene ID as metadata columns + # Store gene_refseq and gene_entrez in `DataFrame` geneXID + geneXID <- mcols(transcripts(txdb, c("TXNAME", "GENEID"))) + geneXID$GENEID <- as.character(geneXID$GENEID) # Get various gene identifiers from OrgDb database and merge with # geneXID DataFrame @@ -312,13 +310,21 @@ GetGeneIds <- function(txdb) { geneXID, suppressMessages(select( refDb, - keys = geneXID$ENTREZID, + keys = geneXID$GENEID, columns = identifier, keytype = "ENTREZID")), - by = "ENTREZID", - sort = FALSE)[c(2:1,3:5)] + by.x = "GENEID", by.y = "ENTREZID", + sort = FALSE)[c(2:1, 3:5)] - return(geneXID[!duplicated(geneXID), ]) + # Set column names + geneXID <- setNames( + geneXID, + c( + "tx_refseq", "gene_entrez", "gene_symbol", + "gene_ensembl", "gene_name")) + + # Return `DataFrame` + geneXID[!duplicated(geneXID), ] } @@ -338,7 +344,7 @@ GetGeneIds <- function(txdb) { #' @param txdb A \code{TxDb} object. #' @param sections A character vector; specifies which transcript #' sections should be extracted from \code{txdb}; default is -#' \code{c("Promoter", "5'UTR", "CDS", "3'UTR", "Intron")}. +#' \code{c("Promoter", "5'UTR", "CDS", "3'UTR")}. #' @param promUpstream An integer; specifies the number of #' nucleotides to be included upstream of 5'UTR start as part of #' the promoter. @@ -359,8 +365,7 @@ GetTxBySec <- function(txdb, "Promoter", "5'UTR", "CDS", - "3'UTR", - "Intron"), + "3'UTR"), promUpstream = 1000, promDownstream = 0, verbose = FALSE) { @@ -380,8 +385,7 @@ GetTxBySec <- function(txdb, validFeat <- c("Promoter", "CDS", "coding", "5pUTR", "5'UTR", "UTR5", - "3pUTR", "3'UTR", "UTR3", - "Intron", "Intronic") + "3pUTR", "3'UTR", "UTR3") isValidFeat <- grepl(sprintf("(%s)", paste0(validFeat, collapse = "|")), sections, ignore.case = TRUE) if (!all(isValidFeat)) { @@ -400,9 +404,6 @@ GetTxBySec <- function(txdb, } else if (grepl("(3pUTR|3'UTR|UTR3)", sections[i], ignore.case = TRUE)) { sec <- suppressWarnings( threeUTRsByTranscript(txdb, use.names = TRUE)) - } else if (grepl("(Intron|Intronic)", sections[i], ignore.case = TRUE)) { - sec <- suppressWarnings( - intronsByTranscript(txdb, use.names = TRUE)) } else if (grepl("Promoter", sections[i], ignore.case = TRUE)) { promoter <- suppressWarnings( unname(promoters(txdb, @@ -553,7 +554,6 @@ CollapseTxBySec <- function(txBySec, stop("Transcript feature list does not contain 3'UTR entries.") } whichPromoter <- grep("Promoter", sections, ignore.case = TRUE) - whichIntron <- grep("(Intron|Intronic)", sections, ignore.case = TRUE) # (1) Collapse CDS: # Step 1: Same RefSeq ID, multiple identical genomic copies # Example: RefSeq ID = NM_000513 @@ -562,12 +562,12 @@ CollapseTxBySec <- function(txBySec, cds <- txBySec[[whichCDS]] cds <- cds[order(names(cds), -sum(GenomicRanges::width(cds)))] cds <- cds[!duplicated(names(cds))] - geneData <- geneXID[which(geneXID$REFSEQ %in% names(cds)), ] + geneData <- geneXID[which(geneXID$tx_refseq %in% names(cds)), ] geneData$lengthCDS <- sum(GenomicRanges::width( - cds[match(geneData$REFSEQ, names(cds))])) - geneData <- geneData[order(geneData$ENTREZID, -geneData$lengthCDS), ] - geneData <- geneData[!duplicated(geneData$ENTREZID), ] - cds <- cds[which(names(cds) %in% geneData$REFSEQ)] + cds[match(geneData$tx_refseq, names(cds))])) + geneData <- geneData[order(geneData$gene_entrez, -geneData$lengthCDS), ] + geneData <- geneData[!duplicated(geneData$gene_entrez), ] + cds <- cds[which(names(cds) %in% geneData$tx_refseq)] # (2) Collapse UTR's # Step 1: Filter based on valid RefSeq ID from cds # Note that UTR list will still contain duplicate RefSeq entries, @@ -592,12 +592,6 @@ CollapseTxBySec <- function(txBySec, promoter <- as(unlist(promoter), "CompressedGRangesList") promoter.match <- DedupeBasedOnNearestRef(promoter, utr5.match, showPb = TRUE) } - # (3) Collapse introns (if included) - if (length(whichIntron) > 0) { - cat("Collapsing duplicate introns entries\n") - intron <- txBySec[[whichIntron]] - intron.match <- DedupeBasedOnNearestRef(intron, cds, showPb = TRUE) - } # Generate return object txBySec.collapsed <- list() for (i in 1:length(txBySec)) { @@ -607,8 +601,6 @@ CollapseTxBySec <- function(txBySec, feat <- utr5.match } else if (grepl("(3pUTR|3'UTR|UTR3)", sections[i], ignore.case = TRUE)) { feat <- utr3.match - } else if (grepl("(Intron|Intronic)", sections[i], ignore.case = TRUE)) { - feat <- intron.match } else if (grepl("Promoter", sections[i], ignore.case = TRUE)) { feat <- promoter.match } @@ -672,10 +664,6 @@ PerformSanityCheck <- function(txBySec) { #' #' @param txBySec A \code{list} of \code{GRangesList} objects; #' output of function \code{GetTxBySec} or \code{CollapseTxBySec}. -#' @param skipIntrons A logical scalar; if \code{TRUE} sequences -#' based on introns are not extracted; this is usually a good idea -#' as long intronic sequences can lead to large memory imprints -#' (and file sizes if objects are saved). #' #' @return A \code{list} of \code{DNAStringSet} objects. #' @@ -683,17 +671,8 @@ PerformSanityCheck <- function(txBySec) { #' @keywords internal #' #' @export -GetTxSeq <- function(txBySec, - skipIntrons = TRUE) { - # Get a list of sequences for every feature from the list - # of transcript sections - # - # Args: - # txBySec: List of GRangesList transcript sections. - # skipIntrons: If TRUE, skip intron sequences. - # - # Returns: - # List of DNAStringSet sequences +GetTxSeq <- function(txBySec) { + genomeVersion <- unique(unlist(lapply(txBySec, function(x) genome(x)))) if (grepl("^hg", genomeVersion)) { genomePkg <- sprintf("BSgenome.Hsapiens.UCSC.%s", genomeVersion) @@ -727,14 +706,6 @@ GetTxSeq <- function(txBySec, stop(sprintf("[ERROR] Unknown genome %s.",genomeVersion)) } sections <- names(txBySec) - if (skipIntrons == TRUE) { - whichIntrons <- grep("(Intron|Intronic)", - sections, ignore.case = TRUE) - if (length(whichIntrons) > 0) { - cat("Skipping introns.\n") - sections <- sections[-whichIntrons] - } - } txSequences<-list() pb <- txtProgressBar(min = 0, max = length(sections), @@ -864,42 +835,40 @@ BuildTx <- function(genomeVersion = c( #' @keywords internal #' #' @import GenomicRanges GenomicFeatures RSQLite -#' -#' @export -BuildTxTest <- function() { -# Generate txBySec - sections <- c("5'UTR", "CDS", "3'UTR") - tx <- data.frame(tx_id = seq(1,3), - tx_name=sprintf("tx%i",seq(1,3)), - tx_chrom="chr1", - tx_strand=c("-", "+", "+"), - tx_start=c(1, 2001, 3001), - tx_end=c(999, 2199, 5199)) - splice <- data.frame(tx_id = c(1L, 2L, 2L, 2L, 3L, 3L), - exon_rank=c(1, 1, 2, 3, 1, 2), - exon_start=c(1, 2001, 2101, 2131, 3001, 4001), - exon_end=c(999, 2085, 2144, 2199, 3601, 5199), - cds_start=c(1, 2022, 2101, 2131, 3201, 4001), - cds_end=c(999, 2085, 2144, 2193, 3601, 4501)) - genes <- cbind.data.frame(tx_name = sprintf("tx%i", seq(1,3)), - gene_id = sprintf("gene%i", seq(1,3))) - chrominfo <- cbind.data.frame(chrom = "chr1", length = 5199, is_circular = FALSE) - txdb <- makeTxDb(tx, splice, genes = genes, chrominfo = chrominfo) - txBySec <- list(fiveUTRsByTranscript(txdb, use.names = TRUE), - cdsBy(txdb, by = "tx", use.names = TRUE), - threeUTRsByTranscript(txdb, use.names = TRUE)) - names(txBySec) <- sections -# Generate genome - bases <- c("A", "C", "G", "T") - genome <- DNAString(paste(sample(bases, 5199, replace = TRUE), collapse = "")) -# Generate seqBySec - seqBySec <- lapply(txBySec, function(x) extractTranscriptSeqs(genome, ranges(x))) -# Generate geneXID - geneXID <- cbind.data.frame(REFSEQ = sprintf("tx%i", seq(1, 3)), - ENTREZID = seq(1000, 3000, length.out = 3), - SYMBOL = sprintf("gene%i", seq(1, 3)), - ENSEMBL = sprintf("ENSEMBL%i", seq(1, 3)), - UNIGENE = sprintf("UNIGENE%i", seq(1,3)), - GENENAME = sprintf("name%i", seq(1,3))) - save(geneXID, txBySec, seqBySec, file = "tx_test.RData", compress = "gzip") -} +#BuildTxTest <- function() { +## Generate txBySec +# sections <- c("5'UTR", "CDS", "3'UTR") +# tx <- data.frame(tx_id = seq(1,3), +# tx_name=sprintf("tx%i",seq(1,3)), +# tx_chrom="chr1", +# tx_strand=c("-", "+", "+"), +# tx_start=c(1, 2001, 3001), +# tx_end=c(999, 2199, 5199)) +# splice <- data.frame(tx_id = c(1L, 2L, 2L, 2L, 3L, 3L), +# exon_rank=c(1, 1, 2, 3, 1, 2), +# exon_start=c(1, 2001, 2101, 2131, 3001, 4001), +# exon_end=c(999, 2085, 2144, 2199, 3601, 5199), +# cds_start=c(1, 2022, 2101, 2131, 3201, 4001), +# cds_end=c(999, 2085, 2144, 2193, 3601, 4501)) +# genes <- cbind.data.frame(tx_name = sprintf("tx%i", seq(1,3)), +# gene_id = sprintf("gene%i", seq(1,3))) +# chrominfo <- cbind.data.frame(chrom = "chr1", length = 5199, is_circular = FALSE) +# txdb <- makeTxDb(tx, splice, genes = genes, chrominfo = chrominfo) +# txBySec <- list(fiveUTRsByTranscript(txdb, use.names = TRUE), +# cdsBy(txdb, by = "tx", use.names = TRUE), +# threeUTRsByTranscript(txdb, use.names = TRUE)) +# names(txBySec) <- sections +## Generate genome +# bases <- c("A", "C", "G", "T") +# genome <- DNAString(paste(sample(bases, 5199, replace = TRUE), collapse = "")) +## Generate seqBySec +# seqBySec <- lapply(txBySec, function(x) extractTranscriptSeqs(genome, ranges(x))) +## Generate geneXID +# geneXID <- cbind.data.frame(REFSEQ = sprintf("tx%i", seq(1, 3)), +# ENTREZID = seq(1000, 3000, length.out = 3), +# SYMBOL = sprintf("gene%i", seq(1, 3)), +# ENSEMBL = sprintf("ENSEMBL%i", seq(1, 3)), +# UNIGENE = sprintf("UNIGENE%i", seq(1,3)), +# GENENAME = sprintf("name%i", seq(1,3))) +# save(geneXID, txBySec, seqBySec, file = "tx_test.RData", compress = "gzip") +#} diff --git a/R/methods.R b/R/methods.R index 9484865..8f2df92 100644 --- a/R/methods.R +++ b/R/methods.R @@ -1,144 +1,135 @@ #' Map genome coordinates to transcript coordinates. #' -#' Map genome coordinates to transcript coordinates. See -#' 'Details'. -#' This is a low-level function that is being called from -#' \code{SmartMap}. -#' -#' The function maps genomic coordinates from \code{locus} to -#' transcript section coordinates from \code{txBySec}. The -#' function returns a \code{list} of \code{data.frame} objects -#' with additional gene ID and sequence information. +#' Map genome coordinates to transcript coordinates. See 'Details'. +#' This is a low-level function that is being called from \code{SmartMap}. +#' There is no guarantee that this function will get exported in future +#' releases of RNAModR. Use at your own risk. +#' +#' The function maps genomic coordinates from \code{gr} to transcript region +#' coordinates from \code{txBySec}. The function returns a \code{list} of +#' \code{DataFrame} objects, each with the following columns: +#' \enumerate{ +#' \item \code{locus_in_txx_region}: A \code{GRanges} object +#' \item \code{locus_in_genome}: A \code{GRanges} object +#' \item \code{score}: A \code{numeric} vector +#' \item \code{id}: A \code{character} vector +#' \item \code{tx_region}: A \code{character} vector +#' \item \code{tx_region_width}: An \code{integer} vector +#' \item \code{tx_region_sequence}: A \code{DNAStringSet} object +#' \item \code{tx_refseq}: A \code{character} vector +#' \item \code{gene_entrez}: A \code{character} vector +#' \item \code{gene_symbol}: A \code{character} vector +#' \item \code{gene_ensembl}: A \code{character} vector +#' \item \code{gene_name}: A \code{character} vector +#' } #' -#' @param locus A \code{GRanges} object; specifies the list of -#' genomic features to be mapped. -#' @param txBySec A \code{list} of \code{GRangesList} objects; -#' specifies the reference transcriptome; the object is usually -#' a result of running \code{BuildTx}. -#' @param seqBySec A \code{list} of \code{DNAStringSet} objects; -#' sequences of (some or all) segments from \code{txBySec}; -#' the object is usually a result of running \code{BuildTx}. -#' @param geneXID A \code{data.frame}; specifies different gene -#' IDs; the object is usually a result of running \code{BuildTx}. -#' @param ignore.strand A logical scalar; if \code{TRUE} strand -#' information is ignored when mapping genome coordinates to -#' transcript coordinates; default is \code{FALSE}. -#' @param noFactors A logical scalar; if \code{TRUE}, the return -#' object contains no \code{factors}; default is \code{TRUE}. +#' @param gr A \code{GRanges} object; specifies the list of genomic features to +#' be mapped. +#' @param txBySec A \code{list} of \code{GRangesList} objects; specifies the +#' reference transcriptome; the object is usually a result of running +#' \code{BuildTx}. +#' @param seqBySec A \code{list} of \code{DNAStringSet} objects; sequences of +#' (some or all) segments from \code{txBySec}; the object is usually a result +#' of running \code{BuildTx}. +#' @param geneXID A \code{DataFrame}; specifies different gene IDs; the object +#' is usually a result of running \code{BuildTx}. +#' @param ignore.strand A logical scalar; if \code{TRUE} strand information is +#' ignored when mapping genome coordinates to transcript coordinates; default +#' is \code{FALSE}. #' @param showPb A logical scalar; if \code{TRUE} show a progress #' bar; default is \code{FALSE}. #' -#' @return A \code{list} of \code{data.frame} objects. See -#' 'Details'. +#' @return A \code{list} of \code{data.frame} objects. See 'Details'. #' #' @author Maurits Evers, \email{maurits.evers@@anu.edu.au} -#' @keywords internal #' -#' @export -SmartMap.ToTx <- function(locus, +#' @importFrom S4Vectors DataFrame +#' +#' @keywords internal +SmartMap.ToTx <- function(gr, txBySec, seqBySec, geneXID, ignore.strand = FALSE, - noFactors = TRUE, showPb = FALSE) { - # Map positions from locus to transcript regions. - # - # Args: - # locus: Loci of genomic features to be mapped as GRanges object. - # txBySec: List of GRangesList transcript features. - # seqBySec: List of DNAStringSet sequences. - # geneXID: Dataframe of cross-referencing gene IDs. - # ignore.strand: Ignore strand information during mapping. - # Default is FALSE. - # noFactors: No (character) factors in output. Default is TRUE. - # - # Returns: - # List of dataframes with mapped transcript coordinates for all - # features in locus. - locusInTx.list <- list() + + # Initialise progress bar if `showPb == TRUE` if (showPb == TRUE) pb <- txtProgressBar(max = length(txBySec), style = 3, width = 60) - for (i in 1:length(txBySec)) { - if (showPb) setTxtProgressBar(pb, i) + + # Map to different `txBySec` regions and return `list` of `DataFrame`s + lst <- lapply( + setNames(1:length(txBySec), names(txBySec)), + function(i) { + + if (showPb) setTxtProgressBar(pb, i) - # [UPDATE September 2019] mapToTranscripts is now more stringent in that it - # requires seqlevels(locus) to be a subset of seqlevels(txBySec[[1]]@unlistData) - # In other words, if locus contains chromosomes that are _not_ included in - # txBySec this will throw a critical error. - # So we need to make sure that we only have entries in `locus` on chromosomes - # that are included as seqlevels of transcripts in `txBySec` - locus <- keepSeqlevels( - locus, - intersect(seqlevels(txBySec[[i]]@unlistData), seqlevels(locus)), - pruning.mode = "coarse") - - # Map to transcripts; the output object has two metadata columns: - # xHits = index of the mapped locus object - # transcriptsHits = index of the txBySec[[i]] object - gr <- mapToTranscripts(locus, txBySec[[i]], ignore.strand = ignore.strand) - - #tmp <- cbind( - # setNames(DataFrame(gr)[1], "locus_in_tx"), - # setNames( - # DataFrame(locus[mcols(gr)$xHits]), - # c("locus_in_genome", names(mcols(locus)))), - # geneXID[as.integer(match(seqnames(gr), geneXID$REFSEQ))], ) + region <- names(txBySec)[i] + + # [UPDATE September 2019] `mapToTranscripts` is now more stringent + # in that it requires `seqlevels(gr)` to be a subset of + # `seqlevels(txBySec[[i]]@unlistData)`. In other words, if `gr` + # contains chromosomes that are _not_ included in `txBySec` this + # will throw a critical error. So we need to make sure that we + # only have entries in `gr` on chromosomes that are included as + # seqlevels of transcripts in `txBySec` + gr <- keepSeqlevels( + gr, + intersect( + seqlevels(txBySec[[region]]@unlistData), + seqlevels(gr)), + pruning.mode = "coarse") + + # Map to transcripts; the output object has two metadata columns: + # xHits = index of the mapped `gr` object + # transcriptsHits = index of the `txBySec[[i]]` object + hits <- mapToTranscripts( + gr, + txBySec[[region]], + ignore.strand = ignore.strand) + + # Return DataFrame with columns + # locus_in_tx_region: + # locus_in_genome: + # score: + # id: + # tx_region: + # tx_region_width: + # tx_region_sequence: + # tx_refseq: + # gene_entrez: + # gene_symbol: + # gene_ensembl: + # gene_name: + cbind( + setNames( + DataFrame( + granges(hits, use.names = F, use.mcols = F)), + "locus_in_tx_region"), + setNames( + DataFrame( + granges(gr[mcols(hits)$xHits], use.names = F, use.mcols = F)), + "locus_in_genome"), + setNames( + mcols(gr[mcols(hits)$xHits]), + c("score", "id")), + setNames(DataFrame( + region, + sum(width(txBySec[[region]][mcols(hits)$transcriptsHits]))), + c("tx_region", "tx_region_width")), + setNames(DataFrame( + seqBySec[[region]][match( + seqnames(hits), names(seqBySec[[region]]))]), + "tx_region_sequence"), + geneXID[match(seqnames(hits), geneXID$tx_refseq), ]) - idxLoc <- mcols(gr)$xHits - idxTx <- mcols(gr)$transcriptsHits - # THIS COULD DO WITH SOME TIDYING UP ... - # Collate data from mapping, locus, txBySec, seqBySec and geneXID -# idxSeq <- which(names(txBySec)[i] == names(seqBySec)) -# dataMap <- GenomicRanges::as.data.frame(gr) -# dataLoc <- GenomicRanges::as.data.frame(locus[idxLoc]) -# dataTx <- GenomicRanges::as.data.frame(range(txBySec[[i]][idxTx])) -# if (length(idxSeq) > 0) { -# dataSeq <- seqBySec[[idxSeq]][match( -# dataMap[, 1], -# names(seqBySec[[idxSeq]]))] -# dataSeq <- as.character(dataSeq) -# } else { -# dataSeq <- rep("", nrow(dataTx)) -# } -# dataID <- geneXID[match(dataMap[, 1], geneXID[, 1]), ] - locusInTx <- GenomicRanges::as.data.frame(gr)[, 1:4] - colnames(locusInTx) <- c("REFSEQ", "TXSTART", "TXEND", "TXWIDTH") - dataQuery <- GenomicRanges::as.data.frame(locus[idxLoc]) - colnames(dataQuery) <- c("CHR", "START", "STOP", "WIDTH", - "STRAND", "SCORE", "ID") - dataRef <- GenomicRanges::as.data.frame( - range(txBySec[[i]][idxTx]))[, -1] - idxSeq <- which(names(txBySec)[i] == names(seqBySec)) - if (length(idxSeq) > 0) { - dataSeq <- as.character(seqBySec[[idxSeq]][match( - dataRef[, 1], - names(seqBySec[[idxSeq]]))]) - } else { - dataSeq <- rep("", nrow(dataRef)) - } - dataRef <- cbind(rep(names(txBySec)[i], nrow(dataRef)), - dataRef[, 1], - geneXID[match(dataRef[, 1], geneXID$REFSEQ), ][, -1], - dataRef[, 2:ncol(dataRef)], - sum(width(txBySec[[i]][idxTx])), - dataSeq) - colnames(dataRef) <- c("GENE_REGION", "GENE_REFSEQ", "GENE_ENTREZ", - "GENE_SYMBOL", "GENE_ENSEMBL", - "GENE_NAME", "GENE_CHR", "GENE_START", - "GENE_STOP", "GENE_WIDTH", "GENE_STRAND", - "REGION_TXWIDTH", "REGION_SEQ") - locusInTx <- cbind(locusInTx, - dataQuery, - dataRef) - if (noFactors) { - locusInTx <- Unfactor(locusInTx) } - locusInTx.list[[length(locusInTx.list) + 1]] <- locusInTx - } + ) + if (showPb) close(pb) - names(locusInTx.list) <- names(txBySec) - return(locusInTx.list) + + lst + } @@ -146,25 +137,23 @@ SmartMap.ToTx <- function(locus, #' #' Map genome coordinates to transcript coordinates. #' -#' The function maps genomic coordinates from \code{locus} to -#' transcript section coordinates. The function automatically -#' loads a reference transcriptome based on \code{refGenome}. -#' An error is thrown if a reference transcriptome could not -#' be found. This usually means that \code{BuildTx} was not yet -#' run successfully. +#' The function maps genomic coordinates from \code{locus} to transcript +#' section coordinates. The function automatically loads a reference +#' transcriptome based on \code{refGenome}. An error is produced if a reference +#' transcriptome could not be found. This usually means that \code{BuildTx} was +#' not yet run successfully. #' The function returns a \code{txLoc} object of mapped positions. #' -#' @param locus A \code{GRanges} object; specifies the list of -#' of genomic features to be mapped. -#' @param id A character string; specifies a name for loci from -#' \code{locus}; if \code{NULL} then \code{id = ""}; default is -#' \code{NULL}. -#' @param refGenome A character string; specifies a specific -#' reference genome assembly version based on which the matching -#' transcriptome is loaded; default is \code{"hg38"}. -#' @param ignore.strand A logical scalar; if \code{TRUE} strand -#' information is ignored when mapping genome coordinates to -#' transcript coordinates; default is \code{FALSE}. +#' @param gr A \code{GRanges} object; specifies the list of of genomic features +#' to be mapped. +#' @param id A character string; specifies a name for loci from \code{gr}; if +#' \code{NULL} then \code{id = ""}; default is \code{NULL}. +#' @param refGenome A character string; specifies a specific reference genome +#' assembly version based on which the matching transcriptome is loaded; +#' default is \code{"hg38"}. +#' @param ignore.strand A logical scalar; if \code{TRUE} strand information is +#' ignored when mapping genome coordinates to transcript coordinates; default +#' is \code{FALSE}. #' #' @return A \code{txLoc} object. See 'Details'. #' @@ -176,29 +165,21 @@ SmartMap.ToTx <- function(locus, #' \dontrun{ #' bedFile <- system.file("extdata", #' "miCLIP_m6A_Linder2015_hg38.bed", -#' package = "RNAModR"); -#' sites <- ReadBED(bedFile); -#' txSites <- SmartMap(sites, id = "m6A", refGenome = "hg38"); +#' package = "RNAModR") +#' sites <- ReadBED(bedFile) +#' txSites <- SmartMap(sites, id = "m6A", refGenome = "hg38") #' } #' @export -SmartMap <- function(locus, +SmartMap <- function(gr, id = NULL, refGenome = "hg38", ignore.strand = FALSE) { - # User function to map positions from locus to transcript regions. - # - # Args: - # locus: GRanges object of loci of genomic features. - # id: Identifier for loci. Default is NULL. - # refGenome: reference genome/transcriptome version - # ignore.strand: Ignore strand during mapping. Default is FALSE. - # - # Returns: - # txLoc object. - # - # Load reference transcriptome - CheckClass(locus, "GRanges") + + # Sanity check + CheckClass(gr, "GRanges") + + # Load transcriptome data refTx <- sprintf("tx_%s.RData", refGenome) if (!file.exists(refTx)) { ss <- sprintf("Reference transcriptome for %s not found.", refGenome) @@ -219,24 +200,25 @@ SmartMap <- function(locus, geneXID <- get("geneXID") seqBySec <- get("seqBySec") txBySec <- get("txBySec") + # Map coordinates to transcript - locusInTx.list <- SmartMap.ToTx( - locus, txBySec, seqBySec, geneXID, + loci <- SmartMap.ToTx( + gr, txBySec, seqBySec, geneXID, ignore.strand = ignore.strand, showPb = TRUE) - if (is.null(id)) { - id = "" - } - obj <- new("txLoc", - loci = locusInTx.list, - id = id, - refGenome = refGenome, - version = as.character(Sys.Date())) - return(obj) + + # Return `txLoc` object + new("txLoc", + loci = loci, + id = if (is.null(id)) "" else id, + refGenome = refGenome, + version = as.character(Sys.Date())) + } + #' Construct \code{txLoc}-suitable \code{dataframe} object from -#' mapping transcript loci to genomic loci. +#' mapping transcript loci to genomic loci. (CURRENTLY BROKEN) #' #' Construct \code{txLoc}-suitable \code{dataframe} object from #' mapping transcript loci to genomic loci. @@ -301,47 +283,46 @@ GetLocus.MapFromTranscripts <- function(gr, ref, seq, section, geneXID) { #' \code{txLoc} object. #' Two different methods can be employed: #' \enumerate{ -#' \item \code{method = "ntAbund"}: Null sites are generated based -#' on the position of all non-modified nucleotides of type \code{nt} -#' in those transcript sections that also contain a modified site of -#' the same type and as specified in \code{locus}. -#' For example, if \code{locus} contains a list of m$^{6}$A sites, -#' the list of null sites consists of all non-methylated adenosines -#' in those transcripts that contain at least one m$^{6}$A site. -#' \item \code{method = "perm"}: Null sites are generated by -#' permuting the position of sites from \code{locus} uniformly within -#' the corresponding transcript section. Note that this will generate -#' a list of null sites with the same abundance ratios across -#' transcript sections as the list of sites from \code{locus}. It is -#' therefore not useful for assessing an enrichment of sites within -#' a particular transcript section. -#' } -#' It is import to emphasise that any downstream enrichment analysis -#' may depend critically on the choice of the null distribution. -#' For example, a position permution-based null distribution may not -#' be a valid null distribution, if the nucleotide position -#' distribution is highly non-uniform across a transcript section. -#' This is the case e.g. for the spatial distribution of cytosines -#' within and across the 5'UTR, CDS and/or 3'UTR. In this case, a -#' better null distribution would be to consider all cytosines -#' in transcript sections that also contain a site in \code{locus}. -#' This can be achieved with \code{method = "ntAbund"}. A still -#' better list of null sites would be based on all \emph{expressed} -#' and non-modified cytosines based on the same sequencing data that -#' was used to identify modified cytosines. -#' -#' @param locus A \code{txLoc} object. -#' @param id A character string; identifier for null sites; if -#' \code{NULL} then id = \code{"null"}; default is \code{NULL}. -#' @param method A character string; specifies the method used -#' to generate null distribution; if \code{method == "ntAbund"} -#' the position of all nucleotides specified by \code{nt} will -#' be used as null sites; if \code{method == "perm"} then null -#' sites will be generated by uniform-randomly shuffling candidate -#' positions from \code{locus} within the corresponding transcript -#' region; default is \code{"ntAbund"}. -#' @param nt A single character; if \code{method == "ntAbund"}, -#' use \code{nt} to derive distribution of null sites. +#' \item \code{method = "ntAbund"}: Null sites are generated based on the +#' position of all non-modified nucleotides of type \code{nt} in those +#' transcript sections that also contain a modified site of the same type and +#' as specified in \code{locus}. +#' For example, if \code{locus} contains a list of m$^{6}$A sites, the list of +#' null sites consists of all non-methylated adenosines in those transcripts +#' that contain at least one m$^{6}$A site. +#' \item \code{method = "perm"}: Null sites are generated by randomly permuting +#' the position of sites from \code{locus} uniformly within the corresponding +#' transcript section. Note that this will generate a list of null sites with +#' the same abundance ratios across transcript sections as the list of sites +#' from \code{locus}. It is therefore not useful for assessing an enrichment of +#' sites within a particular transcript section. In fact, this method should +#' not be used and is included purely for paedagogical purposes (to demonstrate +#' the importance of a sensible null distribution). It is likely that this +#' method will be removed from future RNAModR versions. +#' +#' It is import to emphasise that any downstream enrichment analysis may depend +#' critically on the choice of the null distribution. For example, a position +#' permution-based null distribution may not be a valid null distribution, if +#' the distribution of nucleotides is highly non-uniform across a transcript +#' section. This is the case e.g. for the spatial distribution of cytosines +#' within and across the 5'UTR, CDS and/or 3'UTR. In this case, a permutation- +#' based distribution of cytosines will not give a sensible null distribution. +#' Instead, a sensible null distribution can be derived from the position of +#' all cytosines in the relevant transcript region containing the methylated +#' cytosine site in \code{locus}. \code{method = "ntAbund"} generates a list of +#' null sites using this approach. +#' +#' @param txLoc A \code{txLoc} object. +#' @param id A character string; identifier for null sites; if \code{NULL} then +#' \code{id = "null"}; default is \code{NULL}. +#' @param method A character string; specifies the method used to the generate +#' null distribution; if \code{method == "ntAbund"} the position of all +#' nucleotides specified by \code{nt} will be used as null sites; if +#' \code{method == "perm"} (DEPRECATED, see 'Details') then null sites will be +#' generated by uniform-randomly shuffling candidatepositions from \code{locus} +#' within the corresponding transcript region; default is \code{"ntAbund"}. +#' @param nt A single character; if \code{method == "ntAbund"}, use \code{nt} +#' to derive distribution of null sites. #' @param showPb A logical scalar; if \code{TRUE} show a progress #' bar; default is \code{TRUE}. #' @@ -352,52 +333,44 @@ GetLocus.MapFromTranscripts <- function(gr, ref, seq, section, geneXID) { #' @import GenomicRanges IRanges #' @importFrom utils setTxtProgressBar txtProgressBar #' @importFrom stats runif setNames +#' @importFrom S4Vectors DataFrame #' #' @examples #' \dontrun{ #' bedFile <- system.file("extdata", #' "miCLIP_m6A_Linder2015_hg38.bed", -#' package = "RNAModR"); -#' sites <- ReadBED(bedFile); -#' posSites <- SmartMap(sites, id = "m6A", refGenome = "hg38"); -#' negSites <- GenerateNull(posSites, method = "ntAbund", nt = "A"); +#' package = "RNAModR") +#' sites <- ReadBED(bedFile) +#' posSites <- SmartMap(sites, id = "m6A", refGenome = "hg38") +#' negSites <- GenerateNull(posSites, method = "ntAbund", nt = "A") #' } #' #' @export -GenerateNull <- function(locus, +GenerateNull <- function(txLoc, id = NULL, method = c("ntAbund", "perm"), nt = "C", showPb = TRUE) { - # Generate null distribuion of SNM's across different transcript regions. - # - # Args: - # locus: List of dataframes with mapped SNM's across different - # transcript regions. - # method: Method used to generate null distribution. - # If method == "ntAbund" then the distribution of all - # nucleotides specified by nucleotide will be used as null sites. - # If method == "perm" then null sites will be generated - # by uniformly randomly permuting candidate positions from locus - # within transcript region. Default is "ntAbund". - # nt: Nucleotide to be used for deriving a list of null sites, - # if method == "ntAbund" - # showPb: If TRUE, show progress bar - # - # Returns: - # A txLoc object. Note that genome coordinates are not available for - # null sites. - CheckClass(locus, "txLoc") + + # Sanity checks + CheckClass(txLoc, "txLoc") method <- match.arg(method) - refGenome <- GetRef(locus) + refGenome <- GetRef(txLoc) if (is.null(id)) { - id <- sprintf("null_%s", GetId(locus)) + id <- sprintf("null_%s", GetId(txLoc)) } - locus <- GetLoci(locus) - locusNull.list <- list() + + # Quieten R CMD check concerns regarding "no visible binding for global + # variable ..." + tx_region_sequence <- tx_refseq <- NULL + + # Get loci from `txLoc` object + loci.pos <- GetLoci(txLoc) + + # Load transcriptome data for `method == "ntAbund"` if (method == "ntAbund") { -# FIX THIS!!! -# LoadRefTx(refGenome, geneXID, seqBySec, txBySec) + + # Load transcriptome data refTx <- sprintf("tx_%s.RData", refGenome) if (!file.exists(refTx)) { ss <- sprintf("Reference transcriptome for %s not found.", refGenome) @@ -418,89 +391,81 @@ GenerateNull <- function(locus, geneXID <- get("geneXID") seqBySec <- get("seqBySec") txBySec <- get("txBySec") + } + + # Activate progressbar if `showPb == TRUE` if (showPb == TRUE) - pb <- txtProgressBar(max = length(locus), style = 3, width = 60) - for (i in 1:length(locus)) { - if (showPb) setTxtProgressBar(pb, i) - if (method == "ntAbund") { - seqData <- locus[[i]][!duplicated(locus[[i]][, 1]), - c("REFSEQ", "GENE_CHR", "GENE_START", - "GENE_STOP", "STRAND", "REGION_SEQ")] - if (all(grepl("\\w+", seqData$REGION_SEQ))) { - # Positive sites that should be excluded from null - exclude <- locus[[i]][, c("REFSEQ", "TXSTART")] - # All sites of a specific nucleotide - txPos <- gregexpr(nt, seqData$REGION_SEQ) - names(txPos) <- seqData$REFSEQ - txPos <- data.frame( - ID = rep(names(txPos), sapply(txPos, length)), - x = unlist(txPos), stringsAsFactors = FALSE) - # Exclude positive sites - txPos <- rbind(txPos, setNames(exclude, names(txPos))) - txPos <- txPos[!duplicated(txPos, fromLast = FALSE) & - !duplicated(txPos, fromLast = TRUE), ] - rownames(txPos) <- seq(1, nrow(txPos)) - # Map sites back to genome - gr <- GRanges(txPos$ID, IRanges(txPos$x, txPos$x)) - idxSec <- which(names(locus)[i] == names(txBySec)) - genomePos <- mapFromTranscripts(gr, txBySec[[idxSec]]) - genomePos <- as.data.frame(genomePos); - } else { - sec <- names(locus)[i] - ss <- sprintf("Cannot generate null for %s", sec) - ss <- sprintf("%s: Missing sequence information.", ss) - ss <- sprintf("%s\nEither exclude %s from downstream analysis", - ss, sec) - ss <- sprintf("%s, or use different null.", ss) - warning(ss) - txPos <- cbind.data.frame( - ID = seqData$REFSEQ, - x = rep("*", nrow(seqData))) - genomePos <- cbind.data.frame( - start = rep("*", nrow(seqData)), - end = rep("*", nrow(seqData)), - width = rep(1, nrow(seqData))) - } - idxSeq <- match(txPos$ID, seqData$REFSEQ) - idxLoc <- match(txPos$ID, locus[[i]]$GENE_REFSEQ) - locusNull <- cbind.data.frame( - txPos, txPos$x, rep(1, nrow(txPos)), - seqData$GENE_CHR[idxSeq], - genomePos$start, - genomePos$end, - genomePos$width, - seqData$STRAND[idxSeq], - rep(0, nrow(txPos)), - rep(sprintf("nucl_%s", nt), nrow(txPos)), - locus[[i]][idxLoc, 12:ncol(locus[[i]])]); - colnames(locusNull) <- colnames(locus[[i]]) - locusNull <- Unfactor(locusNull) - } else if (method == "perm") { - locusNull <- locus[[i]] - locusNull$TXSTART <- apply( - locusNull, 1, function(x) { - round(runif(1, - min = 1, - max = as.numeric(x["REGION_TXWIDTH"]) - 1))}) - locusNull$TXEND <- locusNull$TXSTART - locusNull[, c("CHR", "START", "STOP", - "WIDTH", "STRAND", "SCORE")] <- matrix( - "*", - nrow = nrow(locusNull), - ncol = 6) - locusNull$ID <- "unif_perm" - } - locusNull.list[[length(locusNull.list) + 1]] <- locusNull - } + pb <- txtProgressBar(max = length(loci.pos), style = 3, width = 60) + + loci <- mapply( + function(i, sec) { + + # Update progress bar + if (showPb) setTxtProgressBar(pb, i) + + # Loci of positive sites for region i + locus.pos <- loci.pos[[i]] + + # Get sequence data for unique RefSeq transcript regions + # Returns a DNAStringSet object + unique_tx_region_sequence <- with( + locus.pos, + tx_region_sequence[!duplicated(tx_refseq)]) + + # Extract position of all `nt`s inside every sequence + mindex <- vmatchPattern(nt, unique_tx_region_sequence) + + # Exclude positive sites from null sites + gr <- subsetByOverlaps( + GRanges(mindex), + locus.pos$locus_in_tx_region, + invert = TRUE) + + # Map sites from transcriptomic to genomic coordinates + hits <- mapFromTranscripts(gr, sec) + + # Copy genomic strand information to tx data + strand(gr) <- strand(hits) + + # Return DataFrame with columns + # locus_in_tx_region: + # locus_in_genome: + # score: + # id: + # tx_region: + # tx_region_width: + # tx_region_sequence: + # tx_refseq: + # gene_entrez: + # gene_symbol: + # gene_ensembl: + # gene_name: + cbind( + setNames(DataFrame(gr[mcols(hits)$xHits]), "locus_in_tx_region"), + setNames( + DataFrame(granges(hits, use.names = FALSE, use.mcols = FALSE)), + "locus_in_genome"), + setNames(DataFrame( + rep(0, length(gr)), + paste0(seqnames(hits), ":", start(hits), "-", end(hits))), + c("score", "id")), + locus.pos[match( + seqnames(gr[mcols(hits)$xHits]), locus.pos$tx_refseq), -(1:4)]) + + }, + setNames(seq_along(loci.pos), names(loci.pos)), + txBySec[names(loci.pos)]) + if (showPb) close(pb) - names(locusNull.list) <- names(locus) - obj <- new("txLoc", - loci = locusNull.list, - id = id, - refGenome = refGenome, - version = as.character(Sys.Date())) - return(obj) + + # Return `txLoc` object + new("txLoc", + loci = loci, + id = id, + refGenome = refGenome, + version = as.character(Sys.Date())) + } diff --git a/R/plot.R b/R/plot.R index de3c8f1..0899145 100644 --- a/R/plot.R +++ b/R/plot.R @@ -244,10 +244,10 @@ PlotSectionDistribution <- function(locus, filter = NULL) { } -#' Plot spatial distribution of loci from \code{txLoc} object. +#' Plot spatial distribution of loci from a \code{txLoc} object. #' -#' Plot spatial distribution of loci from \code{txLoc} object within every -#' transcript section. +#' Plot spatial distribution of loci from a \code{txLoc} object within every +#' transcript region. #' #' @param locus A \code{txLoc} object. #' @param filter Only plot loci in transcript regions specified in filter. @@ -269,14 +269,14 @@ PlotSectionDistribution <- function(locus, filter = NULL) { #' \dontrun{ #' bedFile <- system.file("extdata", #' "miCLIP_m6A_Linder2015_hg38.bed", -#' package = "RNAModR"); -#' sites <- ReadBED(bedFile); -#' posSites <- SmartMap(sites, id = "m6A", refGenome = "hg38"); -#' PlotSpatialDistribution(posSites); +#' package = "RNAModR") +#' sites <- ReadBED(bedFile) +#' posSites <- SmartMap(sites, id = "m6A", refGenome = "hg38") +#' PlotSpatialDistribution(posSites) #' PlotSpatialDistribution(posSites, #' absolute = TRUE, #' filter = c("5'UTR", "CDS", "3'UTR"), -#' ylim = c(0, 200)); +#' ylim = c(0, 200)) #' } #' #' @export @@ -288,25 +288,8 @@ PlotSpatialDistribution <- function(locus, posMax = 1000, doBootstrap = TRUE, ...) { - # Plot the spatial distribution of features across different - # transcript regions. - # - # Args: - # locus: List of dataframes with mapped features across different - # transcript regions. - # filter: Only plot loci in transcript regions specified in filter. - # nbreaks: Number of spatial bins. - # absolute: Plot as a function of absolute coordinates. - # Default is FALSE. - # binWidth: Spatial bin width. Overrides nbreaks if not NULL. - # posMax: If absolute == TRUE, plot up distribution up to - # this distance. Default is 1000 nt. - # doBootstrap: Calculate bootstrap CI (only if absolute == FALSE). - # Default is TRUE. - # ...: Additional parameters passed to plot. - # - # Returns: - # NULL + + # Sanity check CheckClass(locus, "txLoc") id <- GetId(locus) refGenome <- GetRef(locus) diff --git a/man/BuildTxTest.Rd b/man/BuildTxTest.Rd deleted file mode 100644 index 7bd6ce6..0000000 --- a/man/BuildTxTest.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/buildTxMethods.R -\name{BuildTxTest} -\alias{BuildTxTest} -\title{Build a test transcriptome.} -\usage{ -BuildTxTest() -} -\description{ -Build a test transcriptome. See 'Details'. -} -\details{ -The function builds a transcriptome for testing purposes. -} -\author{ -Maurits Evers, \email{maurits.evers@anu.edu.au} -} -\keyword{internal} diff --git a/man/GenerateNull.Rd b/man/GenerateNull.Rd index 94b2202..1355b52 100644 --- a/man/GenerateNull.Rd +++ b/man/GenerateNull.Rd @@ -4,25 +4,24 @@ \alias{GenerateNull} \title{Generate a list of null sites.} \usage{ -GenerateNull(locus, id = NULL, method = c("ntAbund", "perm"), +GenerateNull(txLoc, id = NULL, method = c("ntAbund", "perm"), nt = "C", showPb = TRUE) } \arguments{ -\item{locus}{A \code{txLoc} object.} +\item{txLoc}{A \code{txLoc} object.} -\item{id}{A character string; identifier for null sites; if -\code{NULL} then id = \code{"null"}; default is \code{NULL}.} +\item{id}{A character string; identifier for null sites; if \code{NULL} then +\code{id = "null"}; default is \code{NULL}.} -\item{method}{A character string; specifies the method used -to generate null distribution; if \code{method == "ntAbund"} -the position of all nucleotides specified by \code{nt} will -be used as null sites; if \code{method == "perm"} then null -sites will be generated by uniform-randomly shuffling candidate -positions from \code{locus} within the corresponding transcript -region; default is \code{"ntAbund"}.} +\item{method}{A character string; specifies the method used to the generate +null distribution; if \code{method == "ntAbund"} the position of all +nucleotides specified by \code{nt} will be used as null sites; if +\code{method == "perm"} (DEPRECATED, see 'Details') then null sites will be +generated by uniform-randomly shuffling candidatepositions from \code{locus} +within the corresponding transcript region; default is \code{"ntAbund"}.} -\item{nt}{A single character; if \code{method == "ntAbund"}, -use \code{nt} to derive distribution of null sites.} +\item{nt}{A single character; if \code{method == "ntAbund"}, use \code{nt} +to derive distribution of null sites.} \item{showPb}{A logical scalar; if \code{TRUE} show a progress bar; default is \code{TRUE}.} @@ -33,49 +32,14 @@ A \code{txLoc} object. See 'Details'. \description{ Generate a list of null sites. See 'Details'. } -\details{ -The function generates a null distribution of single-nucleotide -sites across different transcript sections, and returns a -\code{txLoc} object. -Two different methods can be employed: -\enumerate{ -\item \code{method = "ntAbund"}: Null sites are generated based -on the position of all non-modified nucleotides of type \code{nt} -in those transcript sections that also contain a modified site of -the same type and as specified in \code{locus}. -For example, if \code{locus} contains a list of m$^{6}$A sites, -the list of null sites consists of all non-methylated adenosines -in those transcripts that contain at least one m$^{6}$A site. -\item \code{method = "perm"}: Null sites are generated by -permuting the position of sites from \code{locus} uniformly within -the corresponding transcript section. Note that this will generate -a list of null sites with the same abundance ratios across -transcript sections as the list of sites from \code{locus}. It is -therefore not useful for assessing an enrichment of sites within -a particular transcript section. -} -It is import to emphasise that any downstream enrichment analysis -may depend critically on the choice of the null distribution. -For example, a position permution-based null distribution may not -be a valid null distribution, if the nucleotide position -distribution is highly non-uniform across a transcript section. -This is the case e.g. for the spatial distribution of cytosines -within and across the 5'UTR, CDS and/or 3'UTR. In this case, a -better null distribution would be to consider all cytosines -in transcript sections that also contain a site in \code{locus}. -This can be achieved with \code{method = "ntAbund"}. A still -better list of null sites would be based on all \emph{expressed} -and non-modified cytosines based on the same sequencing data that -was used to identify modified cytosines. -} \examples{ \dontrun{ bedFile <- system.file("extdata", "miCLIP_m6A_Linder2015_hg38.bed", - package = "RNAModR"); -sites <- ReadBED(bedFile); -posSites <- SmartMap(sites, id = "m6A", refGenome = "hg38"); -negSites <- GenerateNull(posSites, method = "ntAbund", nt = "A"); + package = "RNAModR") +sites <- ReadBED(bedFile) +posSites <- SmartMap(sites, id = "m6A", refGenome = "hg38") +negSites <- GenerateNull(posSites, method = "ntAbund", nt = "A") } } diff --git a/man/GetGeneIds.Rd b/man/GetGeneIds.Rd index e484450..1460e0f 100644 --- a/man/GetGeneIds.Rd +++ b/man/GetGeneIds.Rd @@ -10,7 +10,7 @@ GetGeneIds(txdb) \item{txdb}{A \code{TxDb} object.} } \value{ -A \code{data.frame}. See 'Details'. +A \code{DataFrame}. See 'Details'. } \description{ Get different gene IDs for mapping between different gene @@ -19,24 +19,23 @@ This is a low-level function that is being called from \code{BuildTx}. } \details{ -The function extracts the genome assembly version from -the \code{TxDb} object, and loads the suitable genome wide -annotation package. For example, if \code{TxDb} is based -on \code{"hg38"}, the function loads \code{org.Hs.eg.db}. -Various IDs (Ensembl, Unigene, gene symbols) are extracted -from the annotation package by mapping RefSeq IDs from -\code{TxDb} via \code{mapIds} from the \code{AnnotationDbi} -package. Please note that column entries are not necessarily -unique: For example, two different Ensembl IDs might have the -same Entrez ID. The return object is a \code{data.frame} with -the following columns: +The function extracts the genome assembly version from the \code{TxDb} +object, and loads the suitable genome wide annotation package. For example, +if \code{TxDb} is based on \code{"hg38"}, the function loads +\code{org.Hs.eg.db}. +Various IDs (Ensembl, gene symbols) are extracted from the annotation +package by mapping RefSeq IDs from \code{TxDb} via \code{mapIds} from +the \code{AnnotationDbi} package. Please note that column entries are not +necessarily unique: For example, two different Ensembl IDs might have the +same Entrez ID. The return object is a \code{data.frame} with the following +columns: \itemize{ - \item REFSEQ: RefSeq Accession number; these entries are unique + \item tx_refseq: RefSeq Accession number; these entries are unique and refer to the mRNA transcript - \item ENTREZID: Entrez gene ID - \item SYMBOL: Gene symbol - \item ENSEMBL: Ensembl gene ID - \item GENENAME: Gene name + \item gene_entrez: Entrez gene ID + \item gene_symbol: Gene symbol + \item gene_ensembl: Ensembl gene ID + \item gene_name: Gene name } Note that mapping between gene IDs is a many-to-many mapping process. For example, different transcript RefSeq IDs can belong to the same diff --git a/man/GetLocus.MapFromTranscripts.Rd b/man/GetLocus.MapFromTranscripts.Rd index 3aecbb0..a4e4dab 100644 --- a/man/GetLocus.MapFromTranscripts.Rd +++ b/man/GetLocus.MapFromTranscripts.Rd @@ -3,7 +3,7 @@ \name{GetLocus.MapFromTranscripts} \alias{GetLocus.MapFromTranscripts} \title{Construct \code{txLoc}-suitable \code{dataframe} object from -mapping transcript loci to genomic loci.} +mapping transcript loci to genomic loci. (CURRENTLY BROKEN)} \usage{ GetLocus.MapFromTranscripts(gr, ref, seq, section, geneXID) } diff --git a/man/GetTxBySec.Rd b/man/GetTxBySec.Rd index fb8a7ac..be2109e 100644 --- a/man/GetTxBySec.Rd +++ b/man/GetTxBySec.Rd @@ -4,16 +4,15 @@ \alias{GetTxBySec} \title{Get transcript sections.} \usage{ -GetTxBySec(txdb, sections = c("Promoter", "5'UTR", "CDS", "3'UTR", - "Intron"), promUpstream = 1000, promDownstream = 0, - verbose = FALSE) +GetTxBySec(txdb, sections = c("Promoter", "5'UTR", "CDS", "3'UTR"), + promUpstream = 1000, promDownstream = 0, verbose = FALSE) } \arguments{ \item{txdb}{A \code{TxDb} object.} \item{sections}{A character vector; specifies which transcript sections should be extracted from \code{txdb}; default is -\code{c("Promoter", "5'UTR", "CDS", "3'UTR", "Intron")}.} +\code{c("Promoter", "5'UTR", "CDS", "3'UTR")}.} \item{promUpstream}{An integer; specifies the number of nucleotides to be included upstream of 5'UTR start as part of diff --git a/man/GetTxSeq.Rd b/man/GetTxSeq.Rd index 4c014ff..bfbda5a 100644 --- a/man/GetTxSeq.Rd +++ b/man/GetTxSeq.Rd @@ -4,16 +4,11 @@ \alias{GetTxSeq} \title{Get sequences for transcript segments.} \usage{ -GetTxSeq(txBySec, skipIntrons = TRUE) +GetTxSeq(txBySec) } \arguments{ \item{txBySec}{A \code{list} of \code{GRangesList} objects; output of function \code{GetTxBySec} or \code{CollapseTxBySec}.} - -\item{skipIntrons}{A logical scalar; if \code{TRUE} sequences -based on introns are not extracted; this is usually a good idea -as long intronic sequences can lead to large memory imprints -(and file sizes if objects are saved).} } \value{ A \code{list} of \code{DNAStringSet} objects. diff --git a/man/PlotSpatialDistribution.Rd b/man/PlotSpatialDistribution.Rd index dbcaed8..b15e3be 100644 --- a/man/PlotSpatialDistribution.Rd +++ b/man/PlotSpatialDistribution.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/plot.R \name{PlotSpatialDistribution} \alias{PlotSpatialDistribution} -\title{Plot spatial distribution of loci from \code{txLoc} object.} +\title{Plot spatial distribution of loci from a \code{txLoc} object.} \usage{ PlotSpatialDistribution(locus, filter = NULL, nbreaks = 100, absolute = FALSE, binWidth = NULL, posMax = 1000, @@ -31,21 +31,21 @@ sites within transcript region. Default is \code{TRUE}.} \item{...}{Additional parameters passed to plot.} } \description{ -Plot spatial distribution of loci from \code{txLoc} object within every -transcript section. +Plot spatial distribution of loci from a \code{txLoc} object within every +transcript region. } \examples{ \dontrun{ bedFile <- system.file("extdata", "miCLIP_m6A_Linder2015_hg38.bed", - package = "RNAModR"); -sites <- ReadBED(bedFile); -posSites <- SmartMap(sites, id = "m6A", refGenome = "hg38"); -PlotSpatialDistribution(posSites); + package = "RNAModR") +sites <- ReadBED(bedFile) +posSites <- SmartMap(sites, id = "m6A", refGenome = "hg38") +PlotSpatialDistribution(posSites) PlotSpatialDistribution(posSites, absolute = TRUE, filter = c("5'UTR", "CDS", "3'UTR"), - ylim = c(0, 200)); + ylim = c(0, 200)) } } diff --git a/man/SmartMap.Rd b/man/SmartMap.Rd index b3bcf83..175d3db 100644 --- a/man/SmartMap.Rd +++ b/man/SmartMap.Rd @@ -4,23 +4,22 @@ \alias{SmartMap} \title{Map genome coordinates to transcript coordinates.} \usage{ -SmartMap(locus, id = NULL, refGenome = "hg38", ignore.strand = FALSE) +SmartMap(gr, id = NULL, refGenome = "hg38", ignore.strand = FALSE) } \arguments{ -\item{locus}{A \code{GRanges} object; specifies the list of -of genomic features to be mapped.} +\item{gr}{A \code{GRanges} object; specifies the list of of genomic features +to be mapped.} -\item{id}{A character string; specifies a name for loci from -\code{locus}; if \code{NULL} then \code{id = ""}; default is -\code{NULL}.} +\item{id}{A character string; specifies a name for loci from \code{gr}; if +\code{NULL} then \code{id = ""}; default is \code{NULL}.} -\item{refGenome}{A character string; specifies a specific -reference genome assembly version based on which the matching -transcriptome is loaded; default is \code{"hg38"}.} +\item{refGenome}{A character string; specifies a specific reference genome +assembly version based on which the matching transcriptome is loaded; +default is \code{"hg38"}.} -\item{ignore.strand}{A logical scalar; if \code{TRUE} strand -information is ignored when mapping genome coordinates to -transcript coordinates; default is \code{FALSE}.} +\item{ignore.strand}{A logical scalar; if \code{TRUE} strand information is +ignored when mapping genome coordinates to transcript coordinates; default +is \code{FALSE}.} } \value{ A \code{txLoc} object. See 'Details'. @@ -29,21 +28,20 @@ A \code{txLoc} object. See 'Details'. Map genome coordinates to transcript coordinates. } \details{ -The function maps genomic coordinates from \code{locus} to -transcript section coordinates. The function automatically -loads a reference transcriptome based on \code{refGenome}. -An error is thrown if a reference transcriptome could not -be found. This usually means that \code{BuildTx} was not yet -run successfully. +The function maps genomic coordinates from \code{locus} to transcript +section coordinates. The function automatically loads a reference +transcriptome based on \code{refGenome}. An error is produced if a reference +transcriptome could not be found. This usually means that \code{BuildTx} was +not yet run successfully. The function returns a \code{txLoc} object of mapped positions. } \examples{ \dontrun{ bedFile <- system.file("extdata", "miCLIP_m6A_Linder2015_hg38.bed", - package = "RNAModR"); -sites <- ReadBED(bedFile); -txSites <- SmartMap(sites, id = "m6A", refGenome = "hg38"); + package = "RNAModR") +sites <- ReadBED(bedFile) +txSites <- SmartMap(sites, id = "m6A", refGenome = "hg38") } } \author{ diff --git a/man/SmartMap.ToTx.Rd b/man/SmartMap.ToTx.Rd index bb4a783..6fcb3b4 100644 --- a/man/SmartMap.ToTx.Rd +++ b/man/SmartMap.ToTx.Rd @@ -4,49 +4,58 @@ \alias{SmartMap.ToTx} \title{Map genome coordinates to transcript coordinates.} \usage{ -SmartMap.ToTx(locus, txBySec, seqBySec, geneXID, ignore.strand = FALSE, - noFactors = TRUE, showPb = FALSE) +SmartMap.ToTx(gr, txBySec, seqBySec, geneXID, ignore.strand = FALSE, + showPb = FALSE) } \arguments{ -\item{locus}{A \code{GRanges} object; specifies the list of -genomic features to be mapped.} +\item{gr}{A \code{GRanges} object; specifies the list of genomic features to +be mapped.} -\item{txBySec}{A \code{list} of \code{GRangesList} objects; -specifies the reference transcriptome; the object is usually -a result of running \code{BuildTx}.} +\item{txBySec}{A \code{list} of \code{GRangesList} objects; specifies the +reference transcriptome; the object is usually a result of running +\code{BuildTx}.} -\item{seqBySec}{A \code{list} of \code{DNAStringSet} objects; -sequences of (some or all) segments from \code{txBySec}; -the object is usually a result of running \code{BuildTx}.} +\item{seqBySec}{A \code{list} of \code{DNAStringSet} objects; sequences of +(some or all) segments from \code{txBySec}; the object is usually a result +of running \code{BuildTx}.} -\item{geneXID}{A \code{data.frame}; specifies different gene -IDs; the object is usually a result of running \code{BuildTx}.} +\item{geneXID}{A \code{DataFrame}; specifies different gene IDs; the object +is usually a result of running \code{BuildTx}.} -\item{ignore.strand}{A logical scalar; if \code{TRUE} strand -information is ignored when mapping genome coordinates to -transcript coordinates; default is \code{FALSE}.} - -\item{noFactors}{A logical scalar; if \code{TRUE}, the return -object contains no \code{factors}; default is \code{TRUE}.} +\item{ignore.strand}{A logical scalar; if \code{TRUE} strand information is +ignored when mapping genome coordinates to transcript coordinates; default +is \code{FALSE}.} \item{showPb}{A logical scalar; if \code{TRUE} show a progress bar; default is \code{FALSE}.} } \value{ -A \code{list} of \code{data.frame} objects. See -'Details'. +A \code{list} of \code{data.frame} objects. See 'Details'. } \description{ -Map genome coordinates to transcript coordinates. See -'Details'. -This is a low-level function that is being called from -\code{SmartMap}. +Map genome coordinates to transcript coordinates. See 'Details'. +This is a low-level function that is being called from \code{SmartMap}. +There is no guarantee that this function will get exported in future +releases of RNAModR. Use at your own risk. } \details{ -The function maps genomic coordinates from \code{locus} to -transcript section coordinates from \code{txBySec}. The -function returns a \code{list} of \code{data.frame} objects -with additional gene ID and sequence information. +The function maps genomic coordinates from \code{gr} to transcript region +coordinates from \code{txBySec}. The function returns a \code{list} of +\code{DataFrame} objects, each with the following columns: +\enumerate{ + \item \code{locus_in_txx_region}: A \code{GRanges} object + \item \code{locus_in_genome}: A \code{GRanges} object + \item \code{score}: A \code{numeric} vector + \item \code{id}: A \code{character} vector + \item \code{tx_region}: A \code{character} vector + \item \code{tx_region_width}: An \code{integer} vector + \item \code{tx_region_sequence}: A \code{DNAStringSet} object + \item \code{tx_refseq}: A \code{character} vector + \item \code{gene_entrez}: A \code{character} vector + \item \code{gene_symbol}: A \code{character} vector + \item \code{gene_ensembl}: A \code{character} vector + \item \code{gene_name}: A \code{character} vector +} } \author{ Maurits Evers, \email{maurits.evers@anu.edu.au} diff --git a/man/WriteBED.Rd b/man/WriteBED.Rd new file mode 100644 index 0000000..306ea38 --- /dev/null +++ b/man/WriteBED.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/IO.R +\name{WriteBED} +\alias{WriteBED} +\title{Write \code{txLoc} object to a BED file.} +\usage{ +WriteBED(txLoc, file = NULL) +} +\arguments{ +\item{txLoc}{A \code{txLoc} object.} + +\item{file}{A character string; specifies the filename of the output +BED file. If \code{NULL}, then \code{file = "sites.bed"}; default is +\code{NULL}.} +} +\description{ +Write \code{txLoc} object to a BED file. See 'Details'. +} +\details{ +The function writes entries from a \code{txLoc} object to a 6-column BED +file (BED6). Note that this process is not "splice-aware", i.e. if an entry +spans an intron the BED entry gives the left and right-most genomic +coordinate of the feature. If \code{file = NULL}, entries will be written to +\code{sites.bed}. +} +\examples{ +\dontrun{ +bedFile <- system.file("extdata", + "miCLIP_m6A_Linder2015_hg38.bed", + package = "RNAModR") +sites <- ReadBED(bedFile) +txSites <- SmartMap(sites, id = "m6A", refGenome = "hg38") +WriteBED(txSites) +} + +} +\author{ +Maurits Evers, \email{maurits.evers@anu.edu.au} +} diff --git a/man/WriteTxLocToCSV.Rd b/man/WriteCSV.Rd similarity index 64% rename from man/WriteTxLocToCSV.Rd rename to man/WriteCSV.Rd index 7ab42c2..8c28a03 100644 --- a/man/WriteTxLocToCSV.Rd +++ b/man/WriteCSV.Rd @@ -1,24 +1,23 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/IO.R -\name{WriteTxLocToCSV} -\alias{WriteTxLocToCSV} +\name{WriteCSV} +\alias{WriteCSV} \title{Write txLoc object to CSV file.} \usage{ -WriteTxLocToCSV(locus, file = NULL, withSeq = FALSE, withGC = FALSE) +WriteCSV(txLoc, file = NULL, withSeq = FALSE, withGC = FALSE) } \arguments{ -\item{locus}{A \code{txLoc} object.} +\item{txLoc}{A \code{txLoc} object.} \item{file}{Filename of output CSV file. If NULL then file = "sites.csv".} \item{withSeq}{If TRUE then include full sequence. Default is FALSE.} -\item{withGC}{If TRUE then include GC content. Default is FALSE.} +\item{withGC}{If TRUE then include GC content. Default is FALSE. Unused.} } \description{ -Write a txLoc object to a/multiple comma-separated-values file(s). +Write a \code{txLoc} object to a comma-separated-values file. } \author{ Maurits Evers, \email{maurits.evers@anu.edu.au} } -\keyword{internal} diff --git a/man/WriteFeatToBED.Rd b/man/WriteFeatToBED.Rd deleted file mode 100644 index f0a65e9..0000000 --- a/man/WriteFeatToBED.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/IO.R -\name{WriteFeatToBED} -\alias{WriteFeatToBED} -\title{Write a \code{list} of \code{GRangesList} to a BED file.} -\usage{ -WriteFeatToBED(txFeatures) -} -\arguments{ -\item{txFeatures}{A \code{list} of \code{GRangesList} objects.} -} -\description{ -Write a \code{list} of \code{GRangesList} to a BED file. -One file per transcript region is generated. -} -\author{ -Maurits Evers, \email{maurits.evers@anu.edu.au} -} -\keyword{internal} diff --git a/man/WriteTxLocToBED.Rd b/man/WriteTxLocToBED.Rd deleted file mode 100644 index 0b1550e..0000000 --- a/man/WriteTxLocToBED.Rd +++ /dev/null @@ -1,43 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/IO.R -\name{WriteTxLocToBED} -\alias{WriteTxLocToBED} -\title{Write \code{txLoc} object to a BED file.} -\usage{ -WriteTxLocToBED(locus, file = NULL, noChrName = FALSE) -} -\arguments{ -\item{locus}{A \code{txLoc} object.} - -\item{file}{A character string; specifies the filename of the output -BED file. If \code{NULL}, then \code{file = "sites.bed"}; default is -\code{NULL}.} - -\item{noChrName}{A logical scalar; if \code{TRUE}, chromosome names -will be written without "chr"; default is \code{FALSE}.} -} -\description{ -Write \code{txLoc} object to a BED file. See 'Details'. -} -\details{ -The function writes entries from a \code{txLoc} object to a 6-column -BED file (BED6). Note that this process is not "splice-aware", i.e. -if an entry spans an intron the BED entry gives the left and right-most -genomic coordinate of the feature. If \code{file = NULL}, entries will -be written to \code{sites.bed}. If \code{noChrName = TRUE}, chromosome -names in column 1 of the BED file will be written without "chr". -} -\examples{ -\dontrun{ -bedFile <- system.file("extdata", - "miCLIP_m6A_Linder2015_hg38.bed", - package = "RNAModR"); -sites <- ReadBED(bedFile); -txSites <- SmartMap(sites, id = "m6A", refGenome = "hg38"); -WriteTxLocToBED(txSites); -} - -} -\author{ -Maurits Evers, \email{maurits.evers@anu.edu.au} -}