riboTransVis/man/get_transcript_sequence.Rd at master · junjunlab/riboTransVis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/implementation.R
\name{get_transcript_sequence}
\alias{get_transcript_sequence}
\title{Extract Transcript Sequences from Genome and GTF Annotation}
\usage{
get_transcript_sequence(
  genome_file = NULL,
  gtf_file = NULL,
  feature = "exon",
  extend = FALSE,
  extend_upstream = 0,
  extend_downstream = 0,
  return_extend_region = FALSE,
  output_file = NULL
)
}
\arguments{
\item{genome_file}{Either a \code{BSgenome} object or the path to an uncompressed genome FASTA file. If the FASTA file has no accompanying \code{.fai} index, one will be generated.}

\item{gtf_file}{Path to a GTF annotation file containing gene models, or GTF data already imported as a \code{GRanges} object (required).}

\item{feature}{Feature type to extract sequences for, e.g. \code{"exon"} or \code{"CDS"} (Default: \code{"exon"}).}

\item{extend}{Logical; whether to extend the first and last exons per transcript (Default: \code{FALSE}).}

\item{extend_upstream}{Integer; number of bases to extend upstream of the first exon (Default: \code{0}).}

\item{extend_downstream}{Integer; number of bases to extend downstream of the last exon (Default: \code{0}).}

\item{return_extend_region}{Logical; if \code{TRUE}, returns the genomic coordinates (as a \code{data.frame}) of the extracted (possibly extended) regions instead of sequences (Default: \code{FALSE}).}

\item{output_file}{Output file path to write the transcript sequences in FASTA format. Required if \code{return_extend_region = FALSE}.}
}
\value{
If \code{return_extend_region = TRUE}, returns a \code{data.frame} containing the extended coordinates of the selected feature (e.g., exon). Otherwise, writes transcript sequences to \code{output_file} and returns nothing.
}
\description{
This function extracts transcript (or other genomic feature) sequences based on genome FASTA and GTF annotations.
It also supports optional region extension upstream/downstream, and can output extended coordinates or the final transcript FASTA.
}
\details{
\itemize{
\item The function supports both UCSC-style and Ensembl-style chromosome names, and will automatically adjust "chr" prefix if needed.
\item Transcript sequences are reconstructed by collapsing all exons (or chosen \code{feature}) for each transcript.
\item For \code{feature = "CDS"}, stop codons (+3 bp) can optionally be included at the 3' end (or the 5' end for the minus strand).
\item Requires the following Bioconductor packages: \code{rtracklayer}, \code{Rsamtools}, \code{Biostrings}, \code{txdbmaker}, \code{GenomicFeatures}, and \code{AnnotationDbi}.
}
}
\examples{
\dontrun{
get_transcript_sequence(
  genome_file = "Homo_sapiens.GRCh38.dna.primary_assembly.fa",
  gtf_file = "Homo_sapiens.GRCh38.110.gtf",
  feature = "exon",
  extend = TRUE,
  extend_upstream = 50,
  extend_downstream = 100,
  return_extend_region = FALSE,
  output_file = "transcripts.fa"
)
}

}