-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnormalise_counts.R
executable file
·108 lines (87 loc) · 3.04 KB
/
normalise_counts.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env Rscript
library(optparse)
library(edgeR)
library(readr)
library(tibble)
library(DESeq2)
option_list <- list(
make_option(c("-t", "--log_transform"),
type = "character", default = NULL,
help = "log transform the counts? default = FALSE",
metavar = "character"
),
make_option(c("-o", "--outdir"),
type = "character", default = NULL,
help = "output directory for results", metavar = "character"
)
)
opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)
log <- if (opt$log_transform == "TRUE") TRUE else FALSE
outdir <- opt$outdir
## Read data
# counts_tab <- read.csv(
# "gene_counts_pc.tsv",
# header = TRUE, na.strings = c("", "NA"), sep = "\t",
# stringsAsFactors = FALSE
# )
# ref_gene_tab <- read.csv(
# "ref_gene_df.tsv",
# header = TRUE, na.strings = c("", "NA"), sep = "\t",
# stringsAsFactors = FALSE
# )
counts_tab <- read_tsv("gene_counts_pc.tsv")
ref_gene_tab <- read_tsv("ref_gene_df.tsv")
gene_names <- counts_tab[["feature_id"]]
counts_tab[["feature_id"]] <- NULL
# counts_tab <- as.data.frame(sapply(counts_tab, as.numeric))
counts_tab <- as.data.frame(counts_tab)
rownames(counts_tab) <- gene_names
# ## remove rRNA genes
# ref_tab_sub <- ref_gene_tab[!ref_gene_tab$biotype == "rRNA", ]
# non_rRNA_counts <- counts_tab[rownames(counts_tab) %in% ref_tab_sub$locus_tag, ]
ref_tab_sub <- ref_gene_tab
non_rRNA_counts <- counts_tab
## ensure that ref gene annotations order matches counts table
ref_tab_sub <- ref_tab_sub[match(
rownames(non_rRNA_counts), ref_tab_sub$locus_tag
), ]
## DESeq2 scaled counts
colData <- data.frame(sample_name = colnames(non_rRNA_counts))
dds <- DESeqDataSetFromMatrix(
countData = round(non_rRNA_counts),
colData = colData, design = ~1
)
dds <- estimateSizeFactors(dds)
deseq_norm <- counts(dds, normalized = TRUE)
deseq_df <- as_tibble(as.data.frame(log2(deseq_norm + 1)))
deseq_df$feature_id <- rownames(deseq_df)
deseq_df <- deseq_df[c("feature_id", colData$sample_name)]
write.table(
deseq_df, file.path(outdir, "deseq_counts.tsv"),
col.names = TRUE,
row.names = FALSE, sep = "\t", quote = FALSE
)
## counts per million (cpm) - normalised for lib size but not gene length
y <- DGEList(counts = non_rRNA_counts)
y <- calcNormFactors(y, method = "TMM")
cpm_df <- as.data.frame(edgeR::cpm(y, log = log))
cpm_df <- tibble::rownames_to_column(as.data.frame(cpm_df), "feature_id")
write.table(
cpm_df, file.path(outdir, "cpm_counts.tsv"),
col.names = TRUE,
row.names = FALSE, sep = "\t", quote = FALSE
)
## reads per kilobase per million (rpkm) - normalised for lib size + gene length
y <- DGEList(
counts = non_rRNA_counts,
genes = data.frame(gene.length = as.numeric(ref_tab_sub$gene_length))
)
y <- calcNormFactors(y)
rpkm_df <- as.data.frame(edgeR::rpkm(y, log = log))
rpkm_df <- tibble::rownames_to_column(as.data.frame(rpkm_df), "feature_id")
write.table(
rpkm_df, file.path(outdir, "rpkm_counts.tsv"),
col.names = TRUE,
row.names = FALSE, sep = "\t", quote = FALSE
)