-
Notifications
You must be signed in to change notification settings - Fork 0
/
updateDesignData.R
151 lines (140 loc) · 7.26 KB
/
updateDesignData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# =============================================================================.
# This script updates micro array design information for new genome releases.
# It relies on the bowtie aligner to map probe sequences on a chosen assembly of
# the genome for a precise update of genomic coordinates addressed by
# the microarray platform.
# =============================================================================.
# LIBRARIES ####################################################################
message("Loading R/Bioconductor packages")
suppressPackageStartupMessages(library("MRA.TA"))
# SCRIPT PARAMETERS ############################################################
# =============================================================================.
script.args = matrix(c(
# Column 1: long option name
# Column 2: short option name
# Column 3: 0=no argument, 1=required argument, 2=optional argument
# Column 4: data type (logical, integer, double, complex, character)
# Column 5: a brief description of the purpose of the option
# Column 6: default value
'genome.id', 'i', 1, 'character', "identifier of the genome assembly", "",
'genome.index.path', 'p', 1, 'character', "path to genome index for bowtie", "",
'original.design.path', 'o', 1, 'character', "path to the original array design file", "",
'updated.design.path', 'u', 2, 'character', "path to folder for storage of updated design data, default = Processed_Data/Updated/", "Processed_Data/Updated/",
'bowtie.mismatches', 'n', 2, 'integer', "number of allowed mismatches (bowtie mapping), default = 0", 0,
'bowtie.threads', 't', 2, 'integer', "number of CPU threads (bowtie mapping), default = 6", 6
), byrow=TRUE, ncol=6)
Cfg <- processArgs(script.args)
Cfg$script.args <- NULL
# -----------------------------------------------------------------------------.
# DEBUG VALUES # *** WARNING/TODO *** #
if(is.na(Cfg$script.name)) {
Cfg$genome.id <- "dm6"
Cfg$genome.index.path <- "Genome_Data/UCSC_dm6/Indexes/bowtie/genome"
Cfg$original.design.path <- "Raw_Data/GSE23887_RAW/GPL10867.ndf.gz"
Cfg$updated.design.path <- "Processed_Data/Updated/"
Cfg$bowtie.mismatches <- 0
Cfg$bowtie.threads <- 6
}
# =============================================================================.
# Parameters validation
lbl.lst <- c("genome.id", "genome.index.path", "original.design.path")
for(lbl in lbl.lst) {
if(Cfg[[lbl]]=="") {
msg <- paste("Missing", lbl, "get help using: Rscript", Cfg$script.name, "-h")
stop(msg, "\n\n")
}
}
# -----------------------------------------------------------------------------.
attach(Cfg)
# -----------------------------------------------------------------------------.
# Verify that bowtie index path is correct
verifyInputFiles(paste(genome.index.path, ".1.ebwt", sep=""))
# -----------------------------------------------------------------------------.
# Verify that array design file exists and has required columns
array.design.format <- file.formats$nimblegen.ndf
verifyInputFiles(original.design.path, array.design.format$colnames)
# =============================================================================.
# Parameter updates
cmd <- paste('mkdir -p', updated.design.path)
system(cmd, intern=F)
design.id <- gsub(
"[.](ndf|txt)([.]gz)?$", "", basename(original.design.path), perl=T, ignore.case=T
)
probes.file.path <- paste(updated.design.path, "/", design.id, "_probe_sequences.fa", sep="")
mapped.probes.path <- paste(updated.design.path, "/", design.id, "_", genome.id, "_aligned_probes.bowtie", sep="")
mapped.probes.format <- file.formats$mapped.probe.sequences
updated.design.path <- paste(updated.design.path, "/", design.id, "_", genome.id, ".txt", sep="")
# PROCESSING ###################################################################
# -----------------------------------------------------------------------------.
# Load microarray design
design <- readData(original.design.path, array.design.format)
# Remove array layout and control probes
discard <- design$CONTAINER %in% c("V_CODE", "H_CODE", "NGS_CONTROLS", "RANDOM")
design <- design[! discard,]
# -----------------------------------------------------------------------------.
# Create a FASTA file with microarray probe sequences
i.probes <- 1:nrow(design)
n.probes <- length(i.probes)
lines <- rep("", 2*n.probes)
lines[2*(1:n.probes) - 1] <- paste(">", i.probes,"\t",design$PROBE_ID[i.probes], sep="")
lines[2*(1:n.probes)] <- design$PROBE_SEQUENCE[i.probes]
probes.file <- file(probes.file.path, "w")
writeLines(lines, con = probes.file)
close(probes.file)
# -----------------------------------------------------------------------------.
# Realign probe sequences to latest genome release using bowtie
cmd <- paste(
"bowtie -f -n", bowtie.mismatches, "-m 1 --strata --best -p", bowtie.threads,
genome.index.path, probes.file.path, mapped.probes.path,
sep=" "
)
txt <- system(cmd, intern=T)
# =============================================================================.
# Update microarray design with realigned probes
# -----------------------------------------------------------------------------.
# Main columns of the output from bowtie:
# 1: probe index
# 2: PROBE_ID
# 3: strand
# 4: chromosome/seq_id
# 5: position
# 6: PROBE_SEQUENCE
# -----------------------------------------------------------------------------.
probes <- readData(mapped.probes.path, mapped.probes.format)
# Check consistency
i <- probes[,1]
if(sum(design$PROBE_ID[i]!=probes[,2])!=0) stop("bad probe alignement file")
# -----------------------------------------------------------------------------.
l <- nchar(design$PROBE_SEQUENCE[i])
chr <- gsub("(^[ \t\n]+)|([ \t\n]+$)", "", (as.character(probes[,4])))
updated.design <- cbind(
as.character(design$PROBE_ID[i]),
as.character(design$PROBE_CLASS[i]),
as.character(design$CONTAINER[i]),
as.integer(design$X[i]),
as.integer(design$Y[i]),
as.character(design$SEQ_ID[i]), # chromosome (seqname) from original design
chr, # chromosome (seqname) from bowtie alignment
as.character(probes[,3]), # strand
as.integer(probes[,5] + 1), # start
as.integer(probes[,5] + l - 1), # end
as.integer(l) # length
)
colnames(updated.design) <- file.formats$nimblegen.updated$table$columns
# -----------------------------------------------------------------------------.
write.table(format(updated.design, scientific=FALSE, digits=11, trim=TRUE, justify='none'), file=updated.design.path , sep="\t", row.names=FALSE, quote=FALSE)
# =============================================================================.
# Cleanup alignment files and compress the updated design files
# rm Processed_Data/Updated/*_probe_sequences.fa
cmd <- paste("rm -f", probes.file.path)
system(cmd, intern=F)
# -----------------------------------------------------------------------------.
# rm Processed_Data/Updated/*_aligned_probes.bowtie
cmd <- paste("rm -f", mapped.probes.path)
system(cmd, intern=F)
# -----------------------------------------------------------------------------.
# gzip Processed_Data/Updated/*.txt
cmd <- paste("gzip", updated.design.path)
system(cmd, intern=F)
# =============================================================================.
detach(Cfg)