-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdemultiplex_prism.sh
executable file
·565 lines (497 loc) · 20.9 KB
/
demultiplex_prism.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
#!/bin/bash
#
# this prism supports demultiplex of a set of sequence files , all of which contain
# multiplexed GBS sequencing data as specified by a sample information file.
#
#
# references :
#
# https://biohpc.cornell.edu/lab/doc/TasselPipelineGBS20120215.pdf
declare -a files_array
function get_opts() {
DRY_RUN=no
DEBUG=no
HPC_TYPE=slurm
FILES=""
OUT_DIR=""
ENZYME_INFO=""
FORCE=no
SPLIT=no
PARAMETERS_FILE=""
help_text=$(cat << 'EOF'
usage :
./demultiplex_prism.sh [-h] [-n] [-d] [-x gbsx|tassel3_qc|tassel3] [-l sample_info ] [-e enzymeinfo] -O outdir input_file_name(s)
example:
Notes:
* only use this script to process more than one fastq file, where all files relate to the sample info file
(e.g. keyfile) you supply (i.e. all files relate to the same library)
* for GBSX, enzyme_info is an optional filename, of a file containing cut-sites for non-default enzymes). For
tassel3 it is mandatory and is the name of the enzyme to use
EOF
)
while getopts ":nhSfO:C:x:l:e:p:" opt; do
case $opt in
n)
DRY_RUN=yes
;;
d)
DEBUG=yes
;;
f)
FORCE=yes
;;
h)
echo -e $help_text
exit 0
;;
O)
OUT_DIR=$OPTARG
;;
C)
HPC_TYPE=$OPTARG
;;
x)
ENGINE=$OPTARG
;;
e)
ENZYME_INFO=$OPTARG
;;
l)
SAMPLE_INFO=$OPTARG
;;
p)
PARAMETERS_FILE=$OPTARG
;;
S)
SPLIT=yes
;;
\?)
echo "Invalid option: -$OPTARG" >&2
exit 1
;;
:)
echo "Option -$OPTARG requires an argument." >&2
exit 1
;;
esac
done
shift $((OPTIND-1))
FILE_STRING=$@
# this is needed because of the way we process args a "$@" - which
# is needed in order to parse parameter sets to be passed to the
# aligner (which are space-separated)
declare -a files="(${FILE_STRING})";
NUM_FILES=${#files[*]}
for ((i=0;$i<$NUM_FILES;i=$i+1)) do
files_array[$i]=${files[$i]}
done
}
function check_opts() {
if [ -z "$SEQ_PRISMS_BIN" ]; then
echo "please set SEQ_PRISMS_BIN environment variable"
exit 1
fi
if [ ! -d $OUT_DIR ]; then
echo "OUT_DIR $OUT_DIR not found"
exit 1
fi
if [[ $HPC_TYPE != "local" && $HPC_TYPE != "slurm" ]]; then
echo "HPC_TYPE must be one of local, slurm"
exit 1
fi
if [ ! -f $SAMPLE_INFO ]; then
echo "could not find $SAMPLE_INFO"
exit 1
fi
if [[ ( $ENGINE != "gbsx" ) && ( $ENGINE != "tassel3_qc" ) && ( $ENGINE != "tassel3" ) ]] ; then
echo "demultiplexer engines supported : tassel3, gbsx , tassel3_qc (not $ENGINE ) "
exit 1
fi
if [ ! -z $ENZYME_INFO ]; then
if [ $ENGINE == "gbsx" ]; then
if [ ! -f $ENZYME_INFO ]; then
echo "could not find $ENZYME_INFO"
exit 1
fi
else
# map enzymes such as HpaIII and HpaII (methylation sensitive) to equivalent
# (same recognition-site) enzymes that uneak knows about
enzyme_for_uneak=$ENZYME_INFO
echo $enzyme_for_uneak | grep -iq "hpa" > /dev/null 2>&1
if [ $? == 0 ]; then
echo "attempting to map enzyme $ENZYME_INFO to an equivalent for uneak..."
enzyme_for_uneak=`echo $ENZYME_INFO | sed -r 's/HpaII|HpaIII/MspI/g' -`
if [ $ENZYME_INFO != $enzyme_for_uneak ]; then
echo "$ENZYME_INFO mapped to $enzyme_for_uneak for uneak"
else
echo "warning, no mapping done so will use $ENZYME_INFO"
fi
fi
fi
elif [[ ( $ENGINE == "tassel3" ) || ( $ENGINE == "tassel3_qc" ) ]]; then
echo "must specify enzyme , for tassel3 (should match enzyme specified in keyfile)"
exit 1
fi
if [ ! -z "$PARAMETERS_FILE" ]; then
if [ ! -f "$PARAMETERS_FILE" ]; then
echo "could not find $PARAMETERS_FILE"
exit 1
fi
fi
}
function echo_opts() {
echo OUT_DIR=$OUT_DIR
echo DRY_RUN=$DRY_RUN
echo DEBUG=$DEBUG
echo HPC_TYPE=$HPC_TYPE
echo FILES=${files_array[*]}
echo ENGINE=$ENGINE
echo SAMPLE_INFO=$SAMPLE_INFO
echo ENZYME_INFO=$ENZYME_INFO
echo PARAMETERS_FILE=$PARAMETERS_FILE
}
#
# edit this method to set required environment (or set up
# before running this script)
#
function configure_env() {
# copy scripts we need to outfolder
cd $GBS_PRISM_BIN
cp demultiplex_prism.sh $OUT_DIR
cp demultiplex_prism.mk $OUT_DIR
cp get_reads_tags_per_sample.py $OUT_DIR
cp $SAMPLE_INFO $OUT_DIR
if [ -f $ENZYME_INFO ]; then
cp $ENZYME_INFO $OUT_DIR
fi
cp $GBS_PRISM_BIN/etc/larger_mem_slurm_array_job $OUT_DIR
# set up the environment includes that we will need - these activate
# environments
echo "
max_tasks=50
jobtemplatefile = \"$OUT_DIR/larger_mem_slurm_array_job\"
" > $OUT_DIR/tardis_demultiplex.toml
if [ -f $OUT_DIR/tardis.toml ]; then
cp $OUT_DIR/tardis.toml $OUT_DIR/tardis.toml.orig
fi
cp $OUT_DIR/tardis_demultiplex.toml $OUT_DIR/tardis.toml
cp $GBS_PRISM_BIN/etc/larger_mem_slurm_array_job $OUT_DIR
echo "
conda activate tassel3
" > $OUT_DIR/tassel3_env.src
cd $OUT_DIR
}
function check_env() {
if [ -z "$SEQ_PRISMS_BIN" ]; then
echo "SEQ_PRISMS_BIN not set - exiting"
exit 1
fi
if [ -z "$GBS_PRISM_BIN" ]; then
echo "GBS_PRISM_BIN not set - exiting"
exit 1
fi
}
function get_uneak_plugin_parameters() {
if [ -z "$PARAMETERS_FILE" ]; then
uneak_plugin_parameters=""
else
plugin_name=$1
uneak_plugin_parameters=`grep $plugin_name $PARAMETERS_FILE | sed "s/$plugin_name//g" -`
fi
}
function get_targets() {
# make a target moniker for each input file and write associated
# ENGINE wrapper, which will be called by make
rm -f $OUT_DIR/demultiplex_targets.txt
for ((j=0;$j<$NUM_FILES;j=$j+1)) do
file=${files_array[$j]}
file_base=`basename $file`
parameters_moniker=`basename $SAMPLE_INFO`
if [ ! -z $ENZYME_INFO ]; then
parameters_moniker="${parameters_moniker}.$ENZYME_INFO"
fi
if [ $ENGINE == "gbsx" ]; then
demultiplex_moniker=${file_base}.${parameters_moniker}.${ENGINE}
echo $OUT_DIR/${demultiplex_moniker}.demultiplex_prism >> $OUT_DIR/demultiplex_targets.txt
script=$OUT_DIR/${demultiplex_moniker}.demultiplex_prism.sh
elif [[ ( $ENGINE == "tassel3" ) || ( $ENGINE == "tassel3_qc" ) ]]; then
demultiplex_moniker=${parameters_moniker}.${ENGINE}
echo $OUT_DIR/${demultiplex_moniker}.demultiplex_prism > $OUT_DIR/demultiplex_targets.txt # one line only
script=$OUT_DIR/${demultiplex_moniker}.demultiplex_prism.sh
fi
if [ -f script ]; then
if [ ! $FORCE == yes ]; then
echo "found existing demultiplex script $script - will re-use (use -f to force rebuild of scripts) "
continue
fi
fi
base=`basename $file`
sample_info_base=`basename $SAMPLE_INFO`
if [ $ENGINE == "tassel3" ]; then
# we only generate a single target , even if there are multiple files. The setup
# of the target involves configuring output sub-folders
# structure , so set this up
if [ ! -d ${OUT_DIR}/key ] ; then
mkdir -p ${OUT_DIR}/key
cp -fs $OUT_DIR/$sample_info_base ${OUT_DIR}/key
fi
if [ ! -d ${OUT_DIR}/Illumina ]; then
mkdir -p ${OUT_DIR}/Illumina
fi
cp -s $file ${OUT_DIR}/Illumina
# get plugin parameters (if applicable)
get_uneak_plugin_parameters FastqToTagCount ; p_FastqToTagCount=$uneak_plugin_parameters
get_uneak_plugin_parameters MergeTaxaTagCount ; p_MergeTaxaTagCount=$uneak_plugin_parameters
get_uneak_plugin_parameters TagCountToTagPair ; p_TagCountToTagPair=$uneak_plugin_parameters
get_uneak_plugin_parameters TagPairToTBT ; p_TagPairToTBT=$uneak_plugin_parameters
get_uneak_plugin_parameters TBTToMapInfo ; p_TBTToMapInfo=$uneak_plugin_parameters
get_uneak_plugin_parameters MapInfoToHapMap ; p_MapInfoToHapMap=$uneak_plugin_parameters
#
# note that the tassel3 modules do not generally exit with a non-zero error code
# if something goes wrong , so the code below that looks at the exit code does not work
# - even if an early module fails , all modules are attempted , and the
# error status is only picked up downstream when something tries to use the
# demultiplex result. This needs to be improved - e.g. sniff the stdout / stderr files
# from these modules to figure out if the job completed OK - then exit with the
# appropiate code
#
echo "#!/bin/bash
cd $OUT_DIR
if [ ! -f tagCounts.done ]; then
mkdir tagCounts
tardis --hpctype $HPC_TYPE -k -d $OUT_DIR --shell-include-file $OUT_DIR/tassel3_env.src run_pipeline.pl -Xms512m -Xmx5g -fork1 -UFastqToTagCountPlugin -p $p_FastqToTagCount -w ./ -c 1 -e $enzyme_for_uneak -s 900000000 -endPlugin -runfork1 \> $OUT_DIR/${demultiplex_moniker}.FastqToTagCount.stdout 2\>$OUT_DIR/${demultiplex_moniker}.FastqToTagCount.stderr
fi
if [ \$? != 0 ]; then
echo \"demultplex_prism.sh: error code returned from FastqToTagCount process - check log files. check log files. quitting\"; exit 1
else
date > tagCounts.done
fi
" > $script
chmod +x $script
elif [ $ENGINE == "tassel3_qc" ]; then
# as above - but this one also runs downstream processsing as a convenience
if [ ! -d ${OUT_DIR}/key ] ; then
mkdir -p ${OUT_DIR}/key
cp -fs $OUT_DIR/$sample_info_base ${OUT_DIR}/key
fi
if [ ! -d ${OUT_DIR}/Illumina ]; then
mkdir -p ${OUT_DIR}/Illumina
fi
cp -s $file ${OUT_DIR}/Illumina
# get plugin parameters (if applicable)
get_uneak_plugin_parameters FastqToTagCount ; p_FastqToTagCount=$uneak_plugin_parameters
get_uneak_plugin_parameters MergeTaxaTagCount ; p_MergeTaxaTagCount=$uneak_plugin_parameters
get_uneak_plugin_parameters TagCountToTagPair ; p_TagCountToTagPair=$uneak_plugin_parameters
get_uneak_plugin_parameters TagPairToTBT ; p_TagPairToTBT=$uneak_plugin_parameters
get_uneak_plugin_parameters TBTToMapInfo ; p_TBTToMapInfo=$uneak_plugin_parameters
get_uneak_plugin_parameters MapInfoToHapMap ; p_MapInfoToHapMap=$uneak_plugin_parameters
echo "#!/bin/bash
cd $OUT_DIR
if [ ! -f tagCounts.done ]; then
mkdir tagCounts
#check whether the keyfile contains multiple flowcell-library-fastqfile combinations, if so need to demultiplex each combination separately
# the ramify_tassel_keyfile.py script will set up a tassel demultiplexing environment in subfolders of tagCounts_parts. These will be called e.g.
# tagCounts_parts/part_<digest>
# where <digest> is a hash of flowcell-library-fastqfile
# so the structure will be
# tagCounts_parts/part<digest>/tagCounts
# /key
# /Illumina
mkdir tagCounts_parts
$GBS_PRISM_BIN/ramify_tassel_keyfile.py -t ramify -o ${OUT_DIR}/tagCounts_parts --sub_tassel_prefix part ${OUT_DIR}/key/$sample_info_base
if [ \$? != 0 ]; then
echo \"ramify_tassel_keyfile.py: error code returned - check log files. quitting\"; exit 1
exit $1
fi
number_of_parts=\`ls ${OUT_DIR}/tagCounts_parts | wc -l\`
if [ \$number_of_parts == 1 ]; then
rm -rf tagCounts_parts
tardis --hpctype $HPC_TYPE -k -d $OUT_DIR --shell-include-file $OUT_DIR/tassel3_env.src run_pipeline.pl -Xms512m -Xmx5g -fork1 -UFastqToTagCountPlugin $p_FastqToTagCount -w ./ -c 1 -e $enzyme_for_uneak -s 900000000 -endPlugin -runfork1 \> $OUT_DIR/${demultiplex_moniker}.FastqToTagCount.stdout 2\>$OUT_DIR/${demultiplex_moniker}.FastqToTagCount.stderr
cat $OUT_DIR/${demultiplex_moniker}.FastqToTagCount.stdout | $OUT_DIR/get_reads_tags_per_sample.py > $OUT_DIR/TagCount.csv
$GBS_PRISM_BIN/summarise_read_and_tag_counts.py -o $OUT_DIR/tags_reads_summary.txt $OUT_DIR/TagCount.csv
cat $OUT_DIR/tags_reads_summary.txt | awk -F'\\t' '{printf(\"%s\\t%s\\t%s\\n\",\$1,\$4,\$9)}' - > $OUT_DIR/tags_reads_cv.txt
elif [ \$number_of_parts > 1 ]; then
# make a command file to demultiplex each part , and launch on the cluster
rm -f tagCounts_parts/demultiplex_parts_commands.src
for part_folder in tagCounts_parts/part*; do
part_base=\`basename \$part_folder\`
echo \"cd $OUT_DIR/tagCounts_parts/\$part_base; run_pipeline.pl -Xms512m -Xmx5g -fork1 -UFastqToTagCountPlugin $p_FastqToTagCount -w ./ -c 1 -e $enzyme_for_uneak -s 900000000 -endPlugin -runfork1 \> $OUT_DIR/tagCounts_parts/\$part_base/${demultiplex_moniker}.FastqToTagCount.stdout 2\>$OUT_DIR/tagCounts_parts/\$part_base/${demultiplex_moniker}.FastqToTagCount.stderr\" >> tagCounts_parts/demultiplex_parts_commands.src
done
# launch the command-file
tardis --hpctype $HPC_TYPE -c 1 -k -d $OUT_DIR --shell-include-file $OUT_DIR/tassel3_env.src source _condition_text_input_tagCounts_parts/demultiplex_parts_commands.src
#
# merge the outputs into the top level folder
$GBS_PRISM_BIN/ramify_tassel_keyfile.py -t merge_results -o ${OUT_DIR}/tagCounts_parts -m ${OUT_DIR}/tagCounts --sub_tassel_prefix part ${OUT_DIR}/key/$sample_info_base
# merge the tag counts into a single tag count file
$GBS_PRISM_BIN/ramify_tassel_keyfile.py -t merge_counts -o ${OUT_DIR}/tagCounts_parts --sub_tassel_prefix part ${OUT_DIR}/key/$sample_info_base > $OUT_DIR/TagCount.csv
$GBS_PRISM_BIN/summarise_read_and_tag_counts.py -o $OUT_DIR/tags_reads_summary.txt $OUT_DIR/TagCount.csv
cat $OUT_DIR/tags_reads_summary.txt | awk -F'\\t' '{printf(\"%s\\t%s\\t%s\\n\",\$1,\$4,\$9)}' - > $OUT_DIR/tags_reads_cv.txt
else
echo "demultiplex_prism : error analysing keyfile"
fi
fi
if [ \$? != 0 ]; then
echo \"demultplex_prism.sh: error code returned from FastqToTagCount process - check log files. quitting\"; exit 1
else
date > tagCounts.done
fi
if [ ! -f mergedTagCounts.done ]; then
mkdir mergedTagCounts
tardis --hpctype $HPC_TYPE -k -d $OUT_DIR --shell-include-file $OUT_DIR/tassel3_env.src run_pipeline.pl -Xms512m -Xmx500g -fork1 -UMergeTaxaTagCountPlugin $p_MergeTaxaTagCount -w ./ -m 600000000 -x 100000000 -c 5 -endPlugin -runfork1 \> $OUT_DIR/${demultiplex_moniker}.MergeTaxaTagCount.stdout 2\>$OUT_DIR/${demultiplex_moniker}.MergeTaxaTagCount.stderr
fi
if [ \$? != 0 ]; then
echo \"demultplex_prism.sh: error code returned from MergeTaxaTagCount process - check log files. quitting\"; exit 2
else
date > mergedTagCounts.done
fi
if [ ! -f tagPair.done ]; then
mkdir tagPair
tardis --hpctype $HPC_TYPE -k -d $OUT_DIR --shell-include-file $OUT_DIR/tassel3_env.src run_pipeline.pl -Xms512m -Xmx500g -fork1 -UTagCountToTagPairPlugin $p_TagCountToTagPair -w ./ -e 0.03 -endPlugin -runfork1 \> $OUT_DIR/${demultiplex_moniker}.TagCountToTagPair.stdout 2\>$OUT_DIR/${demultiplex_moniker}.TagCountToTagPair.stderr
fi
if [ \$? != 0 ]; then
echo \"demultplex_prism.sh: error code returned from TagCountToTagPair process - check log files. quitting\"; exit 3
else
date > tagPair.done
fi
if [ ! -f tagsByTaxa.done ]; then
mkdir tagsByTaxa
tardis --hpctype $HPC_TYPE -k -d $OUT_DIR --shell-include-file $OUT_DIR/tassel3_env.src run_pipeline.pl -Xms512m -Xmx500g -fork1 -UTagPairToTBTPlugin $p_TagPairToTBT -w ./ -endPlugin -runfork1 \> $OUT_DIR/${demultiplex_moniker}.TagPairToTBT.stdout 2\>$OUT_DIR/${demultiplex_moniker}.TagPairToTBT.stderr
fi
if [ \$? != 0 ]; then
echo \"demultplex_prism.sh: error code returned from TagPairToTBT process - check log files. quitting\"; exit 4
else
date > tagsByTaxa.done
fi
if [ ! -f mapInfo.done ] ; then
mkdir mapInfo
tardis --hpctype $HPC_TYPE -k -d $OUT_DIR --shell-include-file $OUT_DIR/tassel3_env.src run_pipeline.pl -Xms512m -Xmx500g -fork1 -UTBTToMapInfoPlugin $p_TBTToMapInfo -w ./ -endPlugin -runfork1 \> $OUT_DIR/${demultiplex_moniker}.TBTToMapInfo.stdout 2\>$OUT_DIR/${demultiplex_moniker}.TBTToMapInfo.stderr
fi
if [ \$? != 0 ]; then
echo \"demultplex_prism.sh: error code returned from TBTToMapInfo process - check log files. quitting\"; exit 5
else
date > mapInfo.done
fi
if [ ! -f hapMap.done ]; then
mkdir hapMap
tardis --hpctype $HPC_TYPE -k -d $OUT_DIR --shell-include-file $OUT_DIR/tassel3_env.src run_pipeline.pl -Xms512m -Xmx500g -fork1 -UMapInfoToHapMapPlugin $p_MapInfoToHapMap -w ./ -mnMAF 0.03 -mxMAF 0.5 -mnC 0.1 -endPlugin -runfork1 \> $OUT_DIR/${demultiplex_moniker}.MapInfoToHapMap.stdout 2\>$OUT_DIR/${demultiplex_moniker}.MapInfoToHapMap.stderr
fi
if [ \$? != 0 ]; then
echo \"demultplex_prism.sh: error code returned from MapInfoToHapMap process - check log files. quitting\"; exit 6
else
date > hapMap.done
fi
if [ ! -f hapMap.done ]; then
echo \"demultplex_prism.sh: didn't find hapMap.done - something went wrong - check log files. quitting\"; exit 7
fi
" > $script
chmod +x $script
elif [ $ENGINE == "gbsx" ]; then
if [ -f "$ENZYME_INFO" ]; then
ENZYME_PHRASE="-ea $ENZYME_INFO"
fi
if [ $SPLIT == "yes" ]; then
# can optionally split the input fastq and launch demultiplex of the splits in parallel - however this
# usually is slower than just launching a single job
cat << END_SPLIT > $script
#!/bin/bash
#
# note , currently using -k option for debugging - remove this
#
set -x
base=`basename $file`
mkdir ${OUT_DIR}/${base}.demultiplexed
cd ${OUT_DIR}
# this will demultiplex in parallel into numbered subfolders of the tardis working folder
tardis --hpctype $HPC_TYPE -k -d $OUT_DIR java -jar $SEQ_PRISMS_BIN/../bin/GBSX_v1.3.jar --Demultiplexer $ENZYME_PHRASE -f1 _condition_fastq_input_$file -i $OUT_DIR/$sample_info_base -o _condition_output_$OUT_DIR/${base}.demultiplexed -lf TRUE -gzip FALSE -mb 0 -me 0 -n false -t 8 \> _condition_uncompressedtext_output_$OUT_DIR/${demultiplex_moniker}.stdout 2\> _condition_uncompressedtext_output_$OUT_DIR/${demultiplex_moniker}.stderr
# for each distinct sample , combine all the slices
# get the distinct samples
for outfile in tardis_*/\${base}.*.demultiplexed/*.fastq; do
sample=\`basename \$outfile\`
echo \$sample >> $OUT_DIR/\${base}.demultiplexed/sample_list
done
# combine all slices
for sample in \`cat $OUT_DIR/\${base}.demultiplexed/sample_list\`; do
cat tardis_*/\${base}.*.demultiplexed/\${sample} > $OUT_DIR/\${base}.demultiplexed/\${sample}
done
END_SPLIT
else
# see if file is compressed and set gzip option
gzip_phrase="-gzip TRUE"
gunzip -l $file > /dev/null 2>&1
if [ $? != 0 ]; then
gzip_phrase=""
fi
cat << END_NOSPLIT > $script
#!/bin/bash
#
# note , currently using -k option for debugging - remove this
#
set -x
base=`basename $file`
mkdir ${OUT_DIR}/${base}.demultiplexed
cd ${OUT_DIR}
tardis --hpctype $HPC_TYPE -k -d $OUT_DIR java -jar $SEQ_PRISMS_BIN/../bin/GBSX_v1.3.jar --Demultiplexer $ENZYME_PHRASE -f1 $file -i $OUT_DIR/$sample_info_base -o $OUT_DIR/${base}.demultiplexed -lf TRUE $gzip_phrase -mb 0 -me 0 -n false -t 8 \> $OUT_DIR/${demultiplex_moniker}.stdout 2\> $OUT_DIR/${demultiplex_moniker}.stderr
#
# test all output gzip files for integrity - e.g. gbsx does not detect problems like running out of disk space
test_ok=yes
for gzfile in $OUT_DIR/${base}.demultiplexed/*.fastq.gz; do
gzip -t \$gzfile
if [ \$? != 0 ]; then
test_ok=no
echo "error : \$gzfile failed compression integrity test" 2>&1
fi
done
if [ \$test_ok != yes ]; then
echo "error one or more fastq.gz files failed compression integrity test (did you run out of disk space ?)"
exit 1
fi
END_NOSPLIT
fi
chmod +x $script
fi
done
}
function fake() {
echo "dry run !
"
exit 0
}
function run() {
make -f demultiplex_prism.mk -d -k --no-builtin-rules -j 16 `cat $OUT_DIR/demultiplex_targets.txt` > $OUT_DIR/demultiplex_prism.log 2>&1
# run summaries
}
function html() {
echo "tba" > $OUT_DIR/demultiplex_prism.html 2>&1
}
function clean() {
if [ -f $OUT_DIR/tardis.toml.orig ]; then
mv $OUT_DIR/tardis.toml.orig $OUT_DIR/tardis.toml
else
echo "warning - did not find $OUT_DIR/tardis.toml.orig to restore, check that $OUT_DIR/tardis.toml is what you want"
fi
nohup rm -rf $OUT_DIR/tardis_* > $OUT_DIR/demultiplex_clean.log 2>&1 &
rm -f $OUT_DIR/*.fastq
}
function main() {
get_opts "$@"
check_opts
echo_opts
check_env
configure_env
get_targets
if [ $DRY_RUN != "no" ]; then
fake
else
run
if [ $? == 0 ] ; then
html
clean
else
echo "error state from demultiplex run - skipping html page generation and clean-up"
exit 1
fi
fi
}
set -x
main "$@"
set +x