-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconduit.nim
1544 lines (1434 loc) · 66.6 KB
/
conduit.nim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import os
import times
import osproc
import parseopt
import strutils
import strformat
import poGraphUtils
import tables
import heapqueue
import threadpool_simple as tps
import hts
import sets
import poaV2/header
import poaV2/poa
{.experimental.}
type
  ConduitOptions = object
    # Parsed command-line configuration for a CONDUIT run; populated by the
    # argument parser and threaded through the whole pipeline.
    run_flag : bool                     # NOTE(review): presumably gates whether the pipeline actually runs after parsing — confirm against parser
    final_polish : bool                 # perform a final linear (non-graph) polishing round
    intermediates : bool                # keep FASTA files from intermediate polishing rounds
    mode : string                       # subcommand: "nano" or "hybrid"
    clusters_directory : string         # directory of per-cluster scaffold read files
    nanopore_format : string            # scaffold file type ("fasta" / "fastq")
    illumina_format : string            # Illumina read file type ("fastq" / "fasta")
    u2t : bool                          # convert U nucleotides to T in scaffold reads
    bowtie_strand_constraint : string   # strand-constraint flag(s) handed to Bowtie2
    bowtie_alignment_mode : string      # Bowtie2 alignment mode (end-to-end / local)
    bowtie_read_inputs : string         # Illumina read arguments handed to Bowtie2
    score_matrix_path : string          # alternative POA scoring matrix path; "" selects the default
    output_dir : string                 # where corrected clusters are written
    tmp_dir : string                    # where temporary files are created
    files : seq[string]                 # NOTE(review): presumably per-cluster input file paths — confirm with caller
    trims : seq[string]                 # NOTE(review): presumably filename stems matching `files` — confirm with caller
    isoform_delta : uint64              # max indel size to 'correct' before declaring a new isoform
    ends_delta : uint64                 # max size at isoform ends to 'correct' before splitting
    illumina_weight : uint64            # weight of Illumina reads relative to nanopore in consensus
    thread_num : uint64                 # number of worker threads
    max_iterations : uint64             # max rounds of graph-based Illumina correction
    stringent : bool                    # require Illumina support for every base/edge of final isoforms
    stringent_tolerance : int           # unsupported bp tolerated at each isoform end in stringent mode
    samtools_memory : string            # per-thread memory for SAMtools (bytes or K/M/G suffix)
    max_alignments : uint64             # Bowtie2 -k: max alignments per Illumina read
#Minor TODOs:
#TODO - convert from passing tuple back to passing vars individually; relic of older threading approach
#Major TODOs (Future release versions?):
#TODO - Add clustering tool that runs with the benefit of a reference genome ( Cluster based on splice junctions / or overlap of alignments )
#TODO - Move filtering step to a graph based polishing step, the linear step has a problem of too low coverage or too many isoforms leading to isoforms not being completely covered by illumina reads when they should be.
#TODO - Add less stringent filtering step that extracts out the longest contiguous region covered by Illumina reads (with ends tolerance?) should be easy to do and reduce # of false negatives.
#TODO - Add support for duplicate read ID's that doesn't break everything.
#TODO - Add option to output both sensitive and stringent isoform sets in the same run
#TODO - .gz support for nanopore scaffolds. Can probably do what Trinity does and just add a decompress step for generating temp files.
#TODO - Add output indicating completion percentage for each iteration. Use https://github.com/euantorano/progress.nim ?
#TODO - Add mode that continues polishing where a previous run left off
#TODO - Rewrite poa in nim(?) - Probably faster as the C code, only advantage to the rewrite is it makes doing clever things with the poa easier down the line. (Unless we did SIMD poa, which would require learning nim SIMD, and probably step on Eyras Lab's toes)
#TODO - Clustering mode for when you DO have a reference genome? Or is that too similar to Stringtie2 to be worth doing? -- Probably too similar?
#TODO - Break poParser into smaller .nim files with more accurate and descriptive names
#TODO - Figure out if there's anything to be done about the left-aligned problem inherent to the partial order graph based correction? (thereby allowing us to get rid of linear polishing step)
#TODO - Add option to output .po files (or a new format, a multi-po file) to explain the relationships between isoforms.
#TODO - Add quantification output for each isoform / gene
#TODO - Add options that allow you to specify the path to the bowtie2 and samtools binaries
proc conduitVersion() : string =
  ## Report the CONDUIT release banner, including author contact details.
  result = "CONDUIT Version 0.1.0 by Nathan Roach ( nroach2@jhu.edu, https://github.com/NatPRoach/conduit/ )"
proc writeDefaultHelp() =
  ## Print the top-level usage message shown when no subcommand is given.
  echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
  echo conduitVersion()
  const usageLines = [
    "Usage:",
    " ./conduit <nano | hybrid>",
    "NOTE: nano mode not yet implemented... coming soon",
    " to run the equivalent of nano mode, run hybrid mode with -i:0 and --no-final-polish",
    " this will still require 'illumina' files to be passed, but will not check that they actually exist",
    " so including -U this_file_does_not_exist.fq should run."
  ]
  for usageLine in usageLines:
    echo usageLine
proc writeNanoHelp() =
  ## Print the help text for the `nano` subcommand (reference-free,
  ## nanopore-only mode) to stdout.
  echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
  echo conduitVersion()
  echo "Usage:"
  echo " ./conduit nano [options] <clusters_directory>"
  echo " <clusters_directory> Directory containing the .fasta/.fa or .fastq/.fq files of reads separated by gene cluster"
  echo ""
  echo "Options (defaults in parentheses):"
  echo " Scaffold type:"
  echo " --drna (default)"
  echo " Scaffold reads are stranded forward relative to coding strand, and may contain U characters instead of Ts"
  echo " --cdna-rev-stranded"
  echo " Scaffold reads are stranded reverse complemented relative to coding strand"
  echo " --cdna"
  echo " Scaffold reads are NOT stranded"
  echo " --sfq (default)"
  echo " Scaffold reads are in FASTQ format"
  echo " --sfa"
  echo " Scaffold reads are in FASTA format"
  echo " Consensus Collapsing:"
  echo " -d, --isoform-delta (35)"
  echo " Maximum indel size to be 'corrected', beyond this size a new isoform is declared"
  echo " -e, --ends-delta (35)"
  echo " Maximum size at the ends of isoforms to 'correct' before splitting"
  # NOTE(review): "Ouput" typo in the user-facing text below — fix upstream, not here.
  echo " Ouput:"
  echo " -o, --output-dir <path> (conduit/)"
  echo " <path> where corrected clusters will be written"
  echo " NOTE: THIS WILL OVERWRITE EXISTING FILES!"
  echo " Miscellaneous:"
  echo " -h, --help"
  echo " Display this help message and exit"
  echo " -v, --version"
  echo " Display the installed version number of CONDUIT and exit"
  echo " --tmp-dir <path> (conduit-tmp/)"
  echo " <path> where temporary files will be created"
  echo " -t, --threads (4)"
  echo " Number of threads to run in parallel (used for both Bowtie2 and Partial Order Graph correction)"
proc writeHybridHelp() =
  ## Print the help text for the `hybrid` subcommand (nanopore scaffolds
  ## polished with Illumina short reads) to stdout. Illumina input flags
  ## mirror Bowtie2's interface.
  echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
  echo conduitVersion()
  echo "Usage:"
  echo " ./conduit hybrid [options] <clusters_directory> {-1 <m1> -2 <m2> | -U <r> | --interleaved <i> | -b <bam>}"
  echo " <clusters_directory> Directory containing the .fasta/.fa or .fastq/.fq files of reads separated by gene cluster"
  echo " NOTE: .gz support coming for nanopore scaffold data, but is not an option at this time"
  echo ""
  echo " Illumina data is aligned with Bowtie2, therefore Illumina data is provided in the same format as Bowtie2, namely:"
  echo ""
  echo " <m1> Files with #1 mates, paired with files in <m2>"
  echo " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)."
  echo " <m2> Files with #2 mates, paired with files in <m1>"
  echo " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)."
  echo " <r> Files with unpaired reads"
  echo " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)."
  echo " <i> File with interleaved paired-end FASTQ/FASTA reads"
  echo " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)."
  echo " <bam> Files are unaligned BAM sorted by read name."
  echo ""
  echo " <m1>, <m2>, <r> can be comma-separated lists (no whitespace) and can be specified many times."
  echo " E.g. '-U file1.fq,file2.fq -U file3.fq'."
  echo ""
  echo "Options (defaults in parentheses):"
  echo " Scaffold Type:"
  echo " --drna (default)"
  echo " Scaffold reads are stranded forward relative to coding strand, enforces --UtoT"
  echo " --cdna-rev-stranded"
  echo " Scaffold reads are stranded reverse complemented relative to coding strand"
  echo " --cdna"
  echo " Scaffold reads are NOT stranded"
  echo " --sfq (default)"
  echo " Scaffold reads are in FASTQ format, enforces --UtoT"
  echo " --sfa"
  echo " Scaffold reads are in FASTA format"
  echo " --UtoT (default)"
  echo " Scaffold reads contain Us instead of Ts. Converts U nucleotides to Ts in the sequences"
  echo " NOTE: This adds a bit of I/O overhead but doesn't affect things if your sequences are already U free"
  echo " --noUtoT"
  echo " Scaffold reads do not contain Us and do not need to be converted."
  # echo " --duplicate-filter"
  # echo " Scaffold reads have duplicate read IDs, filter out the duplicate reads"
  echo " Illumina Type:"
  echo " -u, --unstranded"
  echo " Illumina reads are unstranded"
  echo " -f, --fwd-stranded"
  echo " Illumina reads are stranded s.t. the first mate originates from the RNA strand"
  echo " Ignored if scaffold reads are not stranded"
  echo " -r, --rev-stranded (default)"
  echo " Illumina reads are stranded s.t. the first mate is the reverse complement of the RNA strand"
  echo " Ignored if scaffold reads are not stranded"
  echo " --ifq (default)"
  echo " Illumina reads are in FASTQ format; Mutually exclusive with --ifa"
  echo " --ifa"
  echo " Illumina reads are in FASTA format; Mutually exclusive with --ifq"
  echo " Consensus Collapsing:"
  echo " -m, --score-matrix <path>"
  echo " Provide an alternative scoring matrix to use in partial order alignment"
  echo " Example formatting for the score matrix can be found at poaV2/myNUC3.4.4.mat"
  echo " -d, --isoform-delta (35)"
  echo " Maximum indel size to be 'corrected', beyond this size a new isoform is declared. Must be between 2 and 255"
  echo " -e, --ends-delta (35)"
  echo " Maximum size at the ends of isoforms to 'correct' before splitting. Must be between 2 and 255"
  echo " -i, --max-iterations (5)"
  # NOTE(review): "polshing" typo in the user-facing text below — fix upstream, not here.
  echo " Maximum number of iterations to align to and correct scaffolds. Does not include optional final polshing step"
  echo " Note: Providing a value of 0 will not perform any graph based illumina correction"
  echo " -w, --illumina-weight (10)"
  echo " Weight of illumina reads relative to nanopore reads when generating consensus"
  echo " --final-polish (default)"
  echo " Include a final correction of individual isoforms, not in a splice graph"
  echo " --no-final-polish"
  echo " Do not do a final correction of individual isoforms, not in a splice graph"
  echo " --stringent (default)"
  echo " Enforce that every base / edge in each final reported isoform is supported by an Illumina read, excluding --stringent-tolerance bp on each end of each isoform"
  echo " --no-stringent"
  echo " Do not enforce that every base / edge in each final reported isoform is supported by an Illumina read"
  echo " --stringent-tolerance (100)"
  # NOTE(review): "--no-stringents" typo in the user-facing text below (flag is --no-stringent) — fix upstream, not here.
  echo " Number of bases at the end of each isoform that do not have to have Illumina reads supporting them when run in --stringent mode; ignored when run with --no-stringents"
  # echo " --scaffold-minimum (1)" #TODO
  # echo " Minimum number of scaffolding reads supporting an isoform necessary to report the isoform in the final output"
  # NOTE(review): "Ouput" typo in the user-facing text below — fix upstream, not here.
  echo " Ouput:"
  echo " -o, --output-dir <path> (conduit/)"
  echo " <path> where corrected clusters will be written"
  echo " NOTE: THIS WILL OVERWRITE EXISTING FILES!"
  echo " -n, --no-intermediates (default)"
  echo " Does not save FASTA file generated for intermediate rounds of polishing"
  echo " -s, --save-intermediates"
  echo " Saves the FASTA file generated for intermediate rounds of polishing"
  echo " Bowtie2:"
  echo " --end-to-end (default)"
  echo " Align Illumina reads to ONT scaffolds in end-to-end alignment mode"
  echo " --local"
  echo " Align Illumina reads to ONT scaffolds in local alignment mode"
  echo " -k,--bowtie2-max-alignments (50)"
  echo " Maximum number of alignments per Illumina read to be used in final polishing step"
  # echo " --bowtie2-path (bowtie2)" #TODO
  echo " SAMtools:"
  echo " --samtools-thread-memory (768 MiB)"
  echo " Memory amount to use per SAMtools thread"
  echo " Specified either in bytes or with a K, M, or G suffix"
  # echo " --samtools-path (samtools)" #TODO
  echo " Miscellaneous:"
  echo " -h, --help"
  echo " Display this help message and exit"
  echo " -v, --version"
  echo " Display the installed version number of CONDUIT and exit"
  echo " --tmp-dir <path> (conduit-tmp/)"
  echo " <path> where temporary files will be created"
  echo " -t, --threads (4)"
  echo " Number of threads to run in parallel (used for both Bowtie2 and Partial Order Graph correction)"
  # echo " NOTE: Providing a value of 0 will attempt to autodetect the number of CPUs availible and use that." #TODO
  # echo " If CPU number cannot be detected, the default of 4 threads will be used. #TODO
proc removeFiles(files : openArray[string]) =
  ## Delete every file named in `files` from disk.
  for filepath in files:
    removeFile(filepath)
proc createDirs(dirs : openArray[string]) =
  ## Create every directory named in `dirs` (createDir also creates
  ## missing parent directories).
  for dirpath in dirs:
    createDir(dirpath)
proc returnFalse() : bool {.thread.} =
  ## Trivial thread-safe proc that unconditionally yields false;
  ## used as a placeholder task for the thread pool.
  result = false
proc convertFASTQtoFASTA(infilepath,outfilepath:string) =
  ## Convert the FASTQ file at `infilepath` to FASTA at `outfilepath`.
  ## Records are de-duplicated by their full '@' header line (only the
  ## first occurrence is written) and U nucleotides in the sequence line
  ## are converted to T. Raises AssertionError if a record header does
  ## not start with '@'.
  var infile,outfile : File
  # NOTE(review): open() results are discarded — a missing input file will
  # surface as a crash on readLine rather than a clean error; confirm callers
  # guarantee the paths exist.
  discard open(infile,infilepath,fmRead)
  discard open(outfile,outfilepath,fmWrite)
  var read_ids : HashSet[string]  # full header lines already emitted
  while true:
    try:
      let line1 = infile.readLine()
      try:
        assert line1[0] == '@'
      except AssertionError:
        echo "File not in FASTQ format"
        raise
      if line1 notin read_ids:
        read_ids.incl(line1)
        # '@header' becomes '>header'; then sequence with U -> T.
        outfile.write(&">{line1[1..^1]}\n")
        outfile.write(&"{infile.readLine().replace(sub='U',by='T')}\n")
        discard infile.readLine()  # '+' separator line
        discard infile.readLine()  # quality line
      else:
        # Duplicate header: skip the remaining three lines of this record.
        discard infile.readLine()
        discard infile.readLine()
        discard infile.readLine()
    except EOFError:
      break
  infile.close()
  outfile.close()
proc convertUtoTinFASTA(infilepath,outfilepath:string) =
  ## Copy the FASTA file at `infilepath` to `outfilepath`, replacing U
  ## nucleotides with T on sequence lines. Header lines (starting with
  ## '>') are copied through unchanged.
  var infile,outfile : File
  discard open(infile,infilepath,fmRead)
  discard open(outfile,outfilepath,fmWrite)
  var reachedEof = false
  while not reachedEof:
    var line = ""
    try:
      line = infile.readLine()
    except EOFError:
      reachedEof = true
    if not reachedEof:
      if line[0] == '>':
        outfile.write(&"{line}\n")
      else:
        outfile.write(&"{line.replace(sub='U',by='T')}\n")
  infile.close()
  outfile.close()
proc outputTiming(outfilepath : string,time_seq : seq[Time],opts : ConduitOptions) =
  ## Write a timing log to `outfilepath`. `time_seq` holds checkpoint
  ## timestamps: iteration i spans time_seq[i] .. time_seq[i+1], and the
  ## final interval is attributed to the final polish step.
  var outfile : File
  discard open(outfile,outfilepath,fmWrite)
  outfile.write("CONDUIT Timing Log:\n")
  # Itemize every interval except the last, which is reported as the
  # final polish below.
  for i in 0..<time_seq.len - 2:
    outfile.write(&"Iter {i}: {(time_seq[i + 1] - time_seq[i]).inSeconds} s\n")
  if opts.final_polish:
    outfile.write(&"Final Polish: {(time_seq[^1] - time_seq[^2]).inSeconds} s\n")
  # NOTE(review): when final_polish is false the interval between the last two
  # checkpoints is never itemized — presumably time_seq then has one fewer
  # entry; confirm against the caller that assembles time_seq.
  outfile.write(&"Total: {(time_seq[^1] - time_seq[0]).inSeconds} s\n")
  outfile.close()
proc outputSettings(outfilepath : string,opts : ConduitOptions) =
  ## Write a human-readable log of the invoked command line and every
  ## effective CONDUIT / SAMtools / Bowtie2 setting to `outfilepath`,
  ## so a run can be reproduced later.
  var outfile : File
  discard open(outfile,outfilepath,fmWrite)
  outfile.write("CONDUIT Command Log:\n")
  outfile.write("Working directory:\n")
  outfile.write(&" {getCurrentDir()}\n")
  outfile.write("Command:\n")
  # Reconstruct the invocation: binary path plus the raw arguments.
  let command_params = commandLineParams().join(sep = " ")
  outfile.write(&" {getAppFilename()} {command_params}\n")
  outfile.write("CONDUIT Settings:\n")
  outfile.write(&" mode : {opts.mode}\n")
  outfile.write(&" rounds of graph based polishing : {opts.max_iterations}\n")
  outfile.write(&" final round of linear polishing : {opts.final_polish}\n")
  outfile.write(&" stringent mode : {opts.stringent}\n")
  if opts.stringent:
    outfile.write(&" stringent tolerance : {opts.stringent_tolerance}\n")
  outfile.write(&" save intermediates : {opts.intermediates}\n")
  outfile.write(&" scaffold directory : {opts.clusters_directory}\n")
  outfile.write(&" scaffold file type : {opts.nanopore_format}\n")
  outfile.write(&" Convert U to T : {opts.u2t}\n")
  outfile.write(&" Illumina files : {opts.bowtie_read_inputs}\n")
  outfile.write(&" Illumina file type : {opts.illumina_format}\n")
  outfile.write(&" isoform delta : {opts.isoform_delta}\n")
  outfile.write(&" ends delta : {opts.ends_delta}\n")
  outfile.write(&" Illumina weight : {opts.illumina_weight}\n")
  outfile.write(&" threads : {opts.thread_num}\n")
  # An empty matrix path means the built-in scoring matrix is used.
  if opts.score_matrix_path == "":
    outfile.write( " score matrix : default\n")
  else:
    outfile.write(&" score matrix : {opts.score_matrix_path}\n")
  outfile.write(&" output directory : {opts.output_dir}\n")
  outfile.write(&" temporary directory : {opts.tmp_dir}\n")
  outfile.write("SAMtools Settings\n")
  outfile.write(&" SAMtools thread memory : {opts.samtools_memory}\n")
  outfile.write("Bowtie2 Settings:\n")
  outfile.write(&" bowtie2 strand constraint : {opts.bowtie_strand_constraint}\n")
  outfile.write(&" bowtie2 alignment mode : {opts.bowtie_alignment_mode}\n")
  outfile.write(&" bowtie2 max alignments : {opts.max_alignments}\n")
  outfile.close()
proc splitFASTA(infilepath,outfilepath_prefix : string, split_num : int = 200) : (int,int) =
  # Takes in a fasta file with more reads than some arbitrary number split_num
  # produces fasta files split into sizes of that size or smaller. Similar to RATTLE implementation in that to avoid size biases the sampling is performed with an offset.
  ## Split the FASTA at `infilepath` into files named
  ## "<outfilepath_prefix>_subfasta<i>.fa" of at most `split_num` records
  ## each, assigning record `idx` to file `idx mod num_outfiles`
  ## (round-robin, to avoid read-length bias between chunks).
  ## Returns (number of subfiles, total record count). When one file would
  ## suffice, nothing is written and (1, count) is returned.
  var infile : File
  discard open(infile,infilepath,fmRead)
  var records : seq[FastaRecord]
  var read_id : string
  var sequence : string
  var count = 0
  while true:
    try:
      let line = infile.readLine()
      if line[0] == '>':
        if count != 0:
          # Flush the previous record before starting a new one.
          records.add(FastaRecord( read_id : read_id, sequence : sequence))
          sequence = ""
        count += 1
        read_id = line.strip(leading=true,trailing=false,chars = {'>'}).strip(leading=false,trailing=true,chars = {'\n'})
      else:
        # Multi-line sequences are concatenated.
        sequence = sequence & line.strip(leading=false,trailing=true,chars = {'\n'})
    except EOFError:
      # Flush the final record at end-of-file.
      # NOTE(review): an empty input file produces one empty record here.
      records.add(FastaRecord( read_id : read_id, sequence : sequence))
      break
  infile.close()
  # Ceiling division: one extra file when records don't divide evenly.
  let num_outfiles = int(records.len mod split_num != 0) + (records.len div split_num)
  if num_outfiles > 1:
    for i in 0..<num_outfiles:
      var outfile : File
      discard open(outfile,&"{outfilepath_prefix}_subfasta{i}.fa",fmWrite)
      for j in 0..<split_num:
        # Interleaved sampling: file i receives records i, i+N, i+2N, ...
        let idx = i + (j * num_outfiles)
        if idx < records.len:
          outfile.write(">",records[idx].read_id,"\n")
          outfile.writeLine(records[idx].sequence)
      outfile.close()
  return (num_outfiles,records.len)
proc mergeFiles(infilepaths : openArray[string],outfilepath : string,delete_old_files = false) =
  ## Concatenate the contents of `infilepaths`, in order, into
  ## `outfilepath`. When `delete_old_files` is true, the input files are
  ## removed once all of them have been copied.
  var merged : File
  discard open(merged,outfilepath,fmWrite)
  for path in infilepaths:
    var part : File
    discard open(part,path,fmRead)
    merged.write(part.readAll)
    part.close()
  if delete_old_files:
    removeFiles(infilepaths)
  merged.close()
proc splitFASTA2(infilepath,outfilepath_prefix : string, split_num : int = 200) : (int,int) =
  # Takes in a fasta file with more reads than some arbitrary number split_num
  # produces fasta files split into sizes of that size or smaller. Different from RATTLE implementation in that size bias is not considered and linear blocks of sequences are extracted
  ## Split the FASTA at `infilepath` into files named
  ## "<outfilepath_prefix>_subfasta<i>.fa" of at most `split_num` records
  ## each, taking contiguous blocks of records in input order (unlike
  ## splitFASTA, which interleaves). Returns (number of subfiles, total
  ## record count). When one file would suffice, nothing is written and
  ## (1, count) is returned.
  var infile : File
  discard open(infile,infilepath,fmRead)
  var records : seq[FastaRecord]
  var read_id : string
  var sequence : string
  var count = 0
  while true:
    try:
      let line = infile.readLine()
      if line[0] == '>':
        if count != 0:
          # Flush the previous record before starting a new one.
          records.add(FastaRecord( read_id : read_id, sequence : sequence))
          sequence = ""
        count += 1
        read_id = line.strip(leading=true,trailing=false,chars = {'>'}).strip(leading=false,trailing=true,chars = {'\n'})
      else:
        # Multi-line sequences are concatenated.
        sequence = sequence & line.strip(leading=false,trailing=true,chars = {'\n'})
    except EOFError:
      # Flush the final record at end-of-file.
      # NOTE(review): an empty input file produces one empty record here.
      records.add(FastaRecord( read_id : read_id, sequence : sequence))
      break
  infile.close()
  # Ceiling division: one extra file when records don't divide evenly.
  let num_outfiles = int(records.len mod split_num != 0) + (records.len div split_num)
  if num_outfiles > 1:
    for i in 0..<num_outfiles:
      var outfile : File
      discard open(outfile,&"{outfilepath_prefix}_subfasta{i}.fa",fmWrite)
      for j in 0..<split_num:
        # Contiguous blocks: file i receives records i*split_num .. i*split_num+split_num-1.
        let idx = (i*split_num) + j
        if idx < records.len:
          outfile.write(">",records[idx].read_id,"\n")
          outfile.writeLine(records[idx].sequence)
      outfile.close()
  return (num_outfiles,records.len)
# proc runPOAandCollapsePOGraph(intuple : (string,string,string,string,uint16,uint16)) {.thread.} =
# let (infilepath,outdir,matrix_filepath,format,isoform_delta,ends_delta) = intuple
# let trim = infilepath.split(os.DirSep)[^1].split(".")[0]
# var fasta_file : string
# if format == "fasta":
# fasta_file = infilepath
# elif format == "fastq":
# fasta_file = &"{outdir}{trim}.tmp.fa"
# convertFASTQtoFASTA(infilepath,fasta_file)
# var seq_file : PFile = fopen(cstring(fasta_file), "r")
# # echo matrix_filepath
# # echo cstring(matrix_filepath)
# # let matrix_filepath : cstring = "../poaV2/myNUC3.4.4.mat"
# var po = getPOGraphFromFasta(seq_file,cstring(matrix_filepath),cint(1),matrix_scoring_function)
# if format == "fastq":
# removeFile(fasta_file)
# var trim_po = poParser.convertPOGraphtoTrimmedPOGraph(po)
# var representative_paths = poParser.getRepresentativePaths3(addr trim_po, psi = isoform_delta, ends_delta = ends_delta)
# let consensus_po = poParser.buildConsensusPO(addr po, representative_paths,trim)
# let outFASTAfilepath = &"{outdir}fasta{os.DirSep}{trim}.consensus.fa"
# var outFASTAfile : File
# discard open(outFASTAfile,outFASTAfilepath,fmWrite)
# poParser.writeCorrectedReads(consensus_po,outFASTAfile)
# outFASTAfile.close()
proc runPOAandCollapsePOGraph(intuple : (string,string,string,string,uint16,uint16,bool)) {.thread.} =
  ## Worker thread: build a partial-order alignment (POA) graph for one
  ## read cluster and collapse it into consensus isoforms, written to
  ## "<outdir>fasta/<trim>.consensus.fa".
  ##
  ## intuple fields: (infilepath, outdir, matrix_filepath, format
  ## ("fasta"/"fastq"), isoform_delta, ends_delta, u2t).
  ##
  ## Large clusters are split into sub-FASTAs of at most 200 reads, each
  ## collapsed independently, then pairwise-merged smallest-first (via a
  ## heap keyed on consensus read count) until one consensus file remains.
  let (infilepath,outdir,matrix_filepath,format,isoform_delta,ends_delta,u2t) = intuple
  # Cluster name: basename of the input file, up to the first '.'.
  let trim = infilepath.split(os.DirSep)[^1].split(".")[0]
  var fasta_file : string
  if format == "fasta":
    if u2t:
      fasta_file = &"{outdir}{trim}.tmp.fa"
      convertUtoTinFASTA(infilepath,fasta_file)
    else:
      fasta_file = infilepath
  elif format == "fastq":
    # FASTQ is converted to a temporary FASTA (also performs U -> T).
    fasta_file = &"{outdir}{trim}.tmp.fa"
    convertFASTQtoFASTA(infilepath,fasta_file)
  var split_num = 200
  var (num_fastas,_) = splitFASTA2(fasta_file,&"{outdir}{trim}.tmp",split_num = split_num)
  var total_fastas = num_fastas  # running counter used to name merge outputs
  var delete_fasta_flag = false
  # Only delete fasta_file when it is a temporary we created (or was split up).
  if format == "fastq" or u2t or num_fastas > 1:
    delete_fasta_flag = true
  if num_fastas > 1:
    removeFile(fasta_file)
    # Cacluate representative reads for each subfasta, store each in separate consensus fasta file
    # Heap orders pending consensus files by read count so the two
    # smallest are always merged first.
    var merge_queue : HeapQueue[(int, int)]
    for i in 0..<num_fastas:
      let outFASTAfilepath = &"{outdir}{trim}.tmp_consensus{i}.fa"
      var outFASTAfile : File
      discard open(outFASTAfile,outFASTAfilepath,fmWrite)
      let tmp_fasta = &"{outdir}{trim}.tmp_subfasta{i}.fa"
      # NOTE(review): fopen called without a mode here — presumably the
      # poaV2 binding defaults to read mode; confirm against poaV2/header.
      var seq_file : PFile = fopen(cstring(tmp_fasta))
      var po = getPOGraphFromFasta(seq_file,cstring(matrix_filepath),cint(1),matrix_scoring_function)
      removeFile(tmp_fasta)
      var trim_po = poGraphUtils.convertPOGraphtoTrimmedPOGraph(po)
      var (representative_paths,read_supports) = poGraphUtils.getRepresentativePaths3(addr trim_po, psi = isoform_delta, ends_delta = ends_delta)
      let consensus_po = poGraphUtils.buildConsensusPO(addr po, representative_paths, read_supports, &"{trim}.tmp_subfasta{i}")
      poGraphUtils.writeCorrectedReads(consensus_po,outFASTAfile)
      outFASTAfile.close()
      merge_queue.push((consensus_po.reads.len,i))
    # num_fastas > 1 guarantees at least two entries to start merging.
    var (_,file_idx1) = merge_queue.pop()
    var (_,file_idx2) = merge_queue.pop()
    var new_filepath = ""
    while true:
      # Figure out which 2 files we're merging
      let filepath1 = &"{outdir}{trim}.tmp_consensus{file_idx1}.fa"
      let filepath2 = &"{outdir}{trim}.tmp_consensus{file_idx2}.fa"
      # Merge the files
      let new_tmp_filepath = &"{outdir}{trim}.tmp_subfasta{total_fastas}.fa"
      mergeFiles([filepath1,filepath2], new_tmp_filepath, delete_old_files=true)
      # Decompose the new temp file
      var seq_file : PFile = fopen(cstring(new_tmp_filepath))
      var po = getPOGraphFromFasta(seq_file,cstring(matrix_filepath),cint(1),matrix_scoring_function,weight_support = true)
      removeFile(new_tmp_filepath)
      var trim_po = poGraphUtils.convertPOGraphtoTrimmedPOGraph(po)
      var (representative_paths,read_supports) = poGraphUtils.getRepresentativePaths3(addr trim_po, psi = isoform_delta, ends_delta = ends_delta)
      let consensus_po = poGraphUtils.buildConsensusPO(addr po, representative_paths, read_supports, &"{trim}.tmp_subfasta{total_fastas}")
      # Write the decomposition to a new consensus file
      new_filepath = &"{outdir}{trim}.tmp_consensus{total_fastas}.fa"
      var new_file : File
      discard open(new_file,new_filepath,fmWrite)
      poGraphUtils.writeCorrectedReads(consensus_po,new_file)
      new_file.close()
      if merge_queue.len > 0:
        # More files pending: requeue this merge result and pull the next pair.
        var tmp : int
        merge_queue.push((consensus_po.reads.len,total_fastas))
        (tmp,file_idx1) = merge_queue.pop()
        (tmp,file_idx2) = merge_queue.pop()
        total_fastas += 1
      else:
        break
    # The surviving merge result becomes the cluster's final consensus.
    moveFile(new_filepath, &"{outdir}fasta{os.DirSep}{trim}.consensus.fa")
  else:
    # Small cluster: collapse the single FASTA directly.
    var seq_file : PFile = fopen(cstring(fasta_file), "r")
    var po2 = getPOGraphFromFasta(seq_file,cstring(matrix_filepath),cint(1),matrix_scoring_function)
    if delete_fasta_flag:
      removeFile(fasta_file)
    var trim_po2 = poGraphUtils.convertPOGraphtoTrimmedPOGraph(po2)
    var (representative_paths,read_supports) = poGraphUtils.getRepresentativePaths3(addr trim_po2, psi = isoform_delta, ends_delta = ends_delta)
    let consensus_po = poGraphUtils.buildConsensusPO(addr po2, representative_paths, read_supports, trim)
    let outFASTAfilepath = &"{outdir}fasta{os.DirSep}{trim}.consensus.fa"
    var outFASTAfile : File
    discard open(outFASTAfile,outFASTAfilepath,fmWrite)
    poGraphUtils.writeCorrectedReads(consensus_po,outFASTAfile)
    outFASTAfile.close()
# proc runPOAandCollapsePOGraph(intuple : (string,string,string,string,uint16,uint16,bool)) {.thread.} =
# let (infilepath,outdir,matrix_filepath,format,isoform_delta,ends_delta,u2t) = intuple
# let trim = infilepath.split(os.DirSep)[^1].split(".")[0]
# var fasta_file : string
# if format == "fasta":
# if u2t:
# fasta_file = &"{outdir}{trim}.tmp.fa"
# convertUtoTinFASTA(infilepath,fasta_file)
# else:
# fasta_file = infilepath
# elif format == "fastq":
# fasta_file = &"{outdir}{trim}.tmp.fa"
# convertFASTQtoFASTA(infilepath,fasta_file)
# var last_num_reads = 0
# var split_num = 200
# var (num_fastas,num_reads) = splitFASTA2(fasta_file,&"{outdir}{trim}.tmp",split_num = split_num)
# var delete_fasta_flag = false
# if format == "fastq" or u2t or num_fastas > 1:
# delete_fasta_flag = true
# if num_fastas > 1:
# removeFile(fasta_file)
# # Cacluate representative reads for each subfasta, store each in separate consensus fasta file
# for i in 0..<num_fastas:
# let outFASTAfilepath = &"{outdir}{trim}.tmp_consensus{i}.fa"
# var outFASTAfile : File
# discard open(outFASTAfile,outFASTAfilepath,fmWrite)
# let tmp_fasta = &"{outdir}{trim}.tmp_subfasta{i}.fa"
# var seq_file : PFile = fopen(cstring(tmp_fasta))
# var po = getPOGraphFromFasta(seq_file,cstring(matrix_filepath),cint(1),matrix_scoring_function)
# removeFile(tmp_fasta)
# var trim_po = poGraphUtils.convertPOGraphtoTrimmedPOGraph(po)
# var representative_paths = poGraphUtils.getRepresentativePaths3(addr trim_po, psi = isoform_delta, ends_delta = ends_delta)
# let consensus_po = poGraphUtils.buildConsensusPO(addr po, representative_paths,&"{trim}.tmp_subfasta{i}")
# poGraphUtils.writeCorrectedReads(consensus_po,outFASTAfile)
# outFASTAfile.close()
# fasta_file = &"{outdir}{trim}.tmp_consensus0.fa"
# while num_fastas > 1:
# for i in 0..<(num_fastas div 2):
# # Merge subfastas:
# let subfastapath1 = &"{outdir}{trim}.tmp_consensus{2*i}.fa"
# let subfastapath2 = &"{outdir}{trim}.tmp_consensus{2*i + 1}.fa"
# let tmp_fasta = &"{outdir}{trim}.tmp_merged_subfasta{i}.fa"
# mergeFiles([subfastapath1,subfastapath2],tmp_fasta, delete_old_files = true)
# #Calculate POGraph for merged subfastas:
# let outFASTAfilepath = &"{outdir}{trim}.tmp_consensus{i}.fa"
# var outFASTAfile : File
# discard open(outFASTAfile,outFASTAfilepath,fmWrite)
# var seq_file : PFile = fopen(cstring(tmp_fasta))
# var po = getPOGraphFromFasta(seq_file,cstring(matrix_filepath),cint(1),matrix_scoring_function)
# removeFile(tmp_fasta)
# var trim_po = poGraphUtils.convertPOGraphtoTrimmedPOGraph(po)
# # Decompose graph into representative paths
# var representative_paths = poGraphUtils.getRepresentativePaths3(addr trim_po, psi = isoform_delta, ends_delta = ends_delta)
# let consensus_po = poGraphUtils.buildConsensusPO(addr po, representative_paths,&"{trim}.tmp_subfasta{i}")
# # Output those path to out outfile
# poGraphUtils.writeCorrectedReads(consensus_po,outFASTAfile)
# outFASTAfile.close()
# if bool(num_fastas mod 2):
# #uneven split, merge last subfasta with last merged subfasta
# let idx1 = num_fastas - 1
# let idx2 = (num_fastas div 2) - 1
# let last_fasta = &"{outdir}{trim}.tmp_consensus{idx1}.fa"
# let last_merge = &"{outdir}{trim}.tmp_consensus{idx2}.fa"
# let tmp_file = &"{outdir}{trim}.tmp_final_merge.fa"
# mergeFiles([last_fasta,last_merge],tmp_file, delete_old_files = true)
# moveFile(tmp_file,last_merge)
# num_fastas = num_fastas div 2
# # if num_reads == last_num_reads:
# # for i in 0..<num_fastas:
# # let tmp_fasta = &"{outdir}{trim}.tmp_subfasta{i}.fa"
# # removeFile(tmp_fasta)
# var seq_file : PFile = fopen(cstring(fasta_file), "r")
# var po2 = getPOGraphFromFasta(seq_file,cstring(matrix_filepath),cint(1),matrix_scoring_function)
# if delete_fasta_flag:
# removeFile(fasta_file)
# var trim_po2 = poGraphUtils.convertPOGraphtoTrimmedPOGraph(po2)
# var representative_paths = poGraphUtils.getRepresentativePaths3(addr trim_po2, psi = isoform_delta, ends_delta = ends_delta)
# let consensus_po = poGraphUtils.buildConsensusPO(addr po2, representative_paths,trim)
# let outFASTAfilepath = &"{outdir}fasta{os.DirSep}{trim}.consensus.fa"
# var outFASTAfile : File
# discard open(outFASTAfile,outFASTAfilepath,fmWrite)
# poGraphUtils.writeCorrectedReads(consensus_po,outFASTAfile)
# outFASTAfile.close()
# proc runPOAandCollapsePOGraph(intuple : (string,string,string,string,uint16,uint16,bool)) {.thread.} =
# let (infilepath,outdir,matrix_filepath,format,isoform_delta,ends_delta,u2t) = intuple
# let trim = infilepath.split(os.DirSep)[^1].split(".")[0]
# var fasta_file : string
# if format == "fasta":
# if u2t:
# fasta_file = &"{outdir}{trim}.tmp.fa"
# convertUtoTinFASTA(infilepath,fasta_file)
# else:
# fasta_file = infilepath
# elif format == "fastq":
# fasta_file = &"{outdir}{trim}.tmp.fa"
# convertFASTQtoFASTA(infilepath,fasta_file)
# var last_num_reads = 0
# var split_num = 200
# var (num_fastas,num_reads) = splitFASTA(fasta_file,&"{outdir}{trim}.tmp",split_num = split_num)
# var delete_fasta_flag = false
# if format == "fastq" or u2t or num_fastas > 1:
# delete_fasta_flag = true
# while num_fastas > 1 and num_reads != last_num_reads: #num_fastas > 1
# if format == "fastq":
# removeFile(fasta_file)
# let outFASTAfilepath = &"{outdir}{trim}.tmp_consensus.fa"
# var outFASTAfile : File
# discard open(outFASTAfile,outFASTAfilepath,fmWrite)
# for i in 0..<num_fastas:
# #Calculate subclusters
# let tmp_fasta = &"{outdir}{trim}.tmp_subfasta{i}.fa"
# var seq_file : PFile = fopen(cstring(tmp_fasta))
# var po = getPOGraphFromFasta(seq_file,cstring(matrix_filepath),cint(1),matrix_scoring_function)
# removeFile(tmp_fasta)
# var trim_po = poParser.convertPOGraphtoTrimmedPOGraph(po)
# var representative_paths = poParser.getRepresentativePaths3(addr trim_po, psi = isoform_delta, ends_delta = ends_delta)
# let consensus_po = poParser.buildConsensusPO(addr po, representative_paths,&"{trim}.tmp_subfasta{i}")
# poParser.writeCorrectedReads(consensus_po,outFASTAfile)
# outFASTAfile.close()
# fasta_file = outFASTAfilepath
# last_num_reads = num_reads
# split_num = split_num * 2 #As consensus sequences are collapsed, we should generate fewer spurious edges and nodes with more reads, should be able to handle more reads faster.
# (num_fastas,num_reads) = splitFASTA(fasta_file,&"{outdir}{trim}.tmp",split_num = split_num)
# if num_reads == last_num_reads:
# for i in 0..<num_fastas:
# let tmp_fasta = &"{outdir}{trim}.tmp_subfasta{i}.fa"
# removeFile(tmp_fasta)
# var seq_file : PFile = fopen(cstring(fasta_file), "r")
# var po2 = getPOGraphFromFasta(seq_file,cstring(matrix_filepath),cint(1),matrix_scoring_function)
# if delete_fasta_flag:
# removeFile(fasta_file)
# var trim_po2 = poParser.convertPOGraphtoTrimmedPOGraph(po2)
# var representative_paths = poParser.getRepresentativePaths3(addr trim_po2, psi = isoform_delta, ends_delta = ends_delta)
# let consensus_po = poParser.buildConsensusPO(addr po2, representative_paths,trim)
# let outFASTAfilepath = &"{outdir}fasta{os.DirSep}{trim}.consensus.fa"
# var outFASTAfile : File
# discard open(outFASTAfile,outFASTAfilepath,fmWrite)
# poParser.writeCorrectedReads(consensus_po,outFASTAfile)
# outFASTAfile.close()
proc runGraphBasedIlluminaCorrection(intuple : (string,string,string,uint64,uint16,uint16)) : bool {.thread.} =
  ## Worker thread: one iteration of graph-based Illumina polishing for a
  ## single cluster. Rebuilds the POA graph from iteration `iter-1`'s
  ## consensus FASTA, polishes it with the Illumina alignments in
  ## iteration `iter-1`'s BAM, and writes iteration `iter`'s consensus.
  ## Returns true when the new consensus is identical to the previous one
  ## (i.e. this cluster has converged); the previous iteration's FASTA is
  ## deleted afterwards.
  ##
  ## intuple fields: (tmp_dir, trim, matrix_filepath, iter, isoform_delta,
  ## ends_delta).
  let (tmp_dir, trim, matrix_filepath, iter,isoform_delta,ends_delta) = intuple
  let last_fasta_dir = &"{tmp_dir}{iter-1}{os.DirSep}fasta{os.DirSep}"
  let this_fasta_dir = &"{tmp_dir}{iter}{os.DirSep}fasta{os.DirSep}"
  let last_fasta_filepath = &"{last_fasta_dir}{trim}.consensus.fa"
  var seq_file : PFile = fopen(cstring(last_fasta_filepath), "r")
  # weight_support = true: edge weights reflect read support during collapse.
  var po = getPOGraphFromFasta(seq_file,matrix_filepath,cint(1),matrix_scoring_function,weight_support = true)
  let this_fasta_filepath = &"{this_fasta_dir}{trim}.consensus.fa"
  let bamfilepath = &"{tmp_dir}{iter-1}{os.DirSep}alignments.bam"
  var bam : Bam
  var trim_po = convertPOGraphtoTrimmedPOGraph(po)
  # index=true: random access into the BAM is needed for per-graph polishing.
  discard open(bam,bamfilepath,index=true)
  illuminaPolishPOGraph(addr trim_po, bam)
  var (representative_paths,read_supports) = getRepresentativePaths3(addr trim_po, psi = isoform_delta,ends_delta = ends_delta)
  var records = getFastaRecordsFromTrimmedPOGraph(addr trim_po, representative_paths, read_supports, trim)
  var outfile : File
  discard open(outfile,this_fasta_filepath,fmWrite)
  writeCorrectedReads(records,outfile)
  outfile.close()
  # Convergence check must happen before the old file is removed.
  result = sameFileContent(last_fasta_filepath,this_fasta_filepath)
  removeFile(last_fasta_filepath)
proc runLinearBasedIlluminaCorrection(intuple : (string,string,uint64,uint64,uint16,bool,int)) {.thread.} =
  ## Linear (per-read) polishing pass for a single cluster, run on a worker
  ## thread. intuple = (tmp_dir, trim, converged_iter, iter, isoform_delta,
  ## stringent, stringent_tolerance). Reads the consensus FASTA from the
  ## iteration at which this cluster converged, polishes each read's linear
  ## graph individually against the Illumina alignments in the previous
  ## iteration's BAM, optionally drops reads failing the stringency check,
  ## and writes the surviving corrected reads to this iteration's FASTA.
  let (tmp_dir, trim, converged_iter, iter,isoform_delta,stringent,stringent_tolerance) = intuple
  let last_fasta_dir = &"{tmp_dir}{converged_iter}{os.DirSep}fasta{os.DirSep}"
  let this_fasta_dir = &"{tmp_dir}{iter}{os.DirSep}fasta{os.DirSep}"
  let last_fasta_filepath = &"{last_fasta_dir}{trim}.consensus.fa"
  let this_fasta_filepath = &"{this_fasta_dir}{trim}.consensus.fa"
  let bamfilepath = &"{tmp_dir}{iter-1}{os.DirSep}alignments.bam"
  var infile : File
  var bam : Bam
  discard open(infile,last_fasta_filepath)
  var reads = parseFasta(infile)
  infile.close()
  var corrected : seq[FastaRecord]
  # BAM index is required so polishing can fetch alignments by region.
  discard open(bam,bamfilepath,index=true)
  for read in reads:
    # Each consensus read becomes its own single-path graph to polish.
    var trim_po = getTrimmedGraphFromFastaRecord(read)
    illuminaPolishPOGraph(addr trim_po, bam,debug=true)
    # Called for its side effect of populating reads[0].corrected_path;
    # the returned paths/supports are not needed here.
    discard getRepresentativePaths3(addr trim_po, psi = isoform_delta)
    # Keep the read unless stringent filtering is on and it fails the check.
    if (not stringent) or stringencyCheck(addr trim_po,trim_po.reads[0].corrected_path,stringent_tolerance = stringent_tolerance):
      corrected.add(FastaRecord(read_id : read.read_id, sequence : getSequenceFromPath(trim_po,trim_po.reads[0].corrected_path)))
  var outfile : File
  discard open(outfile,this_fasta_filepath,fmWrite)
  writeCorrectedReads(corrected,outfile)
  outfile.close()
# proc runStringencyFilter() {.thread.}
proc combineFiles(indirectory : string, intrims : openArray[string], outfilepath : string) =
  ## Concatenates every cluster's per-cluster consensus FASTA
  ## (`{indirectory}{trim}.consensus.fa`) into a single file at outfilepath.
  ## A missing cluster file is reported (poaV2 failure for that cluster) and
  ## skipped rather than aborting the merge.
  var outfile : File
  # NOTE(review): open result is discarded here as elsewhere in this file;
  # a failed open would silently produce an empty/unwritable merge.
  discard open(outfile,outfilepath,fmWrite)
  for trim in intrims:  # fix: the index variable was declared but never used
    let filepath = &"{indirectory}{trim}.consensus.fa"
    if existsFile(filepath):
      var file : File
      echo &"appending {filepath}"
      discard open(file,filepath,fmRead)
      outfile.write(file.readAll)
      file.close()
    else:
      echo &"{filepath} doesn't exist, poaV2 went wrong with that cluster"
  outfile.close()
proc combineFilesIntermediate(indirectory : string, intrims : openArray[string], outfilepath : string, last_correction : Table[int,int]) =
  ## Appends each cluster's consensus FASTA from indirectory to outfilepath,
  ## skipping clusters that already finished correcting (i.e. whose index is
  ## a key of last_correction). Missing files are reported and skipped.
  var combined : File
  discard open(combined,outfilepath,fmWrite)
  for idx, trim in intrims:
    # Guard clause instead of `continue`: only still-correcting clusters merge.
    if idx notin last_correction:
      let cluster_fasta = &"{indirectory}{trim}.consensus.fa"
      if existsFile(cluster_fasta):
        echo &"appending {cluster_fasta}"
        var cluster_file : File
        discard open(cluster_file,cluster_fasta,fmRead)
        combined.write(cluster_file.readAll)
        cluster_file.close()
      else:
        echo &"{cluster_fasta} doesn't exist, poaV2 went wrong with that cluster..."
  combined.close()
proc combineFilesFinal(tmp_directory : string,last_num : uint64, intrims : openArray[string], outfilepath : string, last_correction : Table[int,int]) =
  ## Gathers the final consensus FASTA of every cluster into one file. A
  ## cluster that converged early reads from the iteration recorded for it in
  ## last_correction; every other cluster reads from iteration last_num.
  ## Missing files are reported and skipped.
  var combined : File
  discard open(combined,outfilepath,fmWrite)
  for idx, trim in intrims:
    # Pick the iteration directory this cluster's final consensus lives in.
    let final_iter =
      if idx in last_correction:
        uint64(last_correction[idx]) #TODO convert last_correction to uint64 types.
      else:
        last_num
    let cluster_fasta = &"{tmp_directory}{final_iter}{os.DirSep}fasta{os.DirSep}{trim}.consensus.fa"
    if existsFile(cluster_fasta):
      echo &"appending {cluster_fasta}"
      var cluster_file : File
      discard open(cluster_file,cluster_fasta,fmRead)
      combined.write(cluster_file.readAll)
      cluster_file.close()
    else:
      echo &"{cluster_fasta} doesn't exist, poaV2 went wrong with that cluster..."
  combined.close()
proc getBowtie2options(opt : ConduitOptions, index_prefix, sam : string, final_polish : bool = false) : seq[string] =
  ## Builds the bowtie2 argument vector: CIGAR =/X operators, suppress
  ## unaligned reads, opt.thread_num threads, the strand/alignment-mode
  ## constraints from opt, a zero N ceiling, the index at index_prefix, the
  ## read inputs from opt.bowtie_read_inputs (space-separated), and SAM
  ## output to sam. When final_polish is set, also report up to
  ## opt.max_alignments alignments per read (-k).
  result = @["--xeq", "--no-unal", "-p", &"{opt.thread_num}"]
  if final_polish:
    result.add("-k")
    result.add(&"{opt.max_alignments}")
  result.add(opt.bowtie_strand_constraint)
  result.add(opt.bowtie_alignment_mode)
  result.add("--n-ceil")
  result.add("L,0,0")
  result.add("-x")
  result.add(index_prefix)
  # add with an openArray appends each whitespace-split input token in order.
  result.add(opt.bowtie_read_inputs.split(" "))
  result.add("-S")
  result.add(sam)
proc parseOptions() : ConduitOptions =
var clusters_directory = ""
var clusters_directory_flag = false
var mate1s : seq[string]
var mate2s : seq[string]
var unpaireds : seq[string]
var interleaved : seq[string]
var bams : seq[string]
var illumina_strand="reverse"
var illumina_strand_flag = false
var illumina_format="fastq"
var illumina_format_flag = false
var nanopore_type="drna"
var nanopore_type_flag = false
var nanopore_strand="forward"
# var nanopore_strand_flag = false
var nanopore_format="fastq"
var nanopore_format_flag = false
var u2t = true
var u2t_flag = false
var score_matrix_path = ""
var score_matrix_path_flag = false
var isoform_delta = 35'u64
var isoform_delta_flag = false
var ends_delta = 35'u64
var ends_delta_flag = false
var max_iterations = 5'u64
var max_iterations_flag = false
var illumina_weight = 10'u64
var illumina_weight_flag = false
var final_polish = true
var final_polish_flag = false
var stringent = true
var stringent_flag = false
var stringent_tolerance = 100
var stringent_tolerance_flag = false
var output_dir = &"conduit-out{os.DirSep}"
var output_dir_flag = false
var intermediates = false
var intermediates_flag = false
var local = false
var local_flag = false
var max_alignments = 50'u64
var max_alignments_flag = false
var samtools_memory = "768M"
var samtools_memory_flag = false
var help_flag = true
var version_flag = false
var tmp_dir = &"conduit-tmp{os.DirSep}"
var tmp_dir_flag = false
var thread_num = 4'u64
var thread_num_flag = false
var run_flag = true
var i = 0
var mode = ""
var last = ""
for kind, key, val in getopt():
# echo kind," ", key," ", val
if i == 0:
mode = key
i += 1
continue
if i == 1:
help_flag = false
i += 1
case mode:
of "nano", "hybrid":
case kind:
of cmdEnd:
break
of cmdShortOption, cmdLongOption:
if last != "":
echo &"ERROR - Option {last} provided without an argument"
run_flag = false
break
case key:
of "1":
if val != "":
mate1s.add(val)
else:
last = "1"
of "2":
if val != "":
mate2s.add(val)
else:
last = "2"
of "U":
if val != "":
unpaireds.add(val)
else:
last = "U"
of "interleaved":
if val != "":
interleaved.add(val)
else:
last = "interleaved"
of "b":
if val != "":
bams.add(val)
else:
last = "b"
of "drna":
if not nanopore_type_flag:
nanopore_type_flag = true
else:
run_flag = false
echo "ERROR - Multiple scaffold types input, choose one of \"--drna\", \"--cdna-rev-stranded\", or \"--cdna\""
of "cdna-rev-stranded":
if not nanopore_type_flag:
nanopore_type_flag = true
nanopore_type = "cdna"
nanopore_strand = "reverse"
else:
run_flag = false
echo "ERROR - Multiple scaffold types input, choose one of \"--drna\", \"--cdna-rev-stranded\", or \"--cdna\""
help_flag = true
break
of "cdna":
if not nanopore_type_flag:
nanopore_type_flag = true
nanopore_type = "cdna"
nanopore_strand = "unstranded"
else:
run_flag = false
echo "ERROR - Multiple scaffold types input, choose one of \"--drna\", \"--cdna-rev-stranded\", or \"--cdna\""
help_flag = true
break
of "sfq":
if not nanopore_format_flag:
#already fastq
nanopore_format_flag = true
else:
run_flag = false
echo "ERROR - Multiple scaffold format input, choose one of FASTA (--sfa) or FASTQ (--sfq)"
help_flag = true
break
of "sfa":
if not nanopore_format_flag:
#already fastq
nanopore_format_flag = true
nanopore_format = "fasta"
else:
run_flag = false
echo "ERROR - Multiple scaffold format input, choose one of FASTA (--sfa) or FASTQ (--sfq)"
help_flag = true
break
of "UtoT":
if not u2t_flag:
u2t_flag = true
elif not u2t:
run_flag = false
echo "ERROR - Conflicting flags: --UtoT and --noUtoT both specified"
break
of "noUtoT":
if not u2t_flag:
u2t_flag = true
u2t = false
elif u2t:
run_flag = false
echo "ERROR - Conflicting flags: --UtoT and --noUtoT both specified"
break
of "m", "score-matrix":
if not score_matrix_path_flag:
score_matrix_path_flag = true
if val != "":
score_matrix_path = val