more code restructuring
Nathan Roach committed Nov 21, 2020
1 parent 6616821 commit c9e7bf3
Showing 6 changed files with 193 additions and 51 deletions.
conduit.nimble (5 changes: 0 additions & 5 deletions)
@@ -17,8 +17,3 @@ before install:
exec "make"

bin = @["conduit", "conduitUtils", "conduit_clustering"]
# skipDirs = @["tests"]
# skipFiles = @["GT04008021.bam"]

# task test, "run the tests":
# exec "nim c --lineDir:on --debuginfo -r tests/all"
src/conduitUtils.nim (61 changes: 42 additions & 19 deletions)
Expand Up @@ -9,6 +9,7 @@ import poGraphUtils
import algorithm
import fasta
import fastq
import na

type
BLASTmatch* = object
@@ -49,16 +50,19 @@ type
end_idx* : uint64
introns* : seq[(uint64,uint64)]


# proc intersect(a,b : HashSet[GTFTranscript]) =
# var a_,b_ = HashSet[(string,char,seq[(uint64,uint64)])]
# for tx in a.items:
# a_.incl((tx.chr,tx.strand,tx.introns))
# for tx in b.items:
# b_.incl(())


proc conduitUtilsVersion() : string =
return "CONDUIT Utilities Version 0.1.0 by Nathan Roach ( nroach2@jhu.edu, https://github.com/NatPRoach/conduit/ )"


proc writeDefaultHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -110,6 +114,7 @@ proc writeTranslateHelp() =
echo " -l, --min-length (75)"
echo " Minimum length in Amino Acids necessary for a putative ORF to be reported"


proc writeStrandTranscriptsHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -141,6 +146,7 @@ proc writeBED2GTFHelp() =
echo " -s, --stranded"
echo " Report gtf fields with strand information"


proc writeBLASTPHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -151,6 +157,7 @@ proc writeBLASTPHelp() =
echo " <outPutativeOrthologs.tsv> Tab separated file of putative ortholog matches"
echo " In format: <Query ID>\t<Reference proteome top match ID>\t<E value>"


proc writeCompareBLASTPHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -160,6 +167,7 @@ proc writeCompareBLASTPHelp() =
echo " <reference_proteome.fa> FASTA file describing the reference proteome used in the BLASTP search"
echo " <inBLASTP.txt> Default output of BLASTP search of translated protein products vs some reference proteome"


proc writeCompareFASTAHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -169,6 +177,7 @@ proc writeCompareFASTAHelp() =
echo " <reference.fa> Reference FASTA file defining the truth set"
echo " <query.fa> Query FASTA files defining the query set"


proc writeSplitFASTAHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -178,6 +187,7 @@ proc writeSplitFASTAHelp() =
echo " <conduit_output.fa> CONDUIT produced FASTA file to be split based on number of reads supporting each isoform"
echo " <outprefix> Prefix for the fasta files to be output, suffix will describe the bin being reported"


proc writeFilterFASTAHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -191,6 +201,7 @@ proc writeFilterFASTAHelp() =
echo " -n (5)"
echo " Minimum number of reads that must support an isoform for it to be reported in the filtered FASTA"


proc writeExtractIntronsHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -200,6 +211,7 @@ proc writeExtractIntronsHelp() =
echo " <transcripts.bed12> Transcripts in BED12 format to extract introns from"
echo " <introns.bed> BED6 output of extracted introns"


proc writeCallNonCanonicalHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -210,6 +222,7 @@ proc writeCallNonCanonicalHelp() =
echo " Introns sequences can be obtained using `bedtools getfasta -name -s`"
echo " <noncanonical.txt> Read IDs of the sequences that didn't begin with GT and end with AG"


proc writeCallNovelNonCanonicalHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -220,6 +233,7 @@ proc writeCallNovelNonCanonicalHelp() =
echo " <noncanonical.txt> Read IDs specifying intron structure in the format produced by `bedtools getfasta -name`"
echo " <novel.bed> Output of introns found in the noncanonical.txt file but not found in the reference, in BED6 format"


proc writeCallOverlappingHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -230,6 +244,7 @@ proc writeCallOverlappingHelp() =
echo " <introns2.txt> Read IDs specifying introns in the format produced by `bedtools getfasta -name`"
echo " <shared_introns.txt> The introns in common between the two files"


proc parseBLASTPoutput*(infilepath : string) : seq[BLASTmatch] =
##
## Takes in BLASTP default output file and outputs seq of matching proteins
@@ -486,7 +501,8 @@ proc parseBLASTPoutput*(infilepath : string) : seq[BLASTmatch] =
except EOFError:
break
infile.close()
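
A minimal usage sketch for the parser above (not part of this commit): it assumes conduitUtils is importable from this package and that blastp_results.txt is a placeholder path to a default-format BLASTP report.

import conduitUtils

# Parse the BLASTP report into BLASTmatch records and report how many
# query/reference matches were recovered.
let matches = parseBLASTPoutput("blastp_results.txt")
echo "parsed ", matches.len, " BLASTP matches"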



proc translateORF*(nts : string,to_stop = true) : string =
let translation_table = { "TTT" : 'F',
"TTC" : 'F',
@@ -561,6 +577,7 @@ proc translateORF*(nts : string,to_stop = true) : string =
break
translation.add(aa)
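
A hedged worked example for translateORF (illustrative only, with the proc in scope): with to_stop left at its default of true, translation should halt at the first stop codon, assuming the collapsed tail of the proc returns the accumulated translation.

# ATG -> M, GCT -> A, TAA -> stop (loop breaks because to_stop = true)
echo translateORF("ATGGCTTAA")   # expected: MA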


proc findAll*(s,sub : string) : seq[int] =
# Naive scan that returns every index where sub occurs in s
var start = 0
@@ -569,23 +586,8 @@ proc findAll*(s,sub : string) : seq[int] =
if idx == -1:
break
result.add(idx)
start = idx + 1

proc revComp*(nts : string) : string =
let wc_pairs = {'A' : 'T',
'a' : 'T',
'C' : 'G',
'c' : 'G',
'T' : 'A',
't' : 'A',
'U' : 'A',
'u' : 'A',
'G' : 'C',
'g' : 'C'}.toTable()
var revcomp : seq[char]
for i in 1..nts.len:
revcomp.add(wc_pairs[nts[^i]])
result = revcomp.join("")
start = idx + 1


proc translateTranscript*(nts : string) : string =
let start_codon_indices = findAll(nts,"ATG")
@@ -595,6 +597,7 @@ proc translateTranscript*(nts : string) : string =
if translation.len > result.len:
result = translation


proc translateTranscripts*(transcripts : openArray[FastaRecord],outfilepath : string , threshold : int = 75,wrap_len : int = 60, stranded : bool = false) =
var outfile : File
discard open(outfile,outfilepath,fmWrite)
@@ -620,9 +623,11 @@ proc translateTranscripts*(transcripts : openArray[FastaRecord],outfilepath : st
outfile.write(&"{translation[wrap_len*(translation.len div wrap_len)..^1]}\n")
outfile.close()


proc translateTranscripts*(transcripts : openArray[FastqRecord],outfilepath : string , threshold : int = 75,wrap_len : int = 60,stranded = false) =
translateTranscripts(convertFASTQtoFASTA(transcripts),outfilepath,threshold,wrap_len,stranded)


proc convertBED12toGTF*(infilepath : string, outfilepath : string ,stranded : bool = false ) =
## Converts BED12 formatted file to well-formed GTF file suitable for evaluation with GFFcompare
var infile,outfile : File
@@ -665,6 +670,7 @@ proc convertBED12toGTF*(infilepath : string, outfilepath : string ,stranded : bo
infile.close()
outfile.close()
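
A minimal usage sketch for the converter above (not part of this commit; file names are placeholders, and conduitUtils is assumed to be imported): convert a BED12 annotation into GTF so it can be evaluated with GFFcompare.

# stranded = true reports strand information in the GTF fields
convertBED12toGTF("isoforms.bed12", "isoforms.gtf", stranded = true)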


proc strandTranscripts*(transcripts : openArray[FastaRecord],outfilepath : string, wrap_len : int = 60) =
var outfile : File
discard open(outfile,outfilepath,fmWrite)
@@ -687,9 +693,11 @@ proc strandTranscripts*(transcripts : openArray[FastaRecord],outfilepath : strin
outfile.write(&"{stranded[wrap_len*(stranded.len div wrap_len)..^1]}\n")
outfile.close()


proc strandTranscripts*(transcripts : openArray[FastqRecord],outfilepath : string, wrap_len : int = 60) =
strandTranscripts(convertFASTQtoFASTA(transcripts),outfilepath,wrap_len)


proc compareExactTranslations*(reference_infilepath : string, translation_infilepath : string) =
var r_infile, t_infile : File
discard open(r_infile,reference_infilepath,fmRead)
@@ -713,6 +721,7 @@ proc compareExactTranslations*(reference_infilepath : string, translation_infile
echo &"Precision: {float(tp) / float(tp+fp)}"
echo &"Recall: {float(tp) / float(tp+fn)}"


proc compareBLASTPTranslations*(reference_infilepath : string, blastp_infilepath : string) =
var fp,tp1 = 0
var reference_id_set : HashSet[string]
@@ -743,6 +752,7 @@ proc compareBLASTPTranslations*(reference_infilepath : string, blastp_infilepath
echo &"Precision (uses TP1): {float(tp1) / float(tp1+fp)}"
echo &"Recall (uses TP2): {float(tp2) / float(tp2+fn)}"


proc extractIntronsFromBED12*(infilepath : string, outfilepath : string) =
## Grabs Introns from BED12 file and reports one per line in BED format, with ID = ID from the BED12 line
var infile,outfile : File
@@ -771,6 +781,7 @@ proc extractIntronsFromBED12*(infilepath : string, outfilepath : string) =
infile.close()
outfile.close()
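
A minimal usage sketch for the intron extractor above (not part of this commit; paths are placeholders, with the proc in scope).

# Writes one BED line per intron, reusing the BED12 record's ID as the name field.
extractIntronsFromBED12("transcripts.bed12", "introns.bed")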


proc testOverlap(read : (uint64,uint64),single_exon_gene_list : seq[(uint64,uint64,string)]) : string =
result = ""
for gene in single_exon_gene_list: #gene list must be sorted
@@ -795,6 +806,7 @@ proc testOverlap(read : (uint64,uint64),single_exon_gene_list : seq[(uint64,uint
elif read[1] <= gene[0]:
break


proc callNonCanonicalSplicingFromFASTA*(infilepath : string,outfilepath : string) =
var infile,outfile : File
discard open(infile,infilepath,fmRead)
@@ -809,6 +821,7 @@ proc callNonCanonicalSplicingFromFASTA*(infilepath : string,outfilepath : string
outfile.writeLine(record.read_id)
outfile.close()
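
The call-noncanonical help text above treats an intron as canonical only when its sequence begins with GT and ends with AG; the sketch below restates that test as a standalone proc. It is an illustration, not the actual check inside callNonCanonicalSplicingFromFASTA, whose body is collapsed here.

import std/strutils

proc looksCanonical(intron: string): bool =
  # GT..AG is the canonical splice donor/acceptor pair
  intron.len >= 4 and intron.startsWith("GT") and intron.endsWith("AG")

assert looksCanonical("GTAAGTTTTTCAG")
assert not looksCanonical("ATAAGTTTTTCAC")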


proc splitFASTAByReadCounts*(infilepath : string, outfile_prefix : string, bins : openArray[uint64] = [1'u64,2'u64,5'u64,10'u64,20'u64,40'u64,80'u64,160'u64,320'u64,640'u64]) =
var infile : File
discard open(infile,infilepath, fmRead)
@@ -840,6 +853,7 @@ proc splitFASTAByReadCounts*(infilepath : string, outfile_prefix : string, bins
outfile.writeLine(record.sequence)
outfile.close()


proc filterFASTAByReadCounts*(infilepath,outfilepath : string, filter : uint64 = 5'u64) =
var infile,outfile : File
discard open(infile,infilepath, fmRead)
@@ -853,6 +867,7 @@ proc filterFASTAByReadCounts*(infilepath,outfilepath : string, filter : uint64 =
outfile.writeLine(record.sequence)
outfile.close()
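
A minimal usage sketch for the filter above (not part of this commit; paths are placeholders, with the proc in scope): keep only isoforms supported by at least five reads, matching the default threshold described in the filter help text.

filterFASTAByReadCounts("conduit_isoforms.fa", "filtered_isoforms.fa", 5'u64)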


proc parseAttributes(s : string) : Table[string,string] =
let fields = s.split(';')[0..^2]
for field in fields:
@@ -861,6 +876,7 @@ proc parseAttributes(s : string) : Table[string,string] =
let val = fields1[1].strip(leading=true,trailing=true,chars = {'"'})
result[key] = val


proc callNovelNonCanonical(reference_infilepath, infilepath,outfilepath : string,threshold : uint = 5) =
var reference_infile,infile,outfile : File
discard open(reference_infile,reference_infilepath,fmRead)
@@ -930,6 +946,7 @@ proc callNovelNonCanonical(reference_infilepath, infilepath,outfilepath : string
echo &"Total introns above threshold - {total_counter}"
echo &"Novel introns - {novel_counter}"


proc callOverlappingNonCanonical(reference_infilepath, infilepath,outfilepath : string) =
var reference_infile,infile,outfile : File
discard open(reference_infile,reference_infilepath,fmRead)
@@ -977,6 +994,7 @@ proc callOverlappingNonCanonical(reference_infilepath, infilepath,outfilepath :
echo &"Non-overlapping introns - {novel_counter}"
echo &"Overlapping introns - {overlapping_counter}"


proc assignTxIDs(reference_infilepath,infilepath,outfilepath : string) =
var reference_infile,infile,outfile : File
discard open(reference_infile,reference_infilepath,fmRead)
@@ -1060,6 +1078,7 @@ proc assignTxIDs(reference_infilepath,infilepath,outfilepath : string) =
infile.close()
outfile.close()


proc parseGTF*(infile : File) : HashSet[(string,char,seq[(uint64,uint64)])] =
var exons : Table[string,seq[(string,char,uint64,uint64)]]
try:
@@ -1113,6 +1132,7 @@ proc parseGTF*(infile : File) : HashSet[(string,char,seq[(uint64,uint64)])] =
# result.incl((total_chr,total_start,total_end,total_strand,introns))
result.incl((total_chr,total_strand,introns))


proc idNovelIsoforms*(infilepath,reference1_infilepath,reference2_infilepath,outfilepath : string) =
var infile, ref1file, ref2file, outfile : File
discard open(ref1file,reference1_infilepath,fmRead)
@@ -1139,9 +1159,11 @@ proc idNovelIsoforms*(infilepath,reference1_infilepath,reference2_infilepath,out
# discard open(outfile,outfilepath,fmWrite)
# outfile.close


proc getTxId*(s : string) : string =
result = s.split(':')[1].split('|')[0]
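
A hedged example of the header layout getTxId assumes (the exact FASTA naming convention comes from upstream tooling and is an assumption here): the transcript ID sits between the first ':' and the first '|'.

# "tx:ENST00000641515.2|gene" -> "ENST00000641515.2"
assert getTxId("tx:ENST00000641515.2|gene") == "ENST00000641515.2"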


proc getNovelLociFASTA*(infilepath,gffcompare_infilepath,outfilepath : string,field = 1) =
var novel_loci : HashSet[string]
var infile,gfffile,outfile : File
@@ -1177,8 +1199,8 @@ proc getNovelLociFASTA*(infilepath,gffcompare_infilepath,outfilepath : string,fi
writeFASTArecordsToFile(outfile,new_records)
outfile.close


proc parseOptions() : UtilOptions =

var i = 0
var mode = ""
var last = ""
@@ -1383,6 +1405,7 @@ proc parseOptions() : UtilOptions =
stranded : stranded
)


proc main() =
let opt = parseOptions()
if opt.run_flag: