more code restructuring
Nathan Roach committed Nov 21, 2020
1 parent 6616821 commit c9e7bf3
Showing 6 changed files with 193 additions and 51 deletions.
conduit.nimble (5 changes: 0 additions & 5 deletions)
@@ -17,8 +17,3 @@ before install:
exec "make"

bin = @["conduit", "conduitUtils", "conduit_clustering"]
# skipDirs = @["tests"]
# skipFiles = @["GT04008021.bam"]

# task test, "run the tests":
# exec "nim c --lineDir:on --debuginfo -r tests/all"
src/conduitUtils.nim (61 changes: 42 additions & 19 deletions)
Expand Up @@ -9,6 +9,7 @@ import poGraphUtils
import algorithm
import fasta
import fastq
import na

type
BLASTmatch* = object
@@ -49,16 +50,19 @@ type
end_idx* : uint64
introns* : seq[(uint64,uint64)]


# proc intersect(a,b : HashSet[GTFTranscript]) =
# var a_,b_ = HashSet[(string,char,seq[(uint64,uint64)])]
# for tx in a.items:
# a_.incl((tx.chr,tx.strand,tx.introns))
# for tx in b.items:
# b_.incl(())


proc conduitUtilsVersion() : string =
return "CONDUIT Utilities Version 0.1.0 by Nathan Roach ( nroach2@jhu.edu, https://github.com/NatPRoach/conduit/ )"


proc writeDefaultHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -110,6 +114,7 @@ proc writeTranslateHelp() =
echo " -l, --min-length (75)"
echo " Minimum length in Amino Acids necessary for a putative ORF to be reported"


proc writeStrandTranscriptsHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -141,6 +146,7 @@ proc writeBED2GTFHelp() =
echo " -s, --stranded"
echo " Report gtf fields with strand information"


proc writeBLASTPHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -151,6 +157,7 @@ proc writeBLASTPHelp() =
echo " <outPutativeOrthologs.tsv> Tab separated file of putative ortholog matches"
echo " In format: <Query ID>\t<Reference proteome top match ID>\t<E value>"


proc writeCompareBLASTPHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -160,6 +167,7 @@ proc writeCompareBLASTPHelp() =
echo " <reference_proteome.fa> FASTA file describing the reference proteome used in the BLASTP search"
echo " <inBLASTP.txt> Default output of BLASTP search of translated protein products vs some reference proteome"


proc writeCompareFASTAHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -169,6 +177,7 @@ proc writeCompareFASTAHelp() =
echo " <reference.fa> Reference FASTA file defining the truth set"
echo " <query.fa> Query FASTA files defining the query set"


proc writeSplitFASTAHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -178,6 +187,7 @@ proc writeSplitFASTAHelp() =
echo " <conduit_output.fa> CONDUIT produced FASTA file to be split based on number of reads supporting each isoform"
echo " <outprefix> Prefix for the fasta files to be output, suffix will describe the bin being reported"


proc writeFilterFASTAHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -191,6 +201,7 @@ proc writeFilterFASTAHelp() =
echo " -n (5)"
echo " Minimum number of reads that must support an isoform for it to be reported in the filtered FASTA"


proc writeExtractIntronsHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -200,6 +211,7 @@ proc writeExtractIntronsHelp() =
echo " <transcripts.bed12> Transcripts in BED12 format to extract introns from"
echo " <introns.bed> BED6 output of extracted introns"


proc writeCallNonCanonicalHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -210,6 +222,7 @@ proc writeCallNonCanonicalHelp() =
echo " Introns sequences can be obtained using `bedtools getfasta -name -s`"
echo " <noncanonical.txt> Read IDs of the sequences that didn't begin with GT and end with AG"


proc writeCallNovelNonCanonicalHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -220,6 +233,7 @@ proc writeCallNovelNonCanonicalHelp() =
echo " <noncanonical.txt> Read IDs specifying intron structure in the format produced by `bedtools getfasta -name`"
echo " <novel.bed> Output of introns found in the noncanonical.txt file but not found in the reference, in BED6 format"


proc writeCallOverlappingHelp() =
echo "CONDUIT - CONsensus Decomposition Utility In Transcriptome-assembly:"
echo conduitUtilsVersion()
@@ -230,6 +244,7 @@ proc writeCallOverlappingHelp() =
echo " <introns2.txt> Read IDs specifying introns in the format produced by `bedtools getfasta -name`"
echo " <shared_introns.txt> The introns in common between the two files"


proc parseBLASTPoutput*(infilepath : string) : seq[BLASTmatch] =
##
## Takes in BLASTP default output file and outputs seq of matching proteins
@@ -486,7 +501,8 @@ proc parseBLASTPoutput*(infilepath : string) : seq[BLASTmatch] =
except EOFError:
break
infile.close()
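
A minimal usage sketch for the parser above (not part of this commit): it assumes conduitUtils is importable from this package and that blastp_results.txt is a placeholder path to a default-format BLASTP report.

import conduitUtils

# Parse the BLASTP report into BLASTmatch records and report how many
# query/reference matches were recovered.
let matches = parseBLASTPoutput("blastp_results.txt")
echo "parsed ", matches.len, " BLASTP matches"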



proc translateORF*(nts : string,to_stop = true) : string =
let translation_table = { "TTT" : 'F',
"TTC" : 'F',
@@ -561,6 +577,7 @@ proc translateORF*(nts : string,to_stop = true) : string =
break
translation.add(aa)
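
A hedged worked example for translateORF (illustrative only, with the proc in scope): with to_stop left at its default of true, translation should halt at the first stop codon, assuming the collapsed tail of the proc returns the accumulated translation.

# ATG -> M, GCT -> A, TAA -> stop (loop breaks because to_stop = true)
echo translateORF("ATGGCTTAA")   # expected: MA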


proc findAll*(s,sub : string) : seq[int] =
# Naive scan that returns every index where sub occurs in s
var start = 0
@@ -569,23 +586,8 @@ proc findAll*(s,sub : string) : seq[int] =
if idx == -1:
break
result.add(idx)
start = idx + 1

proc revComp*(nts : string) : string =
let wc_pairs = {'A' : 'T',
'a' : 'T',
'C' : 'G',
'c' : 'G',
'T' : 'A',
't' : 'A',
'U' : 'A',
'u' : 'A',
'G' : 'C',
'g' : 'C'}.toTable()
var revcomp : seq[char]
for i in 1..nts.len:
revcomp.add(wc_pairs[nts[^i]])
result = revcomp.join("")
start = idx + 1


proc translateTranscript*(nts : string) : string =
let start_codon_indices = findAll(nts,"ATG")
@@ -595,6 +597,7 @@ proc translateTranscript*(nts : string) : string =
if translation.len > result.len:
result = translation


proc translateTranscripts*(transcripts : openArray[FastaRecord],outfilepath : string , threshold : int = 75,wrap_len : int = 60, stranded : bool = false) =
var outfile : File
discard open(outfile,outfilepath,fmWrite)
@@ -620,9 +623,11 @@ proc translateTranscripts*(transcripts : openArray[FastaRecord],outfilepath : st
outfile.write(&"{translation[wrap_len*(translation.len div wrap_len)..^1]}\n")
outfile.close()


proc translateTranscripts*(transcripts : openArray[FastqRecord],outfilepath : string , threshold : int = 75,wrap_len : int = 60,stranded = false) =
translateTranscripts(convertFASTQtoFASTA(transcripts),outfilepath,threshold,wrap_len,stranded)


proc convertBED12toGTF*(infilepath : string, outfilepath : string ,stranded : bool = false ) =
## Converts BED12 formatted file to well-formed GTF file suitable for evaluation with GFFcompare
var infile,outfile : File
@@ -665,6 +670,7 @@ proc convertBED12toGTF*(infilepath : string, outfilepath : string ,stranded : bo
infile.close()
outfile.close()
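
A minimal usage sketch for the converter above (not part of this commit; file names are placeholders, and conduitUtils is assumed to be imported): convert a BED12 annotation into GTF so it can be evaluated with GFFcompare.

# stranded = true reports strand information in the GTF fields
convertBED12toGTF("isoforms.bed12", "isoforms.gtf", stranded = true)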


proc strandTranscripts*(transcripts : openArray[FastaRecord],outfilepath : string, wrap_len : int = 60) =
var outfile : File
discard open(outfile,outfilepath,fmWrite)
@@ -687,9 +693,11 @@ proc strandTranscripts*(transcripts : openArray[FastaRecord],outfilepath : strin
outfile.write(&"{stranded[wrap_len*(stranded.len div wrap_len)..^1]}\n")
outfile.close()


proc strandTranscripts*(transcripts : openArray[FastqRecord],outfilepath : string, wrap_len : int = 60) =
strandTranscripts(convertFASTQtoFASTA(transcripts),outfilepath,wrap_len)


proc compareExactTranslations*(reference_infilepath : string, translation_infilepath : string) =
var r_infile, t_infile : File
discard open(r_infile,reference_infilepath,fmRead)
@@ -713,6 +721,7 @@ proc compareExactTranslations*(reference_infilepath : string, translation_infile
echo &"Precision: {float(tp) / float(tp+fp)}"
echo &"Recall: {float(tp) / float(tp+fn)}"


proc compareBLASTPTranslations*(reference_infilepath : string, blastp_infilepath : string) =
var fp,tp1 = 0
var reference_id_set : HashSet[string]
@@ -743,6 +752,7 @@ proc compareBLASTPTranslations*(reference_infilepath : string, blastp_infilepath
echo &"Precision (uses TP1): {float(tp1) / float(tp1+fp)}"
echo &"Recall (uses TP2): {float(tp2) / float(tp2+fn)}"


proc extractIntronsFromBED12*(infilepath : string, outfilepath : string) =
## Grabs Introns from BED12 file and reports one per line in BED format, with ID = ID from the BED12 line
var infile,outfile : File
@@ -771,6 +781,7 @@ proc extractIntronsFromBED12*(infilepath : string, outfilepath : string) =
infile.close()
outfile.close()
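
A minimal usage sketch for the intron extractor above (not part of this commit; paths are placeholders, with the proc in scope).

# Writes one BED line per intron, reusing the BED12 record's ID as the name field.
extractIntronsFromBED12("transcripts.bed12", "introns.bed")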


proc testOverlap(read : (uint64,uint64),single_exon_gene_list : seq[(uint64,uint64,string)]) : string =
result = ""
for gene in single_exon_gene_list: #gene list must be sorted
@@ -795,6 +806,7 @@ proc testOverlap(read : (uint64,uint64),single_exon_gene_list : seq[(uint64,uint
elif read[1] <= gene[0]:
break


proc callNonCanonicalSplicingFromFASTA*(infilepath : string,outfilepath : string) =
var infile,outfile : File
discard open(infile,infilepath,fmRead)
@@ -809,6 +821,7 @@ proc callNonCanonicalSplicingFromFASTA*(infilepath : string,outfilepath : string
outfile.writeLine(record.read_id)
outfile.close()
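
The call-noncanonical help text above treats an intron as canonical only when its sequence begins with GT and ends with AG; the sketch below restates that test as a standalone proc. It is an illustration, not the actual check inside callNonCanonicalSplicingFromFASTA, whose body is collapsed here.

import std/strutils

proc looksCanonical(intron: string): bool =
  # GT..AG is the canonical splice donor/acceptor pair
  intron.len >= 4 and intron.startsWith("GT") and intron.endsWith("AG")

assert looksCanonical("GTAAGTTTTTCAG")
assert not looksCanonical("ATAAGTTTTTCAC")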


proc splitFASTAByReadCounts*(infilepath : string, outfile_prefix : string, bins : openArray[uint64] = [1'u64,2'u64,5'u64,10'u64,20'u64,40'u64,80'u64,160'u64,320'u64,640'u64]) =
var infile : File
discard open(infile,infilepath, fmRead)
@@ -840,6 +853,7 @@ proc splitFASTAByReadCounts*(infilepath : string, outfile_prefix : string, bins
outfile.writeLine(record.sequence)
outfile.close()


proc filterFASTAByReadCounts*(infilepath,outfilepath : string, filter : uint64 = 5'u64) =
var infile,outfile : File
discard open(infile,infilepath, fmRead)
@@ -853,6 +867,7 @@ proc filterFASTAByReadCounts*(infilepath,outfilepath : string, filter : uint64 =
outfile.writeLine(record.sequence)
outfile.close()
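
A minimal usage sketch for the filter above (not part of this commit; paths are placeholders, with the proc in scope): keep only isoforms supported by at least five reads, matching the default threshold described in the filter help text.

filterFASTAByReadCounts("conduit_isoforms.fa", "filtered_isoforms.fa", 5'u64)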


proc parseAttributes(s : string) : Table[string,string] =
let fields = s.split(';')[0..^2]
for field in fields:
@@ -861,6 +876,7 @@ proc parseAttributes(s : string) : Table[string,string] =
let val = fields1[1].strip(leading=true,trailing=true,chars = {'"'})
result[key] = val


proc callNovelNonCanonical(reference_infilepath, infilepath,outfilepath : string,threshold : uint = 5) =
var reference_infile,infile,outfile : File
discard open(reference_infile,reference_infilepath,fmRead)
@@ -930,6 +946,7 @@ proc callNovelNonCanonical(reference_infilepath, infilepath,outfilepath : string
echo &"Total introns above threshold - {total_counter}"
echo &"Novel introns - {novel_counter}"


proc callOverlappingNonCanonical(reference_infilepath, infilepath,outfilepath : string) =
var reference_infile,infile,outfile : File
discard open(reference_infile,reference_infilepath,fmRead)
@@ -977,6 +994,7 @@ proc callOverlappingNonCanonical(reference_infilepath, infilepath,outfilepath :
echo &"Non-overlapping introns - {novel_counter}"
echo &"Overlapping introns - {overlapping_counter}"


proc assignTxIDs(reference_infilepath,infilepath,outfilepath : string) =
var reference_infile,infile,outfile : File
discard open(reference_infile,reference_infilepath,fmRead)
@@ -1060,6 +1078,7 @@ proc assignTxIDs(reference_infilepath,infilepath,outfilepath : string) =
infile.close()
outfile.close()


proc parseGTF*(infile : File) : HashSet[(string,char,seq[(uint64,uint64)])] =
var exons : Table[string,seq[(string,char,uint64,uint64)]]
try:
@@ -1113,6 +1132,7 @@ proc parseGTF*(infile : File) : HashSet[(string,char,seq[(uint64,uint64)])] =
# result.incl((total_chr,total_start,total_end,total_strand,introns))
result.incl((total_chr,total_strand,introns))


proc idNovelIsoforms*(infilepath,reference1_infilepath,reference2_infilepath,outfilepath : string) =
var infile, ref1file, ref2file, outfile : File
discard open(ref1file,reference1_infilepath,fmRead)
@@ -1139,9 +1159,11 @@ proc idNovelIsoforms*(infilepath,reference1_infilepath,reference2_infilepath,out
# discard open(outfile,outfilepath,fmWrite)
# outfile.close


proc getTxId*(s : string) : string =
result = s.split(':')[1].split('|')[0]
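
A hedged example of the header layout getTxId assumes (the exact FASTA naming convention comes from upstream tooling and is an assumption here): the transcript ID sits between the first ':' and the first '|'.

# "tx:ENST00000641515.2|gene" -> "ENST00000641515.2"
assert getTxId("tx:ENST00000641515.2|gene") == "ENST00000641515.2"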


proc getNovelLociFASTA*(infilepath,gffcompare_infilepath,outfilepath : string,field = 1) =
var novel_loci : HashSet[string]
var infile,gfffile,outfile : File
@@ -1177,8 +1199,8 @@ proc getNovelLociFASTA*(infilepath,gffcompare_infilepath,outfilepath : string,fi
writeFASTArecordsToFile(outfile,new_records)
outfile.close


proc parseOptions() : UtilOptions =

var i = 0
var mode = ""
var last = ""
@@ -1383,6 +1405,7 @@ proc parseOptions() : UtilOptions =
stranded : stranded
)


proc main() =
let opt = parseOptions()
if opt.run_flag: