Skip to content

Commit

Permalink
Merge pull request #118 from J35P312/master
Browse files Browse the repository at this point in the history
TIDDIT 3.8.0
  • Loading branch information
J35P312 authored Jul 19, 2024
2 parents 17172e6 + 7b40f59 commit 9bb201f
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 62 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ tiddit --sv --help
tiddit --cov --help
```

Optionally, the assembly calling may be turned off using the "--skip_assembly" option.

TIDDIT may be installed using bioconda:
```
conda install tiddit
Expand All @@ -60,6 +62,7 @@ Where bam is the input bam or cram file. And reference.fasta is the reference fa

TIDDIT may be fine-tuned by altering these optional parameters:


-o output prefix(default=output)
-i paired reads maximum allowed insert size. Pairs aligning on the same chr at a distance higher than this are considered candidates for SV (default= 99.9th percentile of insert size)
-d expected reads orientations, possible values "innie" (-> <-) or "outtie" (<- ->). Default: major orientation within the dataset
Expand All @@ -74,6 +77,7 @@ TIDDIT may be fine-tuned by altering these optional parameters:
-s Number of reads to sample when computing library statistics(default=25000000)
-z minimum variant size (default=50), variants smaller than this will not be printed (z < 10 is not recommended)
--force_ploidy force the ploidy to be set to -n across the entire genome (i.e skip coverage normalisation of chromosomes)
--force_overwrite force the analysis and overwrite any data in the output folder
--n_mask exclude regions from coverage calculation if they contain more than this fraction of N (default = 0.5)
--skip_assembly Skip running local assembly; TIDDIT will perform worse, but won't require fermi2, ropebwt, or a bwa-indexed reference
--bwa path to bwa executable file(default=bwa)
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@
"tiddit/tiddit_coverage.pyx",
"tiddit/tiddit_cluster.pyx",
"tiddit/tiddit_coverage_analysis.pyx",
"tiddit/tiddit_gc.pyx",
"tiddit/tiddit_variant.pyx",
"tiddit/tiddit_contig_analysis.pyx"])
else:
ext_modules = []

setup(
name = 'tiddit',
version = '3.7.0',
version = '3.8.0',


url = "https://github.com/SciLifeLab/TIDDIT",
Expand Down
13 changes: 10 additions & 3 deletions tiddit/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@
import tiddit.tiddit_cluster as tiddit_cluster
import tiddit.tiddit_variant as tiddit_variant
import tiddit.tiddit_contig_analysis as tiddit_contig_analysis
import tiddit.tiddit_gc as tiddit_gc

def main():
version="3.7.0"
version="3.8.0"
parser = argparse.ArgumentParser("""tiddit-{}""".format(version),add_help=False)
parser.add_argument("--sv" , help="call structural variation", required=False, action="store_true")
parser.add_argument("--cov" , help="generate a coverage bed file", required=False, action="store_true")
Expand Down Expand Up @@ -95,6 +96,10 @@ def main():
samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=args.ref)

bam_header=samfile.header
chromosomes=[]

for chr in bam_header["SQ"]:
chromosomes.append(chr["SN"])
samfile.close()


Expand Down Expand Up @@ -149,8 +154,10 @@ def main():
print("extracted signals in:")
print(t-time.time())

gc_dictionary=tiddit_gc.main(args.ref,chromosomes,args.threads,50,0.5)

t=time.time()
library=tiddit_coverage_analysis.determine_ploidy(coverage_data,contigs,library,args.n,prefix,args.c,args.ref,50,bam_header)
library=tiddit_coverage_analysis.determine_ploidy(coverage_data,contigs,library,args.n,prefix,args.c,args.ref,50,bam_header,gc_dictionary)
print("calculated coverage in:")
print(time.time()-t)

Expand Down Expand Up @@ -179,7 +186,7 @@ def main():
f.write(vcf_header+"\n")

t=time.time()
variants=tiddit_variant.main(bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len)
variants=tiddit_variant.main(bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,gc_dictionary)
print("analyzed clusters in")
print(time.time()-t)

Expand Down
51 changes: 1 addition & 50 deletions tiddit/tiddit_coverage_analysis.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,56 +6,7 @@ import gzip

import tiddit.tiddit_coverage as tiddit_coverage


def get_gc(str reference_fasta,bam_header,bin_size):

gc=tiddit_coverage.create_coverage(bam_header,bin_size)[0]

reference={}
chromosomes=[]
if not reference_fasta.endswith(".gz"):
with open(reference_fasta, 'r') as f:
sequence=f.read()
else:
with gzip.open(reference_fasta, 'r') as f:
sequence=f.read()


#split_reference=sequence.split(">")
split_reference=re.split("\n>|^>", sequence)
del sequence
del split_reference[0]

for chromosome in split_reference:
content=chromosome.split("\n",1)
reference[content[0].strip().split()[0]]=content[1].replace("\n","")

N=set(["N","n"])
GC=set(["G","g","C","c"])

for chromosome in gc:
for i in range(0,len(gc)):
start=i*bin_size
end=(i+1)*bin_size
if end > len(reference[chromosome]):
end=len(reference[chromosome])

N_count=0
GC_count=0
for j in range(start,end):
if reference[chromosome][j] in N:
N_count +=1

#masked bin
if N_count > bin_size/2:
gc[chromosome][i] = -1

return(gc)


def determine_ploidy(dict coverage_data,contigs,dict library,int ploidy,str prefix,c, str reference_fasta,int bin_size,bam_header):

gc=get_gc(reference_fasta,bam_header,bin_size)
def determine_ploidy(dict coverage_data,contigs,dict library,int ploidy,str prefix,c, str reference_fasta,int bin_size,bam_header,gc):

f=open( "{}.ploidies.tab".format(prefix),"w" )
f.write("Chromosome\tPloidy\tPloidy_rounded\tMean_coverage\n")
Expand Down
45 changes: 45 additions & 0 deletions tiddit/tiddit_gc.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import pysam
import numpy
import math
from joblib import Parallel, delayed

def binned_gc(fasta_path,contig,bin_size,n_cutoff):
    """Compute the per-bin GC percentage of one contig.

    The contig is scanned in bin_size-wide windows. A bin whose fraction of
    N/n bases exceeds n_cutoff is masked with -1; otherwise the bin stores
    its GC content as an integer percentage (0-100).

    Parameters:
        fasta_path -- path to a (faidx-indexed) reference FASTA
        contig     -- contig/chromosome name to scan
        bin_size   -- window width in bases
        n_cutoff   -- maximum tolerated N fraction before a bin is masked

    Returns [contig, numpy.int8 array of length ceil(len(contig)/bin_size)].
    """
    fasta=pysam.FastaFile(fasta_path)
    contig_length=fasta.get_reference_length(contig)
    elements=int(math.ceil(contig_length/bin_size))

    contig_gc=numpy.zeros(elements,dtype=numpy.int8)

    start=0
    for i in range(0,elements):
        window=fasta.fetch(contig, start, start+bin_size)
        n=0
        gc=0

        for base in window:
            if base == "N" or base == "n":
                n+=1
            elif base == "C" or base == "c" or base == "G" or base == "g":
                gc+=1

        if n/bin_size > n_cutoff:
            contig_gc[i]=-1

        else:
            # Bug fix: normalise by the bin width, not by the number of bins
            # in the contig ("elements") as before — the old divisor made the
            # stored value depend on contig length rather than bin content.
            # NOTE(review): the final partial bin is still divided by the full
            # bin_size, slightly underestimating its GC — confirm acceptable.
            contig_gc[i]=round(100*gc/bin_size)

        start+=bin_size

    # Release the FASTA handle; the original leaked it for every worker call.
    fasta.close()
    return([contig,contig_gc])

def main(reference,contigs,threads,bin_size,n_cutoff):
    """Compute binned GC profiles for every contig in parallel.

    Fans binned_gc out over `threads` joblib workers and collects the
    results into a dict keyed by contig name.

    Returns {contig_name: numpy array of per-bin GC values}.
    """
    per_contig=Parallel(n_jobs=threads)( delayed(binned_gc)(reference,contig,bin_size,n_cutoff) for contig in contigs)

    return({name: profile for name, profile in per_contig})


#contigs=["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17"]
2 changes: 1 addition & 1 deletion tiddit/tiddit_signal.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_i
bam_header=samfile.header
samfile.close()
cdef int bin_size=50
cdef str file_type=u"wig"
cdef str file_type="wig"
cdef str outfile=prefix+".tiddit_coverage.wig"

cdef long t_tot=0
Expand Down
18 changes: 11 additions & 7 deletions tiddit/tiddit_variant.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,11 @@ def find_sv_type(chrA,chrB,inverted,non_inverted,args,sample_data,samples,librar
return("DUP:INV",cn)
else:
return("DUP:TANDEM",cn)
elif cn < p:
return("DEL",cn)

elif inverted > non_inverted:
if inverted > non_inverted:
return("INV",cn)
elif cn < p:
return("DEL",cn)
else:
return("BND",cn)

Expand Down Expand Up @@ -231,7 +231,7 @@ def sv_filter(sample_data,args,chrA,chrB,posA,posB,max_ins_len,n_discordants,n_s

return(filt)

def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len,contig_seqs):
def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len,contig_seqs,gc):
cdef AlignmentFile samfile = AlignmentFile(bam_file_name, "r",reference_filename=args.ref,index_filename="{}_tiddit/{}.csi".format(args.o,samples[0]))
variants=[]

Expand Down Expand Up @@ -270,6 +270,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar

s=int(math.floor(sv_clusters[chrA][chrB][cluster]["startB"]/50.0))
e=int(math.floor(sv_clusters[chrA][chrB][cluster]["endB"]/50.0))+1

avg_b=numpy.average(coverage_data[chrB][s:e])

if avg_b == 0:
Expand Down Expand Up @@ -302,7 +303,10 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
else:
s=int(math.floor(posA/50.0))
e=int(math.floor(posB/50.0))+1
sample_data[sample]["covM"]=numpy.average(coverage_data[chrA][s:e] )
coverage_between=coverage_data[chrA][s:e]
gc_between=gc[chrA][s:e]

sample_data[sample]["covM"]=numpy.average(coverage_between[ gc_between > -1 ] )

inverted=0
non_inverted=0
Expand Down Expand Up @@ -528,7 +532,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
samfile.close()
return(variants)

def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len):
def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len,gc):
contig_seqs={}
new_seq=False
if not args.skip_assembly:
Expand All @@ -547,7 +551,7 @@ def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,sampl
for chrB in sv_clusters[chrA]:
variants[chrB]=[]

variants_list=Parallel(n_jobs=args.threads,prefer="threads",timeout=99999)( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs) for chrA in sv_clusters)
variants_list=Parallel(n_jobs=args.threads,prefer="threads",timeout=99999)( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs,gc) for chrA in sv_clusters)

ratios={"fragments_A":[],"fragments_B":[],"reads_A":[],"reads_B":[]}
for v in variants_list:
Expand Down

0 comments on commit 9bb201f

Please sign in to comment.