Merge pull request #116 from J35P312/master
TIDDIT 3.7.0
J35P312 authored Jun 28, 2024
2 parents 0afb2d4 + ddc3b9e commit 17172e6
Showing 6 changed files with 92 additions and 50 deletions.
10 changes: 8 additions & 2 deletions setup.py
@@ -1,5 +1,9 @@
from setuptools import setup
import numpy
import pyximport
import pysam
pyximport.install()


try:
    from Cython.Build import cythonize
@@ -20,14 +24,16 @@

setup(
name = 'tiddit',
version = '3.6.1',
version = '3.7.0',


url = "https://github.com/SciLifeLab/TIDDIT",
author = "Jesper Eisfeldt",
author_email= "jesper.eisfeldt@scilifelab.se",
ext_modules = ext_modules,
include_dirs=[numpy.get_include()],
extra_link_args=pysam.get_libraries(),
define_macros=pysam.get_defines(),
include_dirs=[numpy.get_include()]+pysam.get_include(),
packages = ['tiddit'],
install_requires = ['numpy','pysam'],
entry_points = {'console_scripts': ['tiddit = tiddit.__main__:main']},
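
The setup.py change above compiles the Cython extensions against pysam's bundled htslib, passing pysam's include paths, macros, and libraries to the build. A minimal sketch of that pattern, using a hypothetical module name rather than the project's actual build configuration:

from setuptools import setup, Extension
import numpy
import pysam
from Cython.Build import cythonize

# Hypothetical single-extension build; TIDDIT's real setup.py compiles several .pyx modules.
ext = Extension(
    "example_module",
    sources=["example_module.pyx"],
    include_dirs=[numpy.get_include()] + pysam.get_include(),  # numpy and htslib headers
    define_macros=pysam.get_defines(),      # macros pysam/htslib were compiled with
    extra_link_args=pysam.get_libraries(),  # link against pysam's shared libraries
)

setup(name="example", ext_modules=cythonize([ext]))
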
25 changes: 20 additions & 5 deletions tiddit/__main__.py
@@ -17,7 +17,7 @@
import tiddit.tiddit_contig_analysis as tiddit_contig_analysis

def main():
version="3.6.1"
version="3.7.0"
parser = argparse.ArgumentParser("""tiddit-{}""".format(version),add_help=False)
parser.add_argument("--sv" , help="call structural variation", required=False, action="store_true")
parser.add_argument("--cov" , help="generate a coverage bed file", required=False, action="store_true")
@@ -27,6 +27,7 @@ def main():

parser = argparse.ArgumentParser("""tiddit --sv --bam inputfile [-o prefix] --ref ref.fasta""")
parser.add_argument('--sv' , help="call structural variation", required=False, action="store_true")
parser.add_argument('--force_overwrite' , help="force the analysis and overwrite any data in the output folder", required=False, action="store_true")
parser.add_argument('--bam', type=str,required=True, help="coordinate sorted bam file(required)")
parser.add_argument('-o', type=str,default="output", help="output prefix(default=output)")
parser.add_argument('-i', type=int, help="paired reads maximum allowed insert size. Pairs aligning on the same chr at a distance higher than this are considered candidates for SV (default= 99.9th percentile of insert size)")
@@ -48,6 +49,7 @@ def main():
parser.add_argument('--fermi2', type=str,default="fermi2", help="path to fermi2 executable file (default=fermi2)")
parser.add_argument('--ropebwt2', type=str , default="ropebwt2", help="path to ropebwt2 executable file (default=ropebwt2)")
parser.add_argument('--skip_assembly', action="store_true", help="Skip running local assembly, tiddit will perform worse, but won't require fermi2, bwa, ropebwt and bwa indexed ref")
#parser.add_argument('--skip_index', action="store_true", help="Do not generate the csi index")
parser.add_argument('--p_ratio', type=float,default=0.1, help="minimum discordant pair/normal pair ratio at the breakpoint junction(default=0.1)")
parser.add_argument('--r_ratio', type=float,default=0.1, help="minimum split read/coverage ratio at the breakpoint junction(default=0.1)")
parser.add_argument('--max_coverage', type=float,default=4, help="filter call if X times higher than chromosome average coverage (default=4)")
@@ -115,10 +117,21 @@ def main():
i+=1

prefix=args.o
os.mkdir(f"{prefix}_tiddit")
os.mkdir(f"{prefix}_tiddit/clips")
try:
    os.mkdir(f"{prefix}_tiddit")
    os.mkdir(f"{prefix}_tiddit/clips")
except:
    if args.force_overwrite:
        pass
    else:
        print("Error: output folder exists")
        quit()

pysam.index("-c","-m","6","-@",str(args.threads),bam_file_name,"{}_tiddit/{}.csi".format(args.o,sample_id))
#if not args.skip_index:
t=time.time()
print("Creating index")
pysam.index("-c","-m","4","-@",str(args.threads),bam_file_name,"{}_tiddit/{}.csi".format(args.o,sample_id))
print("Created index in: " + str(time.time()-t) )

min_mapq=args.q
max_ins_len=100000
@@ -132,7 +145,7 @@ def main():


t=time.time()
coverage_data=tiddit_signal.main(bam_file_name,args.ref,prefix,min_mapq,max_ins_len,sample_id,args.threads,args.min_contig)
coverage_data=tiddit_signal.main(bam_file_name,args.ref,prefix,min_mapq,max_ins_len,sample_id,args.threads,args.min_contig,False)
print("extracted signals in:")
print(time.time()-t)

@@ -153,6 +166,8 @@

if not args.e:
    args.e=int(library["avg_insert_size"]/2.0)
if not args.e:
    args.e=50

t=time.time()
sv_clusters=tiddit_cluster.main(prefix,contigs,contig_length,samples,library["mp"],args.e,args.l,max_ins_len,args.min_contig,args.skip_assembly,args.r)
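
The __main__.py changes above bump the version to 3.7.0, add a --force_overwrite flag guarding the output folder, build a CSI index up front (with -m 4 instead of -m 6, and timing output), and fall back to e=50 when the insert-size heuristic yields 0. A sketch of the CSI indexing call, with illustrative paths and thread count:

import time
import pysam

t = time.time()
# "-c" writes a CSI index (handles contigs longer than 2^29 bp),
# "-m 4" sets the minimum interval size to 2^4, "-@" adds worker threads.
pysam.index("-c", "-m", "4", "-@", "4", "sample.bam", "output_tiddit/sample.csi")
print("Created index in: " + str(time.time() - t))
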
20 changes: 9 additions & 11 deletions tiddit/tiddit_cluster.pyx
@@ -2,8 +2,7 @@ import sys
import os
import tiddit.DBSCAN as DBSCAN
import numpy
import statistics
from statistics import mode
from collections import Counter

def find_discordant_pos(fragment,is_mp):
    if is_mp:
@@ -267,16 +266,16 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi


if candidates[chrA][chrB][candidate]["N_splits"] and min_reads <= candidates[chrA][chrB][candidate]["N_splits"]:
candidates[chrA][chrB][candidate]["posA"]=mode(candidates[chrA][chrB][candidate]["positions_A"]["splits"])
candidates[chrA][chrB][candidate]["posB"]=mode(candidates[chrA][chrB][candidate]["positions_B"]["splits"])
candidates[chrA][chrB][candidate]["posA"]=Counter(candidates[chrA][chrB][candidate]["positions_A"]["splits"]).most_common(1)[0][0]
candidates[chrA][chrB][candidate]["posB"]=Counter(candidates[chrA][chrB][candidate]["positions_B"]["splits"]).most_common(1)[0][0]

elif candidates[chrA][chrB][candidate]["N_contigs"]:
candidates[chrA][chrB][candidate]["posA"]=mode(candidates[chrA][chrB][candidate]["positions_A"]["contigs"])
candidates[chrA][chrB][candidate]["posB"]=mode(candidates[chrA][chrB][candidate]["positions_B"]["contigs"])
candidates[chrA][chrB][candidate]["posA"]=Counter(candidates[chrA][chrB][candidate]["positions_A"]["contigs"]).most_common(1)[0][0]
candidates[chrA][chrB][candidate]["posB"]=Counter(candidates[chrA][chrB][candidate]["positions_B"]["contigs"]).most_common(1)[0][0]

elif candidates[chrA][chrB][candidate]["N_splits"]:
candidates[chrA][chrB][candidate]["posA"]=mode(candidates[chrA][chrB][candidate]["positions_A"]["splits"])
candidates[chrA][chrB][candidate]["posB"]=mode(candidates[chrA][chrB][candidate]["positions_B"]["splits"])
candidates[chrA][chrB][candidate]["posA"]=Counter(candidates[chrA][chrB][candidate]["positions_A"]["splits"]).most_common(1)[0][0]
candidates[chrA][chrB][candidate]["posB"]=Counter(candidates[chrA][chrB][candidate]["positions_B"]["splits"]).most_common(1)[0][0]

else:
reverse_A = candidates[chrA][chrB][candidate]["positions_A"]["orientation_discordants"].count("True")
@@ -329,9 +328,8 @@
candidates[chrA][chrB][candidate]["posB"]=min(candidates[chrA][chrB][candidate]["positions_B"]["discordants"])

else:
candidates[chrA][chrB][candidate]["posA"]=mode(candidates[chrA][chrB][candidate]["positions_A"]["discordants"])
candidates[chrA][chrB][candidate]["posB"]=mode(candidates[chrA][chrB][candidate]["positions_B"]["discordants"])

candidates[chrA][chrB][candidate]["posA"]=Counter(candidates[chrA][chrB][candidate]["positions_A"]["discordants"]).most_common(1)[0][0]
candidates[chrA][chrB][candidate]["posB"]=Counter(candidates[chrA][chrB][candidate]["positions_B"]["discordants"]).most_common(1)[0][0]

candidates[chrA][chrB][candidate]["startB"]=min(candidates[chrA][chrB][candidate]["positions_B"]["start"])
candidates[chrA][chrB][candidate]["endB"]=max(candidates[chrA][chrB][candidate]["positions_B"]["end"])
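
The tiddit_cluster.pyx change replaces statistics.mode with collections.Counter. statistics.mode raises StatisticsError on multimodal input under Python < 3.8, whereas Counter(...).most_common(1)[0][0] always returns a single position, resolving ties in favour of the first value encountered. A small sketch with made-up breakpoint positions:

from collections import Counter

positions = [100, 100, 250, 250, 300]  # hypothetical split-read positions
pos = Counter(positions).most_common(1)[0][0]
print(pos)  # 100: highest count, first encountered among the tied values
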
31 changes: 19 additions & 12 deletions tiddit/tiddit_signal.pyx
@@ -4,11 +4,12 @@ import os
import itertools
import time
from joblib import Parallel, delayed
from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment

import tiddit.tiddit_coverage as tiddit_coverage

def find_SA_query_range(SA):
a =pysam.AlignedSegment()
cdef AlignedSegment a = pysam.AlignedSegment()
a.reference_start=int( SA[1] )

if SA[2] == "+":
@@ -143,21 +144,27 @@ def SA_analysis(read,min_q,tag,reference_name):

return(split)

def worker(str chromosome, str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int bin_size):
def worker(str chromosome, str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int bin_size,skip_index):
print("Collecting signals on contig: {}".format(chromosome))
samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref,index_filename="{}_tiddit/{}.csi".format(prefix,sample_id))

bam_index="{}_tiddit/{}.csi".format(prefix,sample_id)
if skip_index:
    bam_index=False

cdef AlignmentFile samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref,index_filename=bam_index)
bam_header=samfile.header
coverage_data,end_bin_size=tiddit_coverage.create_coverage(bam_header,bin_size,chromosome)

clips=[]
data=[]
splits=[]
cdef list clips=[]
cdef list data=[]
cdef list splits=[]

clip_dist=100
cdef int clip_dist=100

cdef long read_position
cdef long read_end
cdef int mapq
cdef AlignedSegment read

for read in samfile.fetch(chromosome,until_eof=True):

@@ -219,16 +226,16 @@

return(chromosome,data,splits,coverage_data, "{}_tiddit/clips/{}.fa".format(prefix,chromosome) )

def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int threads, int min_contig):
def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int threads, int min_contig,skip_index):

samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref,index_filename="{}_tiddit/{}.csi".format(prefix,sample_id))
cdef AlignmentFile samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref)
bam_header=samfile.header
samfile.close()
cdef int bin_size=50
cdef str file_type="wig"
cdef str file_type=u"wig"
cdef str outfile=prefix+".tiddit_coverage.wig"

t_tot=0
cdef long t_tot=0

cdef dict data={}
cdef dict splits={}
Expand All @@ -248,7 +255,7 @@ def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_i
splits[chrA["SN"]][chrB["SN"]]={}

t=time.time()
res=Parallel(n_jobs=threads)( delayed(worker)(chromosome,bam_file_name,ref,prefix,min_q,max_ins,sample_id,bin_size) for chromosome in chromosomes )
res=Parallel(n_jobs=threads,timeout=99999)( delayed(worker)(chromosome,bam_file_name,ref,prefix,min_q,max_ins,sample_id,bin_size,skip_index) for chromosome in chromosomes )

chromosomes=set(chromosomes)
for i in range(0,len(res)):
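
The tiddit_signal.pyx changes add static Cython types for pysam objects via the libcalignmentfile cimport, thread a skip_index flag through worker() and main(), and give the joblib pool a generous timeout. Typing samfile and read lets Cython dispatch attribute access at C level instead of through generic Python objects. A minimal Cython sketch of the typing pattern, not taken from the repository:

# cython: language_level=3
import pysam
from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment

def count_mapped(str bam_path, str chromosome):
    cdef AlignmentFile samfile = pysam.AlignmentFile(bam_path, "rb")
    cdef AlignedSegment read
    cdef long n = 0
    for read in samfile.fetch(chromosome):  # requires a .bai/.csi index
        if not read.is_unmapped:
            n += 1
    samfile.close()
    return n
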
31 changes: 22 additions & 9 deletions tiddit/tiddit_stats.py
@@ -1,5 +1,7 @@
import pysam
import numpy
import time

def statistics(bam_file_name,ref,min_mapq,max_ins_len,n_reads):
library={}
samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref)
@@ -10,8 +12,16 @@ def statistics(bam_file_name,ref,min_mapq,max_ins_len,n_reads):
is_outtie=0

n_sampled=0
t=time.time()

for read in samfile.fetch():


read_length.append( read.query_length )
n_sampled+=1

if n_sampled > n_reads:
    break

if read.mate_is_unmapped:
    continue

@@ -27,25 +37,27 @@ def statistics(bam_file_name,ref,min_mapq,max_ins_len,n_reads):
if read.is_supplementary or read.is_secondary or read.is_duplicate or read.mapq < min_mapq:
    continue

n_sampled+=1

insert_size.append( read.template_length )
read_length.append( read.query_length )

if read.is_reverse and not read.mate_is_reverse:
    is_outtie+=1
else:
    is_innie+=1

if n_sampled > n_reads:
    break

samfile.close()

library["avg_read_length"]=numpy.average(read_length)
library["avg_insert_size"]=numpy.average(insert_size)
library["std_insert_size"]=numpy.std(insert_size)
library["percentile_insert_size"]=numpy.percentile(insert_size, 99.9)
if len(insert_size):
    library["avg_insert_size"]=numpy.average(insert_size)
    library["std_insert_size"]=numpy.std(insert_size)
    library["percentile_insert_size"]=numpy.percentile(insert_size, 99.9)
else:
    library["avg_insert_size"]=0
    library["std_insert_size"]=0
    library["percentile_insert_size"]=0



print("LIBRARY STATISTICS")
if is_innie > is_outtie:
@@ -60,6 +72,7 @@ def statistics(bam_file_name,ref,min_mapq,max_ins_len,n_reads):
print("\tAverage insert size = {}".format(library["avg_insert_size"]) )
print("\tStdev insert size = {}".format(library["std_insert_size"] ) )
print("\t99.95 percentile insert size = {}".format( library["percentile_insert_size"]) )
print("Calculated statistics in: " + str( t-time.time() ))
print("")

return(library)
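
The tiddit_stats.py changes sample read lengths before the pairing and quality filters, so sampling now stops after n_reads reads even when few pass the filters, and the insert-size statistics are guarded against an empty sample. The guard matters because numpy.average and numpy.std return nan (with a warning) on an empty sequence and numpy.percentile raises outright; a condensed sketch of the fallback:

import numpy

insert_size = []  # hypothetical: no properly paired reads were sampled
if len(insert_size):
    avg = numpy.average(insert_size)
    std = numpy.std(insert_size)
    p999 = numpy.percentile(insert_size, 99.9)
else:
    avg = std = p999 = 0  # zeros for unpaired/single-end data
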
25 changes: 14 additions & 11 deletions tiddit/tiddit_variant.pyx
@@ -3,10 +3,8 @@ import math
import numpy
from joblib import Parallel, delayed

#from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
from pysam import AlignmentFile, AlignedSegment


import pysam
from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment

def percentile(a, q):
    size = len(a)
@@ -53,15 +51,14 @@ def scoring(scoring_dict,percentiles):

return(max(score))

def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, contig_number):
def get_region(AlignmentFile samfile,str chr,int start,int end,int bp,int min_q,int max_ins, contig_number):

cdef int low_q=0
cdef int n_reads=0
cdef long bases=0
cdef int n_discs=0
cdef int n_splits=0


cdef int crossing_r=0
cdef int crossing_f=0

@@ -83,6 +80,8 @@
cdef long r_start
cdef long r_end

cdef AlignedSegment read

for read in samfile.fetch(chr, q_start, q_end):
    if read.is_unmapped:
        continue
@@ -233,13 +232,14 @@ def sv_filter(sample_data,args,chrA,chrB,posA,posB,max_ins_len,n_discordants,n_s
return(filt)

def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len,contig_seqs):

samfile = AlignmentFile(bam_file_name, "r",reference_filename=args.ref,index_filename="{}_tiddit/{}.csi".format(args.o,samples[0]))
cdef AlignmentFile samfile = AlignmentFile(bam_file_name, "r",reference_filename=args.ref,index_filename="{}_tiddit/{}.csi".format(args.o,samples[0]))
variants=[]

var_n=0
for chrB in sv_clusters[chrA]:

for cluster in sv_clusters[chrA][chrB]:

n_discordants=sv_clusters[chrA][chrB][cluster]["N_discordants"]
n_splits=sv_clusters[chrA][chrB][cluster]["N_splits"]
n_contigs=sv_clusters[chrA][chrB][cluster]["N_contigs"]
@@ -261,21 +261,24 @@
s=int(math.floor(sv_clusters[chrA][chrB][cluster]["startA"]/50.0))
e=int(math.floor(sv_clusters[chrA][chrB][cluster]["endA"]/50.0))+1
avg_a=numpy.average(coverage_data[chrA][s:e])
#print(f"{chrA}-{posA}-{chrB}")

if avg_a > args.max_coverage*library[ "avg_coverage_{}".format(chrA) ]:
    continue
elif (args.max_coverage*n_discordants/avg_a < args.p_ratio/2 and args.max_coverage*n_splits/avg_a < args.r_ratio/2) and not n_contigs:
    continue

avg_b=numpy.average(coverage_data[chrA][s:e])
s=int(math.floor(sv_clusters[chrA][chrB][cluster]["startB"]/50.0))
e=int(math.floor(sv_clusters[chrA][chrB][cluster]["endB"]/50.0))+1
avg_b=numpy.average(coverage_data[chrB][s:e])

if avg_b == 0:
    continue
elif avg_b > args.max_coverage*library[ "avg_coverage_{}".format(chrB) ]:
    continue
elif (args.max_coverage*n_discordants/avg_b < args.p_ratio/2 and args.max_coverage*n_splits/avg_b < args.r_ratio/2) and not n_contigs:
    continue


var_n+=1
sample_data={}
for sample in samples:
@@ -544,7 +547,7 @@ def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,sampl
for chrB in sv_clusters[chrA]:
variants[chrB]=[]

variants_list=Parallel(n_jobs=args.threads)( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs) for chrA in sv_clusters)
variants_list=Parallel(n_jobs=args.threads,prefer="threads",timeout=99999)( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs) for chrA in sv_clusters)

ratios={"fragments_A":[],"fragments_B":[],"reads_A":[],"reads_B":[]}
for v in variants_list:
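
The tiddit_variant.pyx changes restore the cimport-based typing of AlignmentFile and AlignedSegment, recompute the B-side coverage window from startB/endB, and run define_variant under joblib's threading backend with a long timeout. prefer="threads" avoids pickling the large shared cluster and coverage dictionaries into separate worker processes; the threads can still overlap to the extent pysam's htslib calls release the GIL. A sketch of the pattern with hypothetical data:

from joblib import Parallel, delayed

def define(chrom, shared):
    # stand-in for define_variant: reads shared state without serialising it
    return chrom, len(shared[chrom])

shared = {"chr1": [1, 2, 3], "chr2": [4, 5]}  # illustrative per-contig clusters
results = Parallel(n_jobs=2, prefer="threads", timeout=99999)(
    delayed(define)(c, shared) for c in shared
)
print(results)  # [('chr1', 3), ('chr2', 2)]
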
