-
Notifications
You must be signed in to change notification settings - Fork 8
/
jitterbug.py
executable file
·140 lines (97 loc) · 7.25 KB
/
jitterbug.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python
# for CRG cluster users:
##!/software/so/el6.3/PythonPackages-2.7.6/bin/python
# old CRG cluster paths:
######!/software/so/el6.3/PythonPackages-2.7.3-VirtualEnv/bin/python
#####!/software/so/el6.3/PythonPackagesVirtualEnv/bin/python
# path for loki:
##!/soft/bin/python
# elizabeth henaff
# Feb 2012
# elizabeth.m.henaff@gmail.com
import getopt,sys,os
import argparse,stat
from Run_TE_ID_reseq import *
from Run_TE_ID_reseq_streaming import *
def main(argv):
parser = argparse.ArgumentParser()
#required arguments
parser.add_argument("mapped_reads", help="reads mapped to reference genome in bam format, sorted by position")
parser.add_argument("TE_annot", help="annotation of transposable elements in reference genome, in gff3 format")
#optional arguments - flags
parser.add_argument("-v", "--verbose", help="print more output to the terminal", action="store_true")
#flags for debug
# parser.add_argument("--keep_temp", help="do not delete temp files: bam of discordant reads", action="store_true")
parser.add_argument("--pre_filter", help="pre-filter reads with samtools, and save intermediate filtered read subset", action="store_true")
parser.add_argument("--mem", help="bam file has been mapped with bwa-mem and you want to use split (=chimeric=supplementary) reads. If unset, bam files mapped with either bwa or bwa-mem can be used, but split reads will be ignored", action="store_true")
#optional arguments
parser.add_argument("-l", "--lib_name", help="sample or library name, to be included in final gff output")
parser.add_argument("-d","--sdev_mult", type=int, help="use SDEV_MULT*fragment_sdev + fragment_length when calculating insertion intervals. Best you don't touch this." , default=2)
parser.add_argument("-o", "--output_prefix", help="prefix of output files. Can be path/to/directory/file_prefix", default="jitterbug")
parser.add_argument("-n", "--numCPUs", type=int, help="number of CPUs to use", default=1)
parser.add_argument("-b", "--bin_size", type=int, help="If parallelized, size of bins to use, in bp. If numCPUs > 1 and bin_size == 0, will parallelize by entire chromosomes", default=50000000)
parser.add_argument("-q", "--minMAPQ", type=int, help="minimum read mapping quality to be considered", default=15)
parser.add_argument("-t", "--TE_name_tag", help="name of tag in TE annotation gff file to use to record inserted TEs", default="Name")
parser.add_argument("-s", "--conf_lib_stats", help="tabulated config file that sets the values to use for fragment length and sdev, read length and sdev. \
4 tab-deliminated lines: key\tvalue, keys are:fragment_length, fragment_length_SD, read_length, read_length_SD")
parser.add_argument("-c", "--min_cluster_size", type=int, help="min number of both fwd and rev reads to predict an insertion", default=2)
parser.add_argument("--step_one_only", dest='step_one_only', action='store_true', help="if specified it stops execution after the first BAM filtering", default=False)
parser.add_argument("--step_two_only", dest='step_two_only', action='store_true', help="if specified it starts exection after the first BAM filtering", default=False)
#optional arguments for debug
parser.add_argument("--disc_reads_bam", help="for debug. Use as input bam file of discordant reads only (generated by running with --pre_filter), and skip the step of perusing the input bam for discordant reads")
args = parser.parse_args()
fail_string = "use jitterbug.py -h for complete option list"
#check args
#check ouput folder write permission
output_folder = os.path.dirname(os.path.abspath(args.output_prefix))
print output_folder
print args.output_prefix
if output_folder == "":
output_folder = os.getcwd()
st = os.stat(output_folder)
if bool(st.st_mode & stat.S_IXOTH) and bool(st.st_mode & stat.S_IROTH) and bool(st.st_mode & stat.S_IWOTH):
pass #its ok
else:
print "Error in output folder permissions"
parser.error( "please set to chmod -R 777 for %s"%output_folder)
if not os.path.exists(args.mapped_reads):
parser.error("error in required argument mapped_reads: file %s cannot be found. " % (args.mapped_reads))
bamFileName, bamFileExtension = os.path.splitext(args.mapped_reads)
if bamFileExtension != ".bam":
parser.error("the file specified for argument mapped_reads does not end in .bam, are you sure it's a BAM file?")
#print bamFileName, bamFileExtension
if not os.path.exists(args.mapped_reads + ".bai"):
parser.error("it does not seem that the bam file specified for argument mapped_reads is indexed. Please index using samtools index.")
if not os.path.exists(args.mapped_reads + ".bai"):
parser.error("error in required argument mapped_reads: file %s cannot be found. " % (args.mapped_reads))
if not os.path.exists(args.TE_annot):
parser.error("error in required argument TE_annot: file %s cannot be found. " % (args.TE_annot))
if os.path.dirname(args.output_prefix) and not os.path.exists(os.path.dirname(args.output_prefix)):
parser.error("error in optional argument --output_prefix: directory %s does not exist" % (args.output_prefix))
if args.disc_reads_bam and not os.path.exists(args.disc_reads_bam):
parser.error("error in optional argument --disc_reads_bam: file %s does not exist" % (args.disc_reads_bam))
if args.conf_lib_stats and not os.path.exists(args.conf_lib_stats):
parser.error("error in optional argument --conf_lib_stats: file %s does not exist" % (args.conf_lib_stats))
# if args.pre_filter and not (args.output_prefix and args.conf_lib_stats):
# parser.error("error: if --pre_filter is set, --output_prefix and --conf_lib_stats must be too")
##set appropriate booleans. This is slightly redundant and could be collapsed, but for legacy reasons it's this way
#TODO: streamline this
parallel = args.numCPUs >= 1
already_calc_discordant_reads = args.disc_reads_bam != None
##legacy vars from previous versions, that are not worth putting in public opts but I don't want to get rid of quite yet
#option of re-mapping to a set of TE (or other) sequences. This could be useful in the future for looking at viral insertions, for example
te_seqs = None
#to generate a test bam file of input bam for testing: extracts read pairs where either read1 or read2 map to chr1 or chr2
generate_test_bam = False
mem = args.mem
if args.pre_filter:
run_jitterbug_streaming(args.mapped_reads, args.verbose, args.TE_annot, te_seqs, \
args.lib_name, args.sdev_mult, args.output_prefix, args.TE_name_tag, parallel, \
args.numCPUs, args.bin_size, args.minMAPQ, generate_test_bam, args.pre_filter, args.conf_lib_stats, mem, args.min_cluster_size,args.step_one_only,args.step_two_only)
else:
run_jitterbug(args.mapped_reads, already_calc_discordant_reads, \
args.disc_reads_bam, args.verbose, args.TE_annot, te_seqs, \
args.lib_name, args.sdev_mult, args.output_prefix, args.TE_name_tag, parallel, \
args.numCPUs, args.bin_size, args.minMAPQ, generate_test_bam, args.pre_filter, args.conf_lib_stats, mem, args.min_cluster_size)
if __name__ == "__main__":
main(sys.argv[1:])