diff --git a/Snakefile b/Snakefile index d35dfd8..d12e3b5 100644 --- a/Snakefile +++ b/Snakefile @@ -11,6 +11,7 @@ include: config['args']['mcc_path']+"/snakefiles/retroseq.snakefile" include: config['args']['mcc_path']+"/snakefiles/tepid.snakefile" include: config['args']['mcc_path']+"/snakefiles/teflon.snakefile" include: config['args']['mcc_path']+"/snakefiles/jitterbug.snakefile" +include: config['args']['mcc_path']+"/snakefiles/tebreak.snakefile" rule setup_reads: input: @@ -223,7 +224,9 @@ rule sam_to_bam: output: bam = config['mcc']['bam'], flagstat = config['mcc']['flagstat'], - tmp_bam = temp(config['mcc']['mcc_files']+config['args']['run_id']+".tmp.bam") + metrics = config['mcc']['mcc_files']+config['args']['run_id']+".metrics", + tmp_bam = temp(config['mcc']['mcc_files']+config['args']['run_id']+".tmp.bam"), + tmp2_bam = temp(config['mcc']['mcc_files']+config['args']['run_id']+".tmp2.bam") script: config['args']['mcc_path']+"/scripts/preprocessing/sam_to_bam.py" diff --git a/auxiliary/simulation/mcc_sim.yml b/auxiliary/simulation/mcc_sim.yml index 4694462..dd5abed 100644 --- a/auxiliary/simulation/mcc_sim.yml +++ b/auxiliary/simulation/mcc_sim.yml @@ -10,4 +10,5 @@ dependencies: - wgsim - seqtk - samtools - - bedtools \ No newline at end of file + - bedtools + - art diff --git a/auxiliary/simulation/mcclintock_simulation.py b/auxiliary/simulation/mcclintock_simulation.py index e380995..78579e7 100644 --- a/auxiliary/simulation/mcclintock_simulation.py +++ b/auxiliary/simulation/mcclintock_simulation.py @@ -43,7 +43,7 @@ def main(): fastq2 = modified_reference.replace(".fasta", "_2.fastq") if not os.path.exists(fastq1) or not os.path.exists(fastq2): num_pairs = calculate_num_pairs(modified_reference, args.coverage, args.length, single=args.single) - fastq1, fastq2 = create_synthetic_reads(modified_reference, num_pairs, args.length, args.insert, args.error, x, args.out, run_id=args.runid, seed=args.seed) + fastq1, fastq2 = create_synthetic_reads(args.sim, modified_reference, args.coverage, num_pairs, args.length, args.insert, args.error, x, args.out, run_id=args.runid, seed=args.seed) if not os.path.exists(summary_report): run_mcclintock(fastq1, fastq2, args.reference, args.consensus, args.locations, args.taxonomy, x, args.proc, args.out, args.config, args.keep_intermediate, run_id=args.runid, single=args.single, reverse=reverse) @@ -71,6 +71,7 @@ def parse_args(): parser.add_argument("--end", type=int, help="The number of replicates to run. [default = 300]", required=False) parser.add_argument("--seed", type=str, help="a seed to the random number generator so runs can be replicated", required=False) parser.add_argument("--runid", type=str, help="a string to prepend to output files so that multiple runs can be run at the same time without file name clashes", required=False) + parser.add_argument("--sim", type=str, help="Short read simulator to use (options=wgsim,art) [default = wgsim]", required=False) parser.add_argument("-s","--single", action="store_true", help="runs the simulation in single ended mode", required=False) args = parser.parse_args() @@ -142,6 +143,13 @@ def parse_args(): elif args.strand != "plus" and args.strand != "minus": sys.exit("ERROR: --strand must be plus or minus \n") + + if args.sim is None: + args.sim = "wgsim" + elif args.sim not in ["wgsim", "art"]: + sys.exit("ERROR: --sim must be wgsim or art \n") + + return args def run_command(cmd_list, log=None): @@ -374,7 +382,7 @@ def calculate_num_pairs(fasta, coverage, length, single=False): return num_pairs -def create_synthetic_reads(reference, num_pairs, length, insert, error, rep, out, run_id="", seed=None): +def create_synthetic_reads(simulator, reference, coverage, num_pairs, length, insert, error, rep, out, run_id="", seed=None): if seed is not None: random.seed(seed+"create_synthetic_reads"+str(rep)) else: @@ -382,12 +390,23 @@ def create_synthetic_reads(reference, num_pairs, length, insert, error, rep, out seed_for_wgsim = random.randint(0,1000) + tmp_fastq1 = reference.replace(".fasta", "") + "1.fq" + tmp_fastq2 = reference.replace(".fasta", "") + "2.fq" + fastq1 = reference.replace(".fasta", "_1.fastq") fastq2 = reference.replace(".fasta", "_2.fastq") report = reference.replace(".fasta", "_wgsim_report.txt") - command = ["wgsim", "-1", str(length), "-2", str(length), "-d", str(insert), "-N", str(num_pairs), "-S", str(seed_for_wgsim), "-e", str(error), "-h", reference, fastq1, fastq2] + if simulator == "wgsim": + command = ["wgsim", "-1", str(length), "-2", str(length), "-d", str(insert), "-N", str(num_pairs), "-S", str(seed_for_wgsim), "-e", str(error), "-h", reference, tmp_fastq1, tmp_fastq2] + + else: + command = ["art_illumina", "-ss", "HS25", "--rndSeed", str(seed_for_wgsim), "-sam", "-i", reference, "-p", "-l", str(length), "-f", str(coverage), "-m", str(insert), "-s", "10", "-o", reference.replace(".fasta", "")] + + run_command_stdout(command, report) + run_command(["mv", tmp_fastq1, fastq1]) + run_command(["mv", tmp_fastq2, fastq2]) return fastq1, fastq2 diff --git a/auxiliary/simulation/mcclintock_simulation_analysis.py b/auxiliary/simulation/mcclintock_simulation_analysis.py index 42b74e0..fa295fd 100644 --- a/auxiliary/simulation/mcclintock_simulation_analysis.py +++ b/auxiliary/simulation/mcclintock_simulation_analysis.py @@ -317,8 +317,15 @@ def write_combined_metrics(forward_metrics, reverse_metrics, outfile): with open(outfile, "w") as out: out.write(",".join(forward_metrics[0]) + "\n") for method in metrics.keys(): - precision = metrics[method][0] / (metrics[method][0] + metrics[method][1]) - recall = metrics[method][0] / (metrics[method][0] + metrics[method][2]) + if metrics[method][0] + metrics[method][1] > 0: + precision = metrics[method][0] / (metrics[method][0] + metrics[method][1]) + else: + precision = 0 + + if metrics[method][0] + metrics[method][2] > 0: + recall = metrics[method][0] / (metrics[method][0] + metrics[method][2]) + else: + recall = 0 outline = [method, str(metrics[method][0]), str(metrics[method][1]), str(metrics[method][2]), str(precision), str(recall)] out.write(",".join(outline) + "\n") diff --git a/auxiliary/simulation/yeast/yeast_config.json b/auxiliary/simulation/yeast/yeast_config.json index 60908e8..e2a47e8 100644 --- a/auxiliary/simulation/yeast/yeast_config.json +++ b/auxiliary/simulation/yeast/yeast_config.json @@ -20,6 +20,6 @@ "mcclintock": { "path" : "/home/pjb68507/git//mcclintock/", - "methods" : "ngs_te_mapper,ngs_te_mapper2,relocate,relocate2,temp,temp2,retroseq,popoolationte,popoolationte2,te-locate,teflon" + "methods" : "ngs_te_mapper,ngs_te_mapper2,relocate,relocate2,temp,temp2,retroseq,popoolationte,popoolationte2,te-locate,teflon,tebreak" } } diff --git a/config/coverage/coverage.py b/config/coverage/coverage.py index 36a8045..15b0687 100644 --- a/config/coverage/coverage.py +++ b/config/coverage/coverage.py @@ -1,10 +1,9 @@ - -# omits the edges of the TE sequence coverage when calculating average depth across the element -OMIT_EDGES = True - -# If OMIT_EDGES = True, use the read length as the length of the edges to omit -OMIT_EDGES_READ_LENGTH = True - -# IF OMIT_EDGES = True and OMIT_EDGES_READ_LENGTH = False -# use this value as the length of the edges to omit -OMIT_EDGES_LENGTH = 300 +PARAMS = { + # omits the edges of the TE sequence coverage when calculating average depth across the element + "omit_edges": True, + # If OMIT_EDGES = True, use the read length as the length of the edges to omit + "omit_edges_read_length" : True, + # IF OMIT_EDGES = True and OMIT_EDGES_READ_LENGTH = False + # use this value as the length of the edges to omit + "omit_edges_length" : 300 +} diff --git a/config/ngs_te_mapper/ngs_te_mapper_post.py b/config/ngs_te_mapper/ngs_te_mapper_post.py index 94feca1..fb5bec2 100644 --- a/config/ngs_te_mapper/ngs_te_mapper_post.py +++ b/config/ngs_te_mapper/ngs_te_mapper_post.py @@ -1,2 +1,3 @@ - -MIN_READ_SUPPORT = 0 \ No newline at end of file +PARAMS = { + "min_read_support" : 0 +} diff --git a/config/ngs_te_mapper/ngs_te_mapper_run.py b/config/ngs_te_mapper/ngs_te_mapper_run.py index f755379..4638df5 100644 --- a/config/ngs_te_mapper/ngs_te_mapper_run.py +++ b/config/ngs_te_mapper/ngs_te_mapper_run.py @@ -1,3 +1,4 @@ - -# max length of junction read overlap to consider a target site duplication -MAX_TSD = 20 \ No newline at end of file +PARAMS = { + # max length of junction read overlap to consider a target site duplication + "tsd=" : 20 +} diff --git a/config/ngs_te_mapper2/ngs_te_mapper2_run.py b/config/ngs_te_mapper2/ngs_te_mapper2_run.py index 274f823..9103a80 100644 --- a/config/ngs_te_mapper2/ngs_te_mapper2_run.py +++ b/config/ngs_te_mapper2/ngs_te_mapper2_run.py @@ -1,17 +1,15 @@ - -# merge window for identifying TE clusters -WINDOW = 10 - -# minimum mapping quality of alignment -MIN_MAPQ = 20 - -# minimum allele frequency -MIN_ALLELE_FREQUENCY = 0.1 - -# max length of junction read overlap to consider a target site duplication -MAX_TSD = 25 - -# maximum gap size -MAX_GAP = 5 +PARAMS = { + "--window" : 10, + "--min_mapq" : 20, + "--min_af" : 0.1, + "--tsd_max" : 25, + "--gap_max" : 5 +} + +# --window WINDOW merge window for identifying TE clusters (default = 10) +# --min_mapq MIN_MAPQ minimum mapping quality of alignment (default = 20) +# --min_af MIN_AF minimum allele frequency (default = 0.1) +# --tsd_max TSD_MAX maximum TSD size (default = 25) +# --gap_max GAP_MAX maximum gap size (default = 5) diff --git a/config/popoolationte/popoolationte_post.py b/config/popoolationte/popoolationte_post.py index f421a86..e24f433 100644 --- a/config/popoolationte/popoolationte_post.py +++ b/config/popoolationte/popoolationte_post.py @@ -1,6 +1,6 @@ - -# requires that final results have support on both ends of prediction -REQUIRE_BOTH_END_SUPPORT=True - -# threshold for the minimum acceptable fraction of the reads supporting the prediction -PERCENT_READ_SUPPORT_THRESHOLD=0.1 \ No newline at end of file +PARAMS = { + # requires that final results have support on both ends of prediction + "require_both_end_support" : True, + # threshold for the minimum acceptable fraction of the reads supporting the prediction + "percent_read_support_threshold" : 0.1 +} \ No newline at end of file diff --git a/config/popoolationte/popoolationte_run.py b/config/popoolationte/popoolationte_run.py index 18ff8f4..306b44f 100644 --- a/config/popoolationte/popoolationte_run.py +++ b/config/popoolationte/popoolationte_run.py @@ -1,3 +1,23 @@ + +PARAMS = { + "identify-te-insertsites.pl" : { + "--min-count" : 3, + "--min-map-qual" : 15 + }, + "crosslink-te-sites.pl" : { + "--single-site-shift": 100 + }, + "update-teinserts-with-knowntes.pl" : { + "--single-site-shift": 100 + }, + "estimate-polymorphism.pl" : { + "--min-map-qual": 15 + }, + "filter-teinserts.pl" : { + "--min-count": 5 + } +} + ''' NAME perl identify-te-insertsites.pl - Identifies TE insertion sites (forward @@ -11,27 +31,7 @@ --min-map-qual the minimum mapping quality; this will only apply to reads mapping to a reference contig. -''' -IDENTIFY_TE_INSERTSITES = { - "min-count" : 3, - "min-map-qual" : 15 -} - - - - -CROSSLINK_TE_SITES = { - "single-site-shift": 100 -} - - -UPDATE_TEINSERTS_WITH_KNOWNTES = { - "single-site-shift": 100 -} - - -''' NAME perl estimate-polymorphism.pl - Estimte the insertion frequencies for a given set of TE insertions @@ -43,13 +43,4 @@ --min-map-qual the minimum mapping quality -''' - -ESTIMATE_POLYMORPHISM = { - "min-map-qual": 15 -} - - -FILTER = { - "min-count": 5 -} \ No newline at end of file +''' \ No newline at end of file diff --git a/config/popoolationte2/popoolationte2_post.py b/config/popoolationte2/popoolationte2_post.py index e077e47..477755e 100644 --- a/config/popoolationte2/popoolationte2_post.py +++ b/config/popoolationte2/popoolationte2_post.py @@ -1,6 +1,7 @@ +PARAMS = { + # require that the TE prediction have support from both junctions + "require_both_end_support" : True, -# require that the TE prediction have support from both junctions -REQUIRE_BOTH_END_SUPPORT = True - -# threshold for the minimum acceptable (average physical coverage supporting the insertion of the given TE) / (average physical coverage) -FREQUENCY_THRESHOLD=0.1 \ No newline at end of file + # threshold for the minimum acceptable (average physical coverage supporting the insertion of the given TE) / (average physical coverage) + "frequency_threshold" : 0.1 +} \ No newline at end of file diff --git a/config/popoolationte2/popoolationte2_run.py b/config/popoolationte2/popoolationte2_run.py index af718b8..8097a69 100644 --- a/config/popoolationte2/popoolationte2_run.py +++ b/config/popoolationte2/popoolationte2_run.py @@ -1,4 +1,39 @@ +PARAMS = { + "ppileup" : { + "--map-qual": 15, + "--sr-mindist" : 10000, + "--id-up-quant": 0.01 + }, + + "subsampleppileup" : { + "run" : False, + "--target-coverage": 100, + "--with-replace": False + }, + + "identifySignatures" : { + "--min-count": 2.0, + "--signature-window": "median", + "--min-valley": "median", + "--chunk-distance": 5 + }, + + "updateStrand" : { + "--map-qual": 15, + "--max-disagreement": 0.1, + "--sr-mindist": 10000, + "--id-up-quant": 0.01 + }, + + "pairupSignatures" : { + "--min-distance": -100, + "--max-distance": 500, + "--max-freq-diff": 1.0 + } + +} + #ppileup # create a physical pileup file from one or multiple bam files @@ -11,11 +46,6 @@ # --id-up-quant paired end fragments with an insert size in the upper # quantile will be ignored [fraction]; default=0.01 -ppileup = { - "map-qual": 15, - "sr-mindist" : 10000, - "id-up-quant": 0.01 -} # subsampleppileup @@ -28,11 +58,7 @@ # == Parameters for fine tuning == # --with-replace use sampling with replacement instead of without replacement -subsampleppileup = { - "run" : False, - "target-coverage": 100, - "with-replace": False -} + # identifySignatures @@ -51,12 +77,7 @@ # --chunk-distance minimum distance between chromosomal chunks in multiples # of --min-valley [int]; default=5 -identifySignatures = { - "min-count": 2.0, - "signature-window": "median", - "min-valley": "median", - "chunk-distance": 5 -} + # updateStrand @@ -73,12 +94,6 @@ # --id-up-quant paired end fragments with an insert size in the upper # quantile will be ignored [fraction]; default=0.01 -updateStrand = { - "map-qual": 15, - "max-disagreement": 0.1, - "sr-mindist": 10000, - "id-up-quant": 0.01 -} # pairupSignatures @@ -92,8 +107,3 @@ # --max-freq-diff the maximum frequency difference between signatures; # default=1.0 -pairupSignatures = { - "min-distance": -100, - "max-distance": 500, - "max-freq-diff": 1.0 -} diff --git a/config/relocate/relocate_post.py b/config/relocate/relocate_post.py index 41b2f93..be2b462 100644 --- a/config/relocate/relocate_post.py +++ b/config/relocate/relocate_post.py @@ -1,5 +1,6 @@ -REF_LEFT_THRESHOLD = 0 -REF_RIGHT_THRESHOLD = 0 - -NONREF_LEFT_THRESHOLD = 0 -NONREF_RIGHT_THRESHOLD = 0 \ No newline at end of file +PARAMS = { + "ref_left_threshold" : 0, + "ref_right_threshold" : 0, + "nonref_left_threshold" : 0, + "nonref_right_threshold" : 0 +} \ No newline at end of file diff --git a/config/relocate/relocate_run.py b/config/relocate/relocate_run.py index 7b2e9a1..2173d29 100644 --- a/config/relocate/relocate_run.py +++ b/config/relocate/relocate_run.py @@ -53,10 +53,10 @@ ''' -RELOCATE = { - 'l' : 10, - 'm' : 0.0, - 'bm' : 10, - 'bt' : 7, - 'f' : 100 +PARAMS = { + '-l' : 10, + '-m' : 0.0, + '-bm' : 10, + '-bt' : 7, + '-f' : 100 } \ No newline at end of file diff --git a/config/relocate2/relocate2_post.py b/config/relocate2/relocate2_post.py index 234a45f..bddbe93 100644 --- a/config/relocate2/relocate2_post.py +++ b/config/relocate2/relocate2_post.py @@ -1,9 +1,11 @@ -REF_LEFT_SUPPORT_THRESHOLD = 0 -REF_RIGHT_SUPPORT_THRESHOLD = 0 -REF_LEFT_JUNCTION_THRESHOLD = 0 -REF_RIGHT_JUNCTION_THRESHOLD = 0 +PARAMS = { + "ref_left_support_threshold" : 0, + "ref_right_support_threshold" : 0, + "ref_left_junction_threshold" : 0, + "ref_right_junction_threshold" : 0, -NONREF_LEFT_SUPPORT_THRESHOLD = 0 -NONREF_RIGHT_SUPPORT_THRESHOLD = 0 -NONREF_LEFT_JUNCTION_THRESHOLD = 0 -NONREF_RIGHT_JUNCTION_THRESHOLD = 0 \ No newline at end of file + "nonref_left_support_threshold" : 0, + "nonref_right_support_threshold" : 0, + "nonref_left_junction_threshold" : 0, + "nonref_right_junction_threshold" : 0 +} \ No newline at end of file diff --git a/config/relocate2/relocate2_run.py b/config/relocate2/relocate2_run.py index 18ba0a4..c90d960 100644 --- a/config/relocate2/relocate2_run.py +++ b/config/relocate2/relocate2_run.py @@ -1,3 +1,10 @@ +PARAMS = { + '--aligner' : "blat", + '--len_cut_match' : 10, + '--len_cut_trim' : 10, + '--mismatch' : 2, + '--mismatch_junction' : 2 +} ''' python scripts/relocaTE2.py --help @@ -28,12 +35,3 @@ ''' - - -RELOCATE2 = { - 'aligner' : "blat", - 'len_cut_match' : 10, - 'len_cut_trim' : 10, - 'mismatch' : 2, - 'mismatch_junction' : 2 -} \ No newline at end of file diff --git a/config/retroseq/retroseq_post.py b/config/retroseq/retroseq_post.py index 82f1116..0b0d4d3 100644 --- a/config/retroseq/retroseq_post.py +++ b/config/retroseq/retroseq_post.py @@ -1,11 +1,12 @@ # https://github.com/tk2/RetroSeq/wiki/RetroSeq-Tutorial -''' -Number of correctly mapped read pairs spanning breakpoint -''' -READ_SUPPORT_THRESHOLD = 0 +PARAMS = { + "read_support_threshold" : 0, + "breakpoint_confidence_threshold" : 6 +} ''' -The FL tag ranges from 1-8 and gives information on the breakpoint with 8 being the most confident calls and -lower values indicating calls that don’t meet the breakpoint criteria for reasons such as lack of 5’ or 3’ reads -''' -BREAKPOINT_CONFIDENCE_THRESHOLD = 6 \ No newline at end of file +read_support_threshold Number of correctly mapped read pairs spanning breakpoint + +breakpoint_confidence_threshold The FL tag ranges from 1-8 and gives information on the breakpoint with 8 being the most confident calls and + lower values indicating calls that don’t meet the breakpoint criteria for reasons such as lack of 5’ or 3’ reads +''' \ No newline at end of file diff --git a/config/retroseq/retroseq_run.py b/config/retroseq/retroseq_run.py index a1b622c..556cad9 100644 --- a/config/retroseq/retroseq_run.py +++ b/config/retroseq/retroseq_run.py @@ -1,11 +1,11 @@ +PARAMS = { + "-depth" : 200, + "-reads" : 10, + "-q": 20 +} + ''' [-depth Max average depth of a region to be considered for calling. Default is 200.] [-reads It is the minimum number of reads required to make a call. Default is 5. Parameter is optional.] [-q Minimum mapping quality for a read mate that anchors the insertion call. Default is 30. Parameter is optional.] ''' - -PARAMETERS = { - "depth" : 200, - "reads" : 10, - "q": 20 -} \ No newline at end of file diff --git a/config/tebreak/tebreak_post.py b/config/tebreak/tebreak_post.py new file mode 100644 index 0000000..61e2afb --- /dev/null +++ b/config/tebreak/tebreak_post.py @@ -0,0 +1,29 @@ +# Minimum fraction of bases matched to reference for inserted sequence on insertion seqment of 5' supporting contig +MIN_5P_ELT_MATCH = 0.0 + +# Minimum fraction of bases matched to reference for inserted sequence on insertion seqment of 3' supporting contig +MIN_3P_ELT_MATCH = 0.0 + +# Minimum fraction of bases matched to reference genome on genomic segment of 5' supporting contig +MIN_5P_GENOME_MATCH = 0.0 + +# Minimum fraction of bases matched to reference genome on genomic segment of 3' supporting contig +MIN_3P_GENOME_MATCH = 0.0 + +# Minimum number of split reads supporting 5' end of the insertion +MIN_SPLIT_READS_5P = 0 + +# Minimum number of split reads supporting 3' end of the insertion +MIN_SPLIT_READS_3P = 0 + +# Minimum number of discordant read ends re-mappable to insertion reference sequence +MIN_REMAPPED_DISCORDANT = 0 + +# Minimum proportion of remapped discordant reads mapping to the reference insertion sequence +MIN_REMAP_DISC_FRACTION = 0.0 + +# Minimum number of split reads re-mappable to insertion reference sequence +MIN_REMAPPED_SPLITREADS = 0 + +# Minimum proportion of remapped split reads mapping to the reference insertion sequence +MIN_REMAP_SPLIT_FRACTION = 0.0 \ No newline at end of file diff --git a/config/tebreak/tebreak_run.py b/config/tebreak/tebreak_run.py new file mode 100644 index 0000000..d6fa9d6 --- /dev/null +++ b/config/tebreak/tebreak_run.py @@ -0,0 +1,42 @@ +PARAMS = { + "--minMWP": "0.01", + "--min_minclip" : "3", + "--min_maxclip" : "10", + "--min_sr_per_break" : "1", + "--min_consensus_score" : "0.9", + "--min_chr_len" : "0", + "--max_ins_reads" : "1000", + "--min_split_reads" : "4", + "--min_prox_mapq" : "10", + "--max_N_consensus" : "4", + "--max_disc_fetch" : "50", + "--min_disc_reads" : "4", + "--sr_density" : "2.0", + "--min_ins_match" : "0.90", + "--min_ref_match" : "0.98", + "--min_cons_len" : "250", + "--keep_all_tmp_bams" : False, + "--skip_final_filter" : False, + "--debug": False +} + + +# --minMWP MINMWP minimum Mann-Whitney P-value for split qualities (default = 0.01) +# --min_minclip MIN_MINCLIP min. shortest clipped bases per cluster (default = 3) +# --min_maxclip MIN_MAXCLIP min. longest clipped bases per cluster (default = 10) +# --min_sr_per_break MIN_SR_PER_BREAK minimum split reads per breakend (default = 1) +# --min_consensus_score MIN_CONSENSUS_SCORE quality of consensus alignment (default = 0.9) +# --min_chr_len MIN_CHR_LEN minimum chromosome length to consider in discordant site search (default = 0) +# --max_ins_reads MAX_INS_READS maximum number of reads to use per insertion call (default = 1000) +# --min_split_reads MIN_SPLIT_READS minimum total split reads per insertion call (default = 4) +# --min_prox_mapq MIN_PROX_MAPQ minimum map quality for proximal subread (default = 10) +# --max_N_consensus MAX_N_CONSENSUS exclude breakend seqs with > this number of N bases (default = 4) +# --max_disc_fetch MAX_DISC_FETCH maximum number of discordant reads to fetch per insertion site per BAM (default = 50; 0 = disable fetch) +# --min_disc_reads MIN_DISC_READS if using -d/--disco_target, minimum number of discordant reads to trigger a call (default = 4) +# --sr_density SR_DENSITY maximum split read density in chunk (default = 2.0) +# --min_ins_match MIN_INS_MATCH (output) minumum match to insertion library (default 0.90) +# --min_ref_match MIN_REF_MATCH (output) minimum match to reference genome (default 0.98) +# --min_cons_len MIN_CONS_LEN (output) min total consensus length (default=250) +# --keep_all_tmp_bams leave ALL temporary BAMs (warning: lots of files!) +# --skip_final_filter do not apply final filters or fix for orientation +# --debug \ No newline at end of file diff --git a/config/teflon/teflon_post.py b/config/teflon/teflon_post.py index 8636aa6..b50e4ca 100644 --- a/config/teflon/teflon_post.py +++ b/config/teflon/teflon_post.py @@ -1,15 +1,16 @@ -''' -C10: read count for "presence reads" -C11: read count for "absence reads" -C13: genotype for every TE (allele frequency for pooled data, present/absent for haploid, present/absent/heterozygous for diploid) #Note: haploid/diploid caller is under construction, use "pooled" for presence/absence read counts -''' - -PARAMETERS = { +PARAMS = { "min_presence_reads" : 3, "max_absence_reads" : None, "min_presence_fraction" : 0.1, # presence/(absence+presence), "require_tsd" : True, # non-ref predictions must have a target site duplication "require_both_breakpoints" : True # non-ref predictions must have both breakpoints predicted -} \ No newline at end of file +} + +''' +C10: read count for "presence reads" +C11: read count for "absence reads" +C13: genotype for every TE (allele frequency for pooled data, present/absent for haploid, present/absent/heterozygous for diploid) #Note: haploid/diploid caller is under construction, use "pooled" for presence/absence read counts +''' + diff --git a/config/teflon/teflon_run.py b/config/teflon/teflon_run.py index 5e1c5da..0d0659a 100644 --- a/config/teflon/teflon_run.py +++ b/config/teflon/teflon_run.py @@ -1,3 +1,15 @@ + +# When params: sd, cov, or ht are set to None, they are determined by TEFLoN, Can be set to a specific INT value +PARAMS = { + "-q": 20, + "-sd" : None, + "-cov" : None, + "-n1" : 1, + "-n2" : 1, + "-lt" : 1, + "-ht" : None +} + ''' usage: python usr/local/teflon.v0.4.py [optional] -q #NOTE: Mapped reads with map qualities lower than this number will be discarded @@ -17,14 +29,3 @@ -lt [sites genotyped as -9 if adjusted read counts lower than this threshold, default=1] -ht [sites genotyped as -9 if adjusted read counts higher than this threshold, default=mean_coverage + 2*STDEV] ''' - -# When params: sd, cov, or ht are set to None, they are determined by TEFLoN, Can be set to a specific INT value -PARAMETERS = { - "q": 20, - "sd" : None, - "cov" : None, - "n1" : 1, - "n2" : 1, - "lt" : 1, - "ht" : None -} diff --git a/config/telocate/telocate_post.py b/config/telocate/telocate_post.py index d7bd58c..7be6453 100644 --- a/config/telocate/telocate_post.py +++ b/config/telocate/telocate_post.py @@ -1,3 +1,3 @@ - - -READ_PAIR_SUPPORT_THRESHOLD = 0 \ No newline at end of file +PARAMS = { + "read_pair_support_threshold" : 0 +} \ No newline at end of file diff --git a/config/telocate/telocate_run.py b/config/telocate/telocate_run.py index d9d2886..71bb76a 100644 --- a/config/telocate/telocate_run.py +++ b/config/telocate/telocate_run.py @@ -1,12 +1,15 @@ -# Max memory to use (GB) -MAX_MEM = 4 +PARAMS = { + # Max memory to use (GB) + "max_mem" : 4, -# Multiplied by median insert size -MIN_DISTANCE = 5 + # Multiplied by median insert size + "min_distance" : 5, -# minimal supporting reads -MIN_SUPPORT_READS = 3 + # minimal supporting reads + "min_support_reads" : 3, + + # minimal supporting individuals + "min_support_individuals" : 1 +} -# minimal supporting individuals -MIN_SUPPORT_INDIVIDUALS = 1 diff --git a/config/temp/temp_post.py b/config/temp/temp_post.py index fd6719d..6d98c71 100644 --- a/config/temp/temp_post.py +++ b/config/temp/temp_post.py @@ -1,6 +1,7 @@ - # post processing constants for TEMP module - -ACCEPTABLE_INSERTION_SUPPORT_CLASSES = ["1p1"] - -FREQUENCY_THRESHOLD = 0.1 \ No newline at end of file +PARAMS = { + # valid options: 1p1, 2p, singleton + "acceptable_insertion_support_classes" : ["1p1"], + # minimum frequency of the inserted transposon + "frequency_threshold" : 0.1 +} \ No newline at end of file diff --git a/config/temp/temp_run.py b/config/temp/temp_run.py index b943535..a767215 100644 --- a/config/temp/temp_run.py +++ b/config/temp/temp_run.py @@ -1,22 +1,12 @@ # TEMP_Insertion.sh +PARAMS = { + '-x' : 30, + '-m' : 1 +} + ''' Options: - -i Input file in bam format with full path. Please sort and index the file before calling this program. - Sorting and indexing can be done by 'samtools sort' and 'samtools index' - -s Directory where all the scripts are - -o Path to output directory. Default is current directory - -r Transposon sequence database in fasta format with full path - -t Annotated TEs in BED6 format with full path. Detected insertions that overlap with annoated TEs will be filtered. - -u TE families annotations. If supplied detected insertions overlap with annotated TE of the same family will be filtered. Only use with -t. -m Number of mismatch allowed when mapping to TE concensus sequences. Default is 3 -x The minimum score difference between the best hit and the second best hit for considering a read as uniquely mapped. For BWA mem. - -f An integer specifying the length of the fragments (inserts) of the library. Default is 500 - -c An integer specifying the number of CUPs used. Default is 8 - -h Show help message -''' - -TEMP_Insertion = { - 'x' : 30, - 'm' : 1 -} +''' \ No newline at end of file diff --git a/config/temp2/temp2_post.py b/config/temp2/temp2_post.py index 2b9ffe0..d6dd030 100644 --- a/config/temp2/temp2_post.py +++ b/config/temp2/temp2_post.py @@ -1,5 +1,6 @@ -# valid options: 1p1, 2p, singleton -ACCEPTABLE_INSERTION_SUPPORT_CLASSES = ["1p1"] - -# minimum frequency of the inserted transposon. It generally means what fraction of sequenced genome present this insertion. -FREQUENCY_THRESHOLD = 0.1 \ No newline at end of file +PARAMS = { + # valid options: 1p1, 2p, singleton + "acceptable_insertion_support_classes" : ["1p1"], + # minimum frequency of the inserted transposon. It generally means what fraction of sequenced genome present this insertion. + "frequency_threshold" : 0.1 +} \ No newline at end of file diff --git a/config/temp2/temp2_run.py b/config/temp2/temp2_run.py index 7d47f75..1e567bd 100644 --- a/config/temp2/temp2_run.py +++ b/config/temp2/temp2_run.py @@ -1,33 +1,34 @@ +PARAMS ={ + "insertion" : { + "-M" : 2, + "-m" : 5, + "-U" : 0.8, + "-N" : 300, + "-T" : False, + "-L" : False, + "-S" : False + }, + + "absence" : { + "-x" : 0 + } +} ############################## # TEMP2 INSERTION PARAMETERS # ############################## # -M mismatch% Percentage of mismatch allowed when anchor to genome. Default is 2. -GENOME_MISMATCH_PCT = 2 - # -m mismatch% Percentage of mismatch allowed when mapping to TEs. Default is 5. -TE_MISMATCH_PCT = 5 - # -U ratio The ratio between the second best alignment and the best alignment to judge if a read is uniquely mapped. Default is 0.8. -RATIO = 0.8 - # -N reference_filter_window window sizea (+-n) for filtering insertions overlapping reference insertions. Default is 300. -FILTER_WINDOW = 300 - # -T Set this parameter to allow truncated de novo insertions; For default, only full-length de novo insertions are allowed. -TRUNCATED = False - # -L Set this parameter to use a looser criteria to filter reference annotated copy overlapped insertions; Default not allowed. -LOOSE_FILTER = False - # -S Set this parameter to skip insertion length checking; Default is to remove those insertions that are not full length of shorter than 500bp. -SKIP_INS_LEN_CHECK = False ############################ # TEMP2 ABSENCE PARAMETERS # ############################ -# -x The minimum score difference between the best hit and the second best hit for considering a read as uniquely mapped. For BWA MEM. -UNIQ_MAP_SCORE = 0 \ No newline at end of file +# -x The minimum score difference between the best hit and the second best hit for considering a read as uniquely mapped. For BWA MEM. \ No newline at end of file diff --git a/config/tepid/tepid_run.py b/config/tepid/tepid_run.py new file mode 100644 index 0000000..e69de29 diff --git a/config/trimgalore/trimgalore.py b/config/trimgalore/trimgalore.py index a224069..4e392a4 100644 --- a/config/trimgalore/trimgalore.py +++ b/config/trimgalore/trimgalore.py @@ -1,3 +1,12 @@ # options used for adapter trimming with the trimgalore module -SINGLE_END_FLAGS = ["--fastqc"] -PAIRED_END_FLAGS = ["--paired", "--fastqc"] +PARAMS = { + "single_end": { + "--fastqc": True, + }, + + "paired_end": { + "--fastqc": True, + "--paired": True + } +} + diff --git a/docs/getting_started/quick_start.rst b/docs/getting_started/quick_start.rst new file mode 100644 index 0000000..79392ae --- /dev/null +++ b/docs/getting_started/quick_start.rst @@ -0,0 +1,60 @@ +========== +McClintock +========== + +* `McClintock Github Repository `_ + +Many methods have been developed to detect transposable element (TE) insertions from whole genome shotgun next-generation sequencing (NGS) data, each of which has different dependencies, run interfaces, and output formats. Here, we have developed a meta-pipeline to install and run multiple methods for detecting TE insertions in NGS data, which generates output in the UCSC Browser extensible data (BED) format as well as the Variant Call Format (VCF). + +.. seealso:: + + `Nelson, Linheiro and Bergman (2017) G3 7:2763-2778 `_ + Contains a detailed description of the original McClintock pipeline (v0.1) and evaluation of the original six McClintock component methods on the yeast genome + +--------------------------------- +TE Dectection Software Components +--------------------------------- + +.. csv-table:: + :header: "Method", "Citation" + :widths: 10, 15 + + `ngs_te_mapper `_, `Linheiro and Bergman (2012) `_ + `ngs_te_mapper2 `_, Unpublished + `RelocaTE `_, `Robb et al. (2013) `_ + `RelocaTE2 `_, `Chen et al. (2017) `_ + `TEMP `_, `Zhuang et al. (2014) `_ + `TEMP2 `_, `Yu et al. (2021) `_ + `RetroSeq `_, `Keane et al. (2012) `_ + `PoPoolationTE `_, `Kofler et al. (2012) `_ + `PoPoolationTE2 `_, `Kofler et al. (2016) `_ + `TE-locate `_, `Platzer et al. (2012) `_ + `TEFLoN `_, `Adrion et al. (2017) `_ + + +----------- +Quick Start +----------- + +.. code:: bash + + # INSTALL (Requires Conda) + git clone git@github.com:bergmanlab/mcclintock.git + cd mcclintock + conda env create -f install/envs/mcclintock.yml --name mcclintock + conda activate mcclintock + python3 mcclintock.py --install + + # DOWNLOAD TEST DATA + python3 test/download_test_data.py + + # RUN ON TEST DATA + python3 mcclintock.py \ + -r test/sacCer2.fasta \ + -c test/sac_cer_TE_seqs.fasta \ + -g test/reference_TE_locations.gff \ + -t test/sac_cer_te_families.tsv \ + -1 test/SRR800842_1.fastq.gz \ + -2 test/SRR800842_2.fastq.gz \ + -p 4 \ + -o /path/to/output/directory \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 8a04190..f8d371e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,9 +25,11 @@ TE Dectection Software Components :widths: 10, 15 `ngs_te_mapper `_, `Linheiro and Bergman (2012) `_ + `ngs_te_mapper2 `_, Unpublished `RelocaTE `_, `Robb et al. (2013) `_ `RelocaTE2 `_, `Chen et al. (2017) `_ `TEMP `_, `Zhuang et al. (2014) `_ + `TEMP2 `_, `Yu et al. (2021) `_ `RetroSeq `_, `Keane et al. (2012) `_ `PoPoolationTE `_, `Kofler et al. (2012) `_ `PoPoolationTE2 `_, `Kofler et al. (2016) `_ @@ -71,26 +73,9 @@ Quick Start :caption: Getting Started :hidden: + getting_started/quick_start getting_started/install -.. toctree:: - :maxdepth: 2 - :caption: Component methods - :hidden: - - methods/trimgalore - methods/mapreads - methods/coverage - methods/ngs_te_mapper - methods/RelocaTE - methods/RelocaTE2 - methods/TEMP - methods/RetroSeq - methods/PoPoolationTE - methods/PoPoolationTE2 - methods/TE-locate - methods/TEFLoN - .. toctree:: :maxdepth: 2 diff --git a/docs/methods/PoPoolationTE.rst b/docs/methods/PoPoolationTE.rst deleted file mode 100644 index a3bad2d..0000000 --- a/docs/methods/PoPoolationTE.rst +++ /dev/null @@ -1,8 +0,0 @@ - -============= -PoPoolationTE -============= - - - - diff --git a/docs/methods/PoPoolationTE2.rst b/docs/methods/PoPoolationTE2.rst deleted file mode 100644 index 96a7fa3..0000000 --- a/docs/methods/PoPoolationTE2.rst +++ /dev/null @@ -1,5 +0,0 @@ - - -============== -PoPoolationTE2 -============== diff --git a/docs/methods/RelocaTE.rst b/docs/methods/RelocaTE.rst deleted file mode 100644 index 4224712..0000000 --- a/docs/methods/RelocaTE.rst +++ /dev/null @@ -1,4 +0,0 @@ - -======== -RelocaTE -======== diff --git a/docs/methods/RelocaTE2.rst b/docs/methods/RelocaTE2.rst deleted file mode 100644 index d2a24e5..0000000 --- a/docs/methods/RelocaTE2.rst +++ /dev/null @@ -1,5 +0,0 @@ - -========= -RelocaTE2 -========= - diff --git a/docs/methods/RetroSeq.rst b/docs/methods/RetroSeq.rst deleted file mode 100644 index a2b346c..0000000 --- a/docs/methods/RetroSeq.rst +++ /dev/null @@ -1,6 +0,0 @@ - - -======== -RetroSeq -======== - diff --git a/docs/methods/TE-locate.rst b/docs/methods/TE-locate.rst deleted file mode 100644 index 8541c80..0000000 --- a/docs/methods/TE-locate.rst +++ /dev/null @@ -1,5 +0,0 @@ - -========= -TE-locate -========= - diff --git a/docs/methods/TEFLoN.rst b/docs/methods/TEFLoN.rst deleted file mode 100644 index 32615eb..0000000 --- a/docs/methods/TEFLoN.rst +++ /dev/null @@ -1,5 +0,0 @@ - -====== -TEFLoN -====== - diff --git a/docs/methods/TEMP.rst b/docs/methods/TEMP.rst deleted file mode 100644 index 766225e..0000000 --- a/docs/methods/TEMP.rst +++ /dev/null @@ -1,5 +0,0 @@ - -==== -TEMP -==== - diff --git a/docs/methods/coverage.rst b/docs/methods/coverage.rst deleted file mode 100644 index b7f8f00..0000000 --- a/docs/methods/coverage.rst +++ /dev/null @@ -1,5 +0,0 @@ - -======== -Coverage -======== - diff --git a/docs/methods/mapreads.rst b/docs/methods/mapreads.rst deleted file mode 100644 index 574f2b0..0000000 --- a/docs/methods/mapreads.rst +++ /dev/null @@ -1,5 +0,0 @@ - -========= -map_reads -========= - diff --git a/docs/methods/ngs_te_mapper.rst b/docs/methods/ngs_te_mapper.rst deleted file mode 100644 index 2770b47..0000000 --- a/docs/methods/ngs_te_mapper.rst +++ /dev/null @@ -1,5 +0,0 @@ - -============= -ngs_te_mapper -============= - diff --git a/docs/methods/trimgalore.rst b/docs/methods/trimgalore.rst deleted file mode 100644 index 415c4ef..0000000 --- a/docs/methods/trimgalore.rst +++ /dev/null @@ -1,5 +0,0 @@ - -========== -TrimGalore -========== - diff --git a/docs/running/config.rst b/docs/running/config.rst index 38fdb3d..170ee62 100644 --- a/docs/running/config.rst +++ b/docs/running/config.rst @@ -1,5 +1,842 @@ -=================== +################### Method config files -=================== +################### +Each component method has associated config file(s) that enable the modification of run parameters and filtering parameters. TE detection components have separate run and post processing config files. These files are located in the `config directory `_ of mcclintock :code:`/path/to/mcclintock/config`. + +*********************** +Example run config file +*********************** +.. code:: python + + PARAMS = { + '-l' : 10, + '-m' : 0.0, + '-bm' : 10, + '-bt' : 7, + '-f' : 100 + } + +Each config file contains a python :code:`dict` named :code:`PARAMS` which contains :code:`key : value` pairs with the :code:`key` being the flag or name of the parameter, and the :code:`value` being the value associated with the parameter. Most of the :code:`values` are set to the default parameters for each tool. The :code:`values` can be modified to better fit the data being used. + +******** +Coverage +******** +:code:`/path/to/mcclintock/config/coverage/coverage.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "omit_edges": True, + "omit_edges_read_length" : True, + "omit_edges_length" : 300 + } + +This config file contains the parameters that can be modified for the :code:`coverage` component method. + +omit_edges + * Omits the edges of the TE sequence coverage when calculating average depth across the element. + +omit_edges_read_length + * If :code:`omit_edges: True` and :code:`omit_edges_read_length True`, then the read length will be used as the length of the edges to omit + +omit_edges_length + * If :code:`omit_edges: True` and :code:`omit_edges_read_length False`, the value of :code:`omit_edges_length` will be used as the length of the edges to omit + +************* +ngs_te_mapper +************* + +run config +========== +:code:`/path/to/mcclintock/config/ngs_te_mapper/ngs_te_mapper_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "tsd=" : 20 + } + +This config file contains the parameters that can be modified to influence the running of the :code:`ngs_te_mapper` component method. + +tsd= + * The max length of junction read overlap to consider a target site duplication + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/ngs_te_mapper/ngs_te_mapper_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "min_read_support" : 0 + } + +This config file contains the parameters that influence the post processing of the :code:`ngs_te_mapper` predictions. + +min_read_support + * the minimum number of reads supporting an insertion required to be considered a valid prediction + + +************** +ngs_te_mapper2 +************** + +run config +========== +:code:`/path/to/mcclintock/config/ngs_te_mapper2/ngs_te_mapper2_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "--window" : 10, + "--min_mapq" : 20, + "--min_af" : 0.1, + "--tsd_max" : 25, + "--gap_max" : 5 + } + +This config file contains the parameters that can be modified to influence the running of the :code:`ngs_te_mapper2` component method. + +--window + * size of the window to merge for identifying TE clusters + +--min_mapq + * minimum mapping quality of alignment + +--min_af + * minimum allele frequency + +--tsd_max + * maximum target site duplication size + +--gap_max + * maximum gap size + +************* +popoolationte +************* + +run config +========== +:code:`/path/to/mcclintock/config/popoolationte/popoolationte_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "identify-te-insertsites.pl" : { + "--min-count" : 3, + "--min-map-qual" : 15 + }, + "crosslink-te-sites.pl" : { + "--single-site-shift": 100 + }, + "update-teinserts-with-knowntes.pl" : { + "--single-site-shift": 100 + }, + "estimate-polymorphism.pl" : { + "--min-map-qual": 15 + }, + "filter-teinserts.pl" : { + "--min-count": 5 + } + } + +identify-te-insertsites.pl + * Identifies TE insertion sites (forward or reverse insertion) from a sam file + * :code:`--min-count`: the minimum number of PE-fragments that confirm the insertion of a TE of a certain family + * :code:`--min-map-qual`: the minimum mapping quality; this will only apply to reads mapping to a reference contig. + +crosslink-te-sites.pl + * Crosslinks forward and reverse insertions and outputs transposable element insertions + * :code:`--single-site-shift`: the exact position of a TE insertion can only be approximated for TE insertions where only the forward or only the reverse insertion was found. For forward insertions the positon of the TE insertion is calculated as the end of the range. For reverse insertions the position of the TE insertion is calculated as the start of the range + +update-teinserts-with-knowntes.pl + * :code:`--single-site-shift`: the exact position of a TE insertion can only be approximated for TE insertions where only the forward or only the reverse insertion was found. For forward insertions the positon of the TE insertion is calculated as the end of the range. For reverse insertions the position of the TE insertion is calculated as the start of the range + +estimate-polymorphism.pl + * Estimate the insertion frequencies for a given set of TE insertions + * :code:`--min-map-qual`: the minimum mapping quality + +filter-teinserts.pl + * :code:`--min-count`: the minimum number of PE-fragments that confirm the insertion of a TE of a certain family + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/popoolationte/popoolationte_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "require_both_end_support" : True, + "percent_read_support_threshold" : 0.1 + } + +require_both_end_support + * requires that final results have support on both ends of prediction + +percent_read_support_threshold + * threshold for the minimum acceptable fraction of the reads supporting the prediction + + +************** +popoolationte2 +************** + +run config +========== +:code:`/path/to/mcclintock/config/popoolationte2/popoolationte2_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "ppileup" : { + "--map-qual": 15, + "--sr-mindist" : 10000, + "--id-up-quant": 0.01 + }, + + "subsampleppileup" : { + "run" : False, + "--target-coverage": 100, + "--with-replace": False + }, + + "identifySignatures" : { + "--min-count": 2.0, + "--signature-window": "median", + "--min-valley": "median", + "--chunk-distance": 5 + }, + + "updateStrand" : { + "--map-qual": 15, + "--max-disagreement": 0.1, + "--sr-mindist": 10000, + "--id-up-quant": 0.01 + }, + + "pairupSignatures" : { + "--min-distance": -100, + "--max-distance": 500, + "--max-freq-diff": 1.0 + } + + } + +ppileup + * create a physical pileup file from one or multiple bam files + * :code:`--map-qual`: minimum mapping quality + * :code:`--sr-mindist`: minimum inner distance for structural rearrangements + * :code:`--id-up-quant`: paired end fragments with an insert size in the upper quantile will be ignored + +subsampleppileup + * subsample a ppileup file to uniform coverage + * :code:`run`: The subsampleppileup step is optional. Set this option to :code:`True` if you wish to perform this step + * :code:`--target-coverage`: the target coverage of the output file + * :code:`--with-replace`: use sampling with replacement instead of without replacement + +identifySignatures + * identify signatures of TE insertions + * :code:`--min-count`: the minimum count of a TE insertion + * :code:`--signature-window`: the window size of the signatures of TE insertions. options: :code:`median`, :code:`fixNNNN`, :code:`minimumSampleMedian`, :code:`maximumSampleMedian` + * :code:`min-valley`: the minimum size of the valley between two consecutive signatures of the same family. options: :code:`median`, :code:`fixNNNN`, :code:`minimumSampleMedian`, :code:`maximumSampleMedian` + * :code:`--chunk-distance`: minimum distance between chromosomal chunks in multiples of :code:`--min-valley` + +updateStrand + * estimate the strand of TEs for signatures of TE insertions + * :code:`--map-qual`: minimum mapping quality + * :code:`--max-disagreement`: the maximum disagreement for the strand of the TE insertion in fraction of reads + * :code:`--sr-mindist`: minimum inner distance for structural rearrangements + * :code:`--id-up-quant`: paired end fragments with an insert size in the upper quantile will be ignored + +pairupSignatures + * pairs up signatures of TE insertions and yields TE insertions + * :code:`--min-distance`: the minimum distance between signatures + * :code:`--max-distance`: the maximum distance between signatures + * :code:`--max-freq-diff`: the maximum frequency difference between signatures + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/popoolationte2/popoolationte2_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "require_both_end_support" : True, + "frequency_threshold" : 0.1 + } + +require_both_end_support + * require that the TE prediction have support from both junctions + +frequency_threshold + * threshold for the minimum acceptable (average physical coverage supporting the insertion of the given TE) / (average physical coverage) + + +******** +relocate +******** + +run config +========== +:code:`/path/to/mcclintock/config/relocate/relocate_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + '-l' : 10, + '-m' : 0.0, + '-bm' : 10, + '-bt' : 7, + '-f' : 100 + } + +-l + * len cutoff for the TE trimmed reads to be aligned + +-m + * mismatch allowance for alignment to TE + +-bm + * blat minScore value, used by blat in the comparison of reads to TE sequence + +-bt + * blat tileSize value, used by blat in the comparison of reads to TE sequence + +-f + * length of the sequence flanking the found insertion to be returned. This sequence is taken from the reference genome + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/relocate/relocate_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "ref_left_threshold" : 0, + "ref_right_threshold" : 0, + "nonref_left_threshold" : 0, + "nonref_right_threshold" : 0 + } + +ref_left_threshold + * minimum number of left flanking reads for a reference prediction. + +ref_right_threshold + * minimum number of right flanking reads for a reference prediction. + +nonref_left_threshold + * minimum number of left flanking reads for a non-reference prediction. + +nonref_right_threshold + * minimum number of right flanking reads for a non-reference prediction. + + +********* +relocate2 +********* + +run config +========== +:code:`/path/to/mcclintock/config/relocate2/relocate2_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + '--aligner' : "blat", + '--len_cut_match' : 10, + '--len_cut_trim' : 10, + '--mismatch' : 2, + '--mismatch_junction' : 2 + } + +--aligner + * aligner used to map reads to repeat elements + +--len_cut_match + * length cutoff threshold for match between reads and repeat elements. Large value will lead to less sensitive but more accuracy + +--len_cut_trim + * length cutoff threshold for trimed reads after trimming repeat sequence from reads. Large value will lead to less sensitive but more accuracy + +--mismatch + * Number of mismatches allowed for matches between reads and repeat elements + +--mismatch_junction + * Number of mismatches allowed for matches between junction reads and repeat elements + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/relocate2/relocate2_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "ref_left_support_threshold" : 0, + "ref_right_support_threshold" : 0, + "ref_left_junction_threshold" : 0, + "ref_right_junction_threshold" : 0, + + "nonref_left_support_threshold" : 0, + "nonref_right_support_threshold" : 0, + "nonref_left_junction_threshold" : 0, + "nonref_right_junction_threshold" : 0 + } + +ref_left_support_threshold + * Minimum number of reads not covering the junction of TE insertion, but supporting TE insertion by paired-end reads on left side/downstream of a reference prediction + +ref_right_support_threshold + * Minimum number of reads not covering the junction of TE insertion, but supporting TE insertion by paired-end reads on right side/downstream of a reference prediction + +ref_left_junction_threshold + * Minimum number of reads covering the junction of TE insertion on left side/upstream of a reference prediction + +ref_right_junction_threshold + * Minimum number of reads covering the junction of TE insertion on right side/downstream of a reference prediction + +nonref_left_support_threshold + * Minimum number of reads not covering the junction of TE insertion, but supporting TE insertion by paired-end reads on left side/downstream of a non-reference prediction + +nonref_right_support_threshold + * Minimum number of reads not covering the junction of TE insertion, but supporting TE insertion by paired-end reads on right side/downstream of a non-reference prediction + +nonref_left_junction_threshold + * Minimum number of reads covering the junction of TE insertion on left side/upstream of a non-reference prediction + +nonref_right_junction_threshold + * Minimum number of reads covering the junction of TE insertion on right side/downstream of a non-reference prediction + + +******** +retroseq +******** + +run config +========== +:code:`/path/to/mcclintock/config/retroseq/retroseq_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "-depth" : 200, + "-reads" : 10, + "-q": 20 + } + +-depth + * Max average depth of a region to be considered for calling + +-reads + * Minimum number of reads required to make a call + +-q + * Minimum mapping quality for a read mate that anchors the insertion call + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/retroseq/retroseq_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "read_support_threshold" : 0, + "breakpoint_confidence_threshold" : 6 + } + +read_support_threshold + * Minimum number of correctly mapped read pairs spanning breakpoint for predictions + +breakpoint_confidence_threshold + * Minimum FL tag for predictions. The FL tag ranges from 1-8 and gives information on the breakpoint with 8 being the most confident calls and lower values indicating calls that don’t meet the breakpoint criteria for reasons such as lack of 5’ or 3’ reads + + +******* +tebreak +******* + +run config +========== +:code:`/path/to/mcclintock/config/tebreak/tebreak_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "--minMWP": "0.01", + "--min_minclip" : "3", + "--min_maxclip" : "10", + "--min_sr_per_break" : "1", + "--min_consensus_score" : "0.9", + "--min_chr_len" : "0", + "--max_ins_reads" : "1000", + "--min_split_reads" : "4", + "--min_prox_mapq" : "10", + "--max_N_consensus" : "4", + "--max_disc_fetch" : "50", + "--min_disc_reads" : "4", + "--sr_density" : "2.0", + "--min_ins_match" : "0.90", + "--min_ref_match" : "0.98", + "--min_cons_len" : "250", + "--keep_all_tmp_bams" : False, + "--skip_final_filter" : False, + "--debug": False + } + +--minMWP + * minimum Mann-Whitney P-value for split qualities + +--min_minclip + * min. shortest clipped bases per cluster + +--min_maxclip + * min. longest clipped bases per cluster + +--min_sr_per_break + * minimum split reads per breakend + +--min_consensus_score + * quality of consensus alignment + +--min_chr_len + * minimum chromosome length to consider in discordant site search + +--max_ins_reads + * maximum number of reads to use per insertion call + +--min_split_reads + * minimum total split reads per insertion call + +--min_prox_mapq + * minimum map quality for proximal subread + +--max_N_consensus + * exclude breakend seqs with > this number of N bases + +--max_disc_fetch + * maximum number of discordant reads to fetch per insertion site per BAM + +--min_disc_reads + * if using -d/--disco_target, minimum number of discordant reads to trigger a call + +--sr_density + * maximum split read density in chunk + +--min_ins_match + * (output) minumum match to insertion library + +--min_ref_match + * (output) minimum match to reference genome + +--min_cons_len + * (output) min total consensus length + +--keep_all_tmp_bams + * leave ALL temporary BAMs + +--skip_final_filter + * do not apply final filters or fix for orientation + +--debug + * run in debug mode + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/tebreak/tebreak_post.py` : `GitHub link `_ + +.. code:: python + + MIN_5P_ELT_MATCH = 0.0 + MIN_3P_ELT_MATCH = 0.0 + MIN_5P_GENOME_MATCH = 0.0 + MIN_3P_GENOME_MATCH = 0.0 + MIN_SPLIT_READS_5P = 0 + MIN_SPLIT_READS_3P = 0 + MIN_REMAPPED_DISCORDANT = 0 + MIN_REMAP_DISC_FRACTION = 0.0 + MIN_REMAPPED_SPLITREADS = 0 + MIN_REMAP_SPLIT_FRACTION = 0.0 + +MIN_5P_ELT_MATCH + * Minimum fraction of bases matched to reference for inserted sequence on insertion seqment of 5' supporting contig + +MIN_3P_ELT_MATCH + * Minimum fraction of bases matched to reference for inserted sequence on insertion seqment of 3' supporting contig + +MIN_5P_GENOME_MATCH + * Minimum fraction of bases matched to reference genome on genomic segment of 5' supporting contig + +MIN_3P_GENOME_MATCH + * Minimum fraction of bases matched to reference genome on genomic segment of 3' supporting contig + +MIN_SPLIT_READS_5P + * Minimum number of split reads supporting 5' end of the insertion + +MIN_SPLIT_READS_3P + * Minimum number of split reads supporting 3' end of the insertion + +MIN_REMAPPED_DISCORDANT + * Minimum number of discordant read ends re-mappable to insertion reference sequence + +MIN_REMAP_DISC_FRACTION + * Minimum proportion of remapped discordant reads mapping to the reference insertion sequence + +MIN_REMAPPED_SPLITREADS + * Minimum number of split reads re-mappable to insertion reference sequence + +MIN_REMAP_SPLIT_FRACTION + * Minimum proportion of remapped split reads mapping to the reference insertion sequence + +****** +teflon +****** + +run config +========== +:code:`/path/to/mcclintock/config/teflon/teflon_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "-q": 20, + "-sd" : None, + "-cov" : None, + "-n1" : 1, + "-n2" : 1, + "-lt" : 1, + "-ht" : None + } + +-q + * Map quality threshold. Mapped reads with map qualities lower than this number will be discarded + +-sd + * Insert size standard deviation. Used to manually override the insert size StdDev identified by samtools stat (check this number in the generated stats.txt file to ensure it seems more or less correct based on knowledge of sequencing library!) + * Set to :code:`None` if you want this value to be calculated by TEFLoN + +-cov + * Coverage override. Used to manually override the coverage estimate if you get the error: :code:`Warning: coverage could not be estimated` + +-n1 + * TEs must be supported by >= n reads in at least one sample + +-n2 + * TEs must be supported by >= n reads summed across all samples + +-lt + * sites genotyped as -9 if adjusted read counts lower than this threshold + +-ht + * sites genotyped as -9 if adjusted read counts higher than this threshold + * Set to :code:`None` if you want this value to be calculated by TEFLoN + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/teflon/teflon_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "min_presence_reads" : 3, + "max_absence_reads" : None, + "min_presence_fraction" : 0.1, + "require_tsd" : True, + "require_both_breakpoints" : True + + } + +min_presence_reads + * Minimum number of reads supporting the presence of the insertion + +max_absence_reads + * Maximum number of reads that support the absence of the insertion + +min_presence_fraction + * The minimum fraction of reads supporting the presence of the insertion. presence/(absence+presence) + +require_tsd + * If :code:`True`, non-ref predictions must have a target site duplication + +require_both_breakpoints + * If :code:`True`, non-ref predictions must have both breakpoints predicted + + +********* +te-locate +********* + +run config +========== +:code:`/path/to/mcclintock/config/telocate/telocate_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "max_mem" : 4, + "min_distance" : 5, + "min_support_reads" : 3, + "min_support_individuals" : 1 + } + +max_mem + * Max memory to use (GB) + +min_distance + * resolution for the loci, if a supporting read-pair is found in a distance up to this value it is counted for the same event. Multiplied by median insert size. + +min_support_reads + * only events supported by this number of reads in all accessions are kept + +min_support_individuals + * only events supported in this number of individuals are kept. + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/telocate/telocate_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "read_pair_support_threshold" : 0 + } + +read_pair_support_threshold + * Minimum number of read pairs supporting a prediction + +**** +temp +**** + +run config +========== +:code:`/path/to/mcclintock/config/temp/temp_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + '-x' : 30, + '-m' : 1 + } + +-x + * The minimum score difference between the best hit and the second best hit for considering a read as uniquely mapped. For BWA mem. + +-m + * Number of mismatch allowed when mapping to TE concensus sequences. + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/temp/temp_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "acceptable_insertion_support_classes" : ["1p1"], + "frequency_threshold" : 0.1 + } + +acceptable_insertion_support_classes + * valid options: :code:`1p1`, :code:`2p`, :code:`singleton` + * The class of the insertions that are acceptable for final predictions. + * :code:`1p1` means that the detected insertion is supported by reads at both sides. + * :code:`2p` means the detected insertion is supported by more than 1 read at only 1 side. + * :code:`singleton` means the detected insertion is supported by only 1 read at 1 side. + +frequency_threshold + * minimum frequency of the inserted transposon + + +***** +temp2 +***** + +run config +========== +:code:`/path/to/mcclintock/config/temp2/temp2_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS ={ + "insertion" : { + "-M" : 2, + "-m" : 5, + "-U" : 0.8, + "-N" : 300, + "-T" : False, + "-L" : False, + "-S" : False + }, + + "absence" : { + "-x" : 0 + } + } + +insertion + * :code:`-M` : Percentage of mismatch allowed when anchor to genome. + * :code:`-m` : Percentage of mismatch allowed when mapping to TEs. + * :code:`-U` : The ratio between the second best alignment and the best alignment to judge if a read is uniquely mapped. + * :code:`-N` : window size (+-n) for filtering insertions overlapping reference insertions. + * :code:`-T` : Set this parameter to :code:`True` to allow truncated de novo insertions. + * :code:`-L` : Set this parameter to :code:`True` to use a looser criteria to filter reference annotated copy overlapped insertions. + * :code:`-S` : Set this parameter to :code:`True` to skip insertion length checking; Default is to remove those insertions that are not full length of shorter than 500bp. + +absence + * :code:`-x` : The minimum score difference between the best hit and the second best hit for considering a read as uniquely mapped. For BWA MEM. + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/temp2/temp2_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "acceptable_insertion_support_classes" : ["1p1"], + "frequency_threshold" : 0.1 + } + +acceptable_insertion_support_classes + * valid options: :code:`1p1`, :code:`2p`, :code:`singleton` + * The class of the insertions that are acceptable for final predictions. + * :code:`1p1` means that the detected insertion is supported by reads at both sides. + * :code:`2p` means the detected insertion is supported by more than 1 read at only 1 side. + * :code:`singleton` means the detected insertion is supported by only 1 read at 1 side. + +frequency_threshold + * minimum frequency of the inserted transposon. It generally means what fraction of sequenced genome present this insertion. + + +********** +trimgalore +********** + +:code:`/path/to/mcclintock/config/trimgalore/trimgalore.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "single_end": { + "--fastqc": True, + }, + + "paired_end": { + "--fastqc": True, + "--paired": True + } + } + +single_end + * :code:`--fastqc`: Set to true to run fastqc when using single end data + +paired_end + * :code:`--fastqc`: Set to true to run fastqc when using paired end data + * :code:`--paired`: Run trimgalore using length trimming of quality/adapter/RRBS trimmed reads for paired-end data \ No newline at end of file diff --git a/install/Snakefile b/install/Snakefile index 022372f..4b7c346 100644 --- a/install/Snakefile +++ b/install/Snakefile @@ -221,6 +221,21 @@ rule jitterbug: script: config['paths']['install']+"scripts/jitterbug.py" +rule tebreak: + params: + url = config['URLs']['tebreak'], + md5 = config['MD5s']['tebreak'], + zipfile = config['paths']['install']+"tools/tebreak/tebreak.zip", + log = config['paths']['log_dir']+"install.log", + + output: + config['output']['tebreak'] + + conda: config['ENVs']['tebreak'] + + script: + config['paths']['install']+"scripts/tebreak.py" + rule install_all: input: config['output']['te-locate'], diff --git a/install/envs/mcc_map_reads.yml b/install/envs/mcc_map_reads.yml index e8ca860..144bbc2 100644 --- a/install/envs/mcc_map_reads.yml +++ b/install/envs/mcc_map_reads.yml @@ -12,3 +12,4 @@ dependencies: - perl-bioperl-run=1.006900 - bwa=0.7.4 - jinja2=2.10.1 + - picard=2.25.4 diff --git a/install/envs/mcc_ngs_te_mapper2.yml b/install/envs/mcc_ngs_te_mapper2.yml index 4f72cde..80bd3aa 100644 --- a/install/envs/mcc_ngs_te_mapper2.yml +++ b/install/envs/mcc_ngs_te_mapper2.yml @@ -8,11 +8,12 @@ dependencies: - _openmp_mutex=4.5=1_gnu - bcftools=1.9=h68d8f2e_9 - bedtools=2.29.2=hc088bd4_0 + - biopython=1.78=py36h8f6f2f9_2 - bwa=0.7.17=hed695b0_7 - bzip2=1.0.8=h7f98852_4 - c-ares=1.11.0=h470a237_1 - - ca-certificates=2020.12.5=ha878542_0 - - certifi=2020.12.5=py36h5fab9bb_0 + - ca-certificates=2021.5.30=ha878542_0 + - certifi=2021.5.30=py36h5fab9bb_0 - curl=7.71.1=he644dc0_8 - entrez-direct=13.9=pl526h375a9b1_0 - expat=2.2.9=he1b5a44_2 @@ -35,6 +36,7 @@ dependencies: - libgfortran-ng=9.3.0=he4bcb1c_17 - libgfortran5=9.3.0=he4bcb1c_17 - libgomp=9.3.0=h5dbcf3e_17 + - liblapack=3.9.0=6_openblas - libnghttp2=1.41.0=hab1572f_1 - libopenblas=0.3.12=pthreads_h4812303_1 - libssh2=1.9.0=hab1572f_5 @@ -42,7 +44,8 @@ dependencies: - minimap2=2.17=hed695b0_3 - ncurses=6.1=hf484d3e_1002 - nettle=3.3=0 - - openssl=1.1.1i=h7f98852_0 + - numpy=1.19.5=py36h2aa4a07_1 + - openssl=1.1.1k=h7f98852_0 - pcre=8.44=he1b5a44_0 - perl=5.26.2=h36c2ea0_1008 - perl-app-cpanminus=1.7044=pl526_1 diff --git a/install/envs/mcc_tebreak.yml b/install/envs/mcc_tebreak.yml new file mode 100644 index 0000000..76af78b --- /dev/null +++ b/install/envs/mcc_tebreak.yml @@ -0,0 +1,20 @@ +name: tebreak +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - python=3.8 + - bwa=0.7.17 + - pysam=0.16.0.1 + - scipy=1.6.3 + - bx-python=0.8.11 + - scikit-bio=0.5.6 + - last=1238 + - htslib=1.12 + - samtools=1.12 + - bcftools=1.12 + - minia=3.2.4 + - exonerate=2.4.0 + - lockfile=0.12.2 + - ipython_genutils=0.2.0 \ No newline at end of file diff --git a/install/envs/mcc_tepid.yml b/install/envs/mcc_tepid.yml index eda6502..d36ac9a 100644 --- a/install/envs/mcc_tepid.yml +++ b/install/envs/mcc_tepid.yml @@ -4,4 +4,5 @@ channels: - conda-forge - defaults dependencies: - - tepid=0.10 \ No newline at end of file + - tepid=0.10 + - tbb=2020.2 \ No newline at end of file diff --git a/install/scripts/ngs_te_mapper2.py b/install/scripts/ngs_te_mapper2.py index 6c9f9ce..0888892 100644 --- a/install/scripts/ngs_te_mapper2.py +++ b/install/scripts/ngs_te_mapper2.py @@ -6,7 +6,7 @@ def main(): install_path = snakemake.config['paths']['install']+"/tools/" - raw_name="ngs_te_mapper2-1.01" + raw_name="ngs_te_mapper2-220619a3dd00b8c05c3b96328eb2424c6aad623e" method_name = "ngs_te_mapper2" mccutils.remove(snakemake.params.zipfile) diff --git a/install/scripts/tebreak.py b/install/scripts/tebreak.py new file mode 100644 index 0000000..a35e3c3 --- /dev/null +++ b/install/scripts/tebreak.py @@ -0,0 +1,43 @@ +import sys +import os +sys.path.append(snakemake.config['paths']['mcc_path']) +import scripts.mccutils as mccutils + +def main(): + install_path = snakemake.config['paths']['install']+"/tools/" + + raw_name="tebreak-3f00badda822dcb6390bbea5a6a1e233cff3e99c" + method_name = "tebreak" + + mccutils.remove(snakemake.params.zipfile) + download_success = mccutils.download(snakemake.params.url, snakemake.params.zipfile, md5=snakemake.params.md5, max_attempts=3) + + if not download_success: + print(method_name+" download failed... exiting...") + print("try running --install with --clean for clean installation") + sys.exit(1) + + mccutils.remove(snakemake.config['paths']['install']+raw_name) + command = ["unzip", snakemake.params.zipfile] + mccutils.run_command(command, log=snakemake.params.log) + + mccutils.remove(install_path+raw_name) + command = ["mv", snakemake.config['paths']['install']+raw_name, install_path] + mccutils.run_command(command, log=snakemake.params.log) + + mccutils.remove(install_path+method_name) + mccutils.mkdir(install_path+method_name) + for f in os.listdir(install_path+raw_name): + command = ["mv", install_path+raw_name+"/"+f, install_path+method_name] + mccutils.run_command(command, log=snakemake.params.log) + + + mccutils.remove(install_path+raw_name) + mccutils.remove(snakemake.params.zipfile) + + # write version to file + with open(install_path+"/"+method_name+"/version.log","w") as version: + version.write(snakemake.params.md5) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/internal/install.py b/internal/install.py index 916d094..1e563d1 100644 --- a/internal/install.py +++ b/internal/install.py @@ -5,13 +5,14 @@ "temp2": "https://github.com/weng-lab/TEMP2/archive/4d25d050dc7832b82374502573e882a242457a0c.zip", "relocate" : "https://github.com/srobb1/RelocaTE/archive/ce3a2066e15f5c14e2887fdf8dce0485e1750e5b.zip", "ngs_te_mapper" : "https://github.com/bergmanlab/ngs_te_mapper/archive/f9f48996ac346ac86d57edbd00534aa1227b753e.zip", - "ngs_te_mapper2": "https://github.com/bergmanlab/ngs_te_mapper2/archive/v1.01.zip", + "ngs_te_mapper2": "https://github.com/bergmanlab/ngs_te_mapper2/archive/220619a3dd00b8c05c3b96328eb2424c6aad623e.zip", "popoolationte" : "http://downloads.sourceforge.net/project/popoolationte/popoolationte_1.02.zip", "popoolationte2": "http://downloads.sourceforge.net/project/popoolation-te2/popte2-v1.10.03.jar", "relocate2" : "https://github.com/stajichlab/RelocaTE2/archive/v2.0.1.zip", "tepid" : "https://github.com/ListerLab/TEPID/archive/ad46d65b5c41bf8a9171215d49b3ffaecdceaab0.zip", "teflon": "https://github.com/jradrion/TEFLoN/archive/3e2d67886b70644fd1f7d79263b3c8dbed639e46.zip", - "jitterbug": "https://github.com/elzbth/jitterbug/archive/b6b3f9c7ee4af042d4410137269f174a1399b752.zip" + "jitterbug": "https://github.com/elzbth/jitterbug/archive/b6b3f9c7ee4af042d4410137269f174a1399b752.zip", + "tebreak": "https://github.com/adamewing/tebreak/archive/3f00badda822dcb6390bbea5a6a1e233cff3e99c.zip" } MD5 = { @@ -21,13 +22,14 @@ "temp2" : "55b6449113e85540f8a39bb02cfec9b0", "relocate" : "64cc60f65154368518529fb57becc3d6", "ngs_te_mapper" : "a00e9962ee3daca9fac915c23c6af203", - "ngs_te_mapper2": "84232b8231bebe3888ab58954ef329e2", + "ngs_te_mapper2": "95198ab68fcb4f9dc267460a7f906fae", "popoolationte" : "315d266bd6da7487c919576dc46b92b4", "popoolationte2": "0c06186f0b1df215949f57a5cbb2d039", "relocate2" : "63eb7dbc09ffeb4063aef4857d198b06", "tepid" : "51f61acf138b07842074f054aba780d5", "teflon" : "a29d2e7411fd06778186ba322f51fdc9", - "jitterbug": "05f165fb6a79a7a611d09981a355d27e" + "jitterbug": "05f165fb6a79a7a611d09981a355d27e", + "tebreak": "ac475d3cc4b0435f0a333c8d86f57d3f" } @@ -50,7 +52,8 @@ "setup_reads": ENV_PATH+"mcc_trimgalore.yml", "processing": ENV_PATH+"mcc_map_reads.yml", "map_reads": ENV_PATH+"mcc_map_reads.yml", - "trimgalore": ENV_PATH+"mcc_trimgalore.yml" + "trimgalore": ENV_PATH+"mcc_trimgalore.yml", + "tebreak": ENV_PATH+"mcc_tebreak.yml" } @@ -71,5 +74,7 @@ "teflon" : INSTALL_PATH+"tools/teflon/teflon.v0.4.py", "jitterbug": INSTALL_PATH+"tools/jitterbug/jitterbug.py", "map_reads" : INSTALL_PATH+"tools/map_reads/map_reads.log", - "trimgalore" : INSTALL_PATH+"tools/trimgalore/trimgalore.log" + "trimgalore" : INSTALL_PATH+"tools/trimgalore/trimgalore.log", + "tebreak" : INSTALL_PATH+"tools/tebreak/tebreak/tebreak" + } diff --git a/internal/sysconfig.py b/internal/sysconfig.py index 91104f3..dc8eca4 100644 --- a/internal/sysconfig.py +++ b/internal/sysconfig.py @@ -1,6 +1,6 @@ -ALL_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "relocate2", "temp", "temp2", "retroseq", "popoolationte", "popoolationte2", "te-locate", "teflon", "coverage", "trimgalore","map_reads"] #"ngs_te_mapper2", -SINGLE_END_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "coverage", "trimgalore", "map_reads"] #"ngs_te_mapper2", -MULTI_THREAD_METHODS = ["coverage", "temp", "temp2", "relocate2", "ngs_te_mapper", "ngs_te_mapper2", "popoolationte", "teflon", "trimgalore"] #"ngs_te_mapper2", +ALL_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "relocate2", "temp", "temp2", "retroseq", "popoolationte", "popoolationte2", "te-locate", "teflon", "coverage", "trimgalore","map_reads", "tebreak"] +SINGLE_END_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "coverage", "trimgalore", "map_reads", "tebreak"] +MULTI_THREAD_METHODS = ["coverage", "temp", "temp2", "relocate2", "ngs_te_mapper", "ngs_te_mapper2", "popoolationte", "teflon", "trimgalore", "tebreak"] NO_INSTALL_METHODS = ["trimgalore", "map_reads", "coverage", "relocate2"] # no source code to install for these methods, just envs INPUT_DIR = "{{indir}}" @@ -23,7 +23,10 @@ "teflon": ["teflon/teflon_run.py", "teflon/teflon_post.py"], "te-locate": ["telocate/telocate_run.py", "telocate/telocate_post.py"], "temp": ["temp/temp_run.py", "temp/temp_post.py"], - "temp2": ["temp2/temp2_run.py", "temp2/temp2_post.py"] + "temp2": ["temp2/temp2_run.py", "temp2/temp2_post.py"], + "jitterbug": ["jitterbug/jitterbug_run.py", "jitterbug/jitterbug_post.py"], + "tepid": ["tepid/tepid_run.py", "tepid/tepid_post.py"], + "tebreak": ["tebreak/tebreak_run.py", "tebreak/tebreak_post.py"] } # rules to re-run if specific config files change @@ -40,7 +43,10 @@ "teflon": ["teflon_run", "teflon_post"], "te-locate": ["telocate_run", "telocate_post"], "temp": ["run_temp", "process_temp"], - "temp2": ["run_temp2", "process_temp2"] + "temp2": ["run_temp2", "process_temp2"], + "jitterbug": ["jitterbug_run", "jitterbug_post"], + "tepid": ["tepid_run", "tepid_post"], + "tebreak": ["tebreak_run", "tebreak_post"] } LOG_DIR = "{{logdir}}" @@ -57,6 +63,9 @@ "te-locate": LOG_DIR+"status/telocate.status", "temp": LOG_DIR+"status/temp.status", "temp2": LOG_DIR+"status/temp2.status", + "jitterbug": LOG_DIR+"status/jitterbug.status", + "tepid": LOG_DIR+"status/tepid.status", + "tebreak": LOG_DIR+"status/tebreak.status" } @@ -86,6 +95,7 @@ 'fq2' : SAM_DIR+"intermediate/fastq/"+SAMPLE_NAME+"_2.fq", 'sam' : SAM_DIR+"intermediate/mapped_reads/"+SAMPLE_NAME+".sam", 'bam' : SAM_DIR+"intermediate/mapped_reads/"+SAMPLE_NAME+".sorted.bam", + 'duplicate_maked_bam' : SAM_DIR+"intermediate/mapped_reads/"+SAMPLE_NAME+".sorted.duplicate_marked.bam", 'telocate_sam' : SAM_DIR+"intermediate/mapped_reads/"+SAMPLE_NAME+".telocate.sam", 'flagstat' : SAM_DIR+"intermediate/mapped_reads/"+SAMPLE_NAME+".bam.flagstat", 'median_insert_size' : SAM_DIR+"intermediate/mapped_reads/median_insert.size", @@ -112,7 +122,8 @@ 'relocate2': RESULTS_DIR+"relocate2/", 'tepid': RESULTS_DIR+"tepid/", 'teflon': RESULTS_DIR+"teflon/", - 'jitterbug': RESULTS_DIR+"jitterbug/" + 'jitterbug': RESULTS_DIR+"jitterbug/", + 'tebreak': RESULTS_DIR+"tebreak/" } METHOD_DIR = "{{method}}" @@ -133,7 +144,8 @@ 'relocate2': METHOD_DIR+SAMPLE_NAME+"_relocate2_nonredundant.bed", 'tepid': METHOD_DIR+SAMPLE_NAME+"_tepid_nonredundant.bed", 'teflon': METHOD_DIR+SAMPLE_NAME+"_teflon_nonredundant.bed", - 'jitterbug': METHOD_DIR+SAMPLE_NAME+"_jitterbug_nonredundant.bed" + 'jitterbug': METHOD_DIR+SAMPLE_NAME+"_jitterbug_nonredundant.bed", + 'tebreak': METHOD_DIR+SAMPLE_NAME+"_tebreak_nonredundant.bed" } ESSENTIAL_PATHS = { @@ -257,5 +269,11 @@ METHOD_DIR+SAMPLE_NAME+"_jitterbug_redundant.bed", METHOD_DIR+SAMPLE_NAME+"_jitterbug_malformed.bed", METHOD_DIR+"unfiltered/"+SAMPLE_NAME+".TE_insertions_paired_clusters.filtered.gff3" + ], + + 'tebreak': [ + METHOD_DIR+SAMPLE_NAME+"_tebreak_nonredundant.bed", + METHOD_DIR+SAMPLE_NAME+"_tebreak_redundant.bed", + METHOD_DIR+"unfiltered/"+SAMPLE_NAME+".tebreak.table.txt" ] } \ No newline at end of file diff --git a/scripts/TEMP/temp_post.py b/scripts/TEMP/temp_post.py index e000e4e..09a1d30 100644 --- a/scripts/TEMP/temp_post.py +++ b/scripts/TEMP/temp_post.py @@ -30,7 +30,7 @@ def main(): absence_bed = make_absence_bed(absence_summary, sample_name, out_dir) non_absent_ref_insertions = get_non_absent_ref_tes(te_gff, absence_bed, sample_name, out_dir, log) insertions += non_absent_ref_insertions - insertions = filter_insertions(insertions, chromosomes, acceptable_classes=config.ACCEPTABLE_INSERTION_SUPPORT_CLASSES, frequency_theshold=config.FREQUENCY_THRESHOLD) + insertions = filter_insertions(insertions, chromosomes, acceptable_classes=config.PARAMS["acceptable_insertion_support_classes"], frequency_theshold=config.PARAMS["frequency_threshold"]) if len(insertions) > 0: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="temp") insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="temp") diff --git a/scripts/TEMP/temp_run.py b/scripts/TEMP/temp_run.py index 7d93153..3cd520f 100644 --- a/scripts/TEMP/temp_run.py +++ b/scripts/TEMP/temp_run.py @@ -81,18 +81,20 @@ def run_temp_insertion(bam, scripts, consensus, te_bed, taxonomy, median_insert_ mccutils.log("temp","running TEMP non-reference insertion prediction", log=log) command = [ "bash", scripts+"TEMP_Insertion.sh", - "-x", str(config.TEMP_Insertion['x']), "-i", bam, "-s", scripts, "-r", consensus, "-t", te_bed, "-u", taxonomy, - "-m", str(config.TEMP_Insertion['m']), "-f", str(median_insert_size), "-c", str(threads), "-o", out ] + for param in config.PARAMS.keys(): + command.append(param) + command.append(str(config.PARAMS[param])) + mccutils.run_command(command, log=log) diff --git a/scripts/coverage/coverage.py b/scripts/coverage/coverage.py index 44f3b64..da5d617 100644 --- a/scripts/coverage/coverage.py +++ b/scripts/coverage/coverage.py @@ -57,11 +57,11 @@ def main(): genome_depth = get_genome_depth(nonte_bed, bam, run_id, coverage_out, log) edge_trim = 0 - if config.OMIT_EDGES: - if config.OMIT_EDGES_READ_LENGTH: + if config.PARAMS["omit_edges"]: + if config.PARAMS["omit_edges_read_length"]: edge_trim = mccutils.estimate_read_length(snakemake.input.fq1) else: - edge_trim = config.OMIT_EDGES_LENGTH + edge_trim = config.PARAMS["omit_edges_length"] te_names, all_coverage_files, uniq_coverage_files, avg_norm_te_depths = make_depth_table(te_seqs, bam, genome_depth, run_id, coverage_out, snakemake.output[0], log, trim_edges=edge_trim) make_plots(te_names, all_coverage_files, uniq_coverage_files, avg_norm_te_depths, genome_depth, snakemake.params.sample, coverage_out, trim_edges=edge_trim) diff --git a/scripts/jitterbug/jitterbug_post.py b/scripts/jitterbug/jitterbug_post.py index 5be8eda..5f84893 100644 --- a/scripts/jitterbug/jitterbug_post.py +++ b/scripts/jitterbug/jitterbug_post.py @@ -1,37 +1,52 @@ import os import sys import subprocess +import importlib.util as il +spec = il.spec_from_file_location("config", snakemake.params.config) +config = il.module_from_spec(spec) +sys.modules[spec.name] = config +spec.loader.exec_module(config) sys.path.append(snakemake.config['args']['mcc_path']) import scripts.mccutils as mccutils -import config.jitterbug.jitterbug_post as config +import scripts.output as output def main(): mccutils.log("jitterbug","jitterbug postprocessing") jitterbug_out = snakemake.input.jitterbug_out te_taxonomy = snakemake.input.taxonomy + reference_fasta = snakemake.input.reference_fasta out_dir = snakemake.params.out_dir log = snakemake.params.log sample_name = snakemake.params.sample_name chromosomes = snakemake.params.chromosomes.split(",") + status_log = snakemake.params.status_log out = snakemake.output.out - insertions = read_insertions( - jitterbug_out, - te_taxonomy, - chromosomes, - sample_name, - min_fwd_read_support=config.FILTER['MIN_FWD_READ_SUPPORT'], - min_rev_read_support=config.FILTER['MIN_REV_READ_SUPPORT'], - min_sr_support=config.FILTER['MIN_SPLIT_READ_SUPPORT'], - min_zygosity=config.FILTER['MIN_ZYGOSITY'] - ) - - if len(insertions) >= 1: - insertions = mccutils.make_redundant_bed(insertions, sample_name, out_dir, method="jitterbug") - mccutils.make_nonredundant_bed(insertions, sample_name, out_dir, method="jitterbug") + prev_steps_succeeded = mccutils.check_status_file(status_log) + + if prev_steps_succeeded: + insertions = read_insertions( + jitterbug_out, + te_taxonomy, + chromosomes, + sample_name, + min_fwd_read_support=config.FILTER['MIN_FWD_READ_SUPPORT'], + min_rev_read_support=config.FILTER['MIN_REV_READ_SUPPORT'], + min_sr_support=config.FILTER['MIN_SPLIT_READ_SUPPORT'], + min_zygosity=config.FILTER['MIN_ZYGOSITY'] + ) + + if len(insertions) >= 1: + insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="jitterbug") + insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="jitterbug") + output.write_vcf(insertions, reference_fasta, sample_name, "jitterbug", out_dir) + else: + mccutils.run_command(["touch", out_dir+"/"+sample_name+"_jitterbug_redundant.bed"]) + mccutils.run_command(["touch", out_dir+"/"+sample_name+"_jitterbug_nonredundant.bed"]) + else: mccutils.run_command(["touch", out_dir+"/"+sample_name+"_jitterbug_redundant.bed"]) mccutils.run_command(["touch", out_dir+"/"+sample_name+"_jitterbug_nonredundant.bed"]) @@ -54,7 +69,8 @@ def read_insertions(jitterbug_gff, taxonomy, chroms, sample_name, min_fwd_read_s line = line.replace("\n","") split_line = line.split("\t") if len(split_line) == 9: - insert = mccutils.Insertion() + # insert = mccutils.Insertion() + insert = output.Insertion(output.Jitterbug()) insert.chromosome = split_line[0] if insert.chromosome in chroms: @@ -85,30 +101,31 @@ def read_insertions(jitterbug_gff, taxonomy, chroms, sample_name, min_fwd_read_s if "predicted_superfam" in feat: te = feat.split("=")[1] family = te_family[te] + insert.family = family if "supporting_fwd_reads" in feat: - insert.jitterbug.supporting_fwd_reads = int(feat.split("=")[1]) + insert.support_info.support['supporting_fwd_reads'].value = int(feat.split("=")[1]) if "supporting_rev_reads" in feat: - insert.jitterbug.supporting_rev_reads = int(feat.split("=")[1]) + insert.support_info.support['supporting_rev_reads'].value = int(feat.split("=")[1]) if "softclipped_support" in feat: - insert.jitterbug.split_read_support = int(feat.split("=")[1]) + insert.support_info.support['softclipped_support'].value = int(feat.split("=")[1]) if "zygosity" in feat: - insert.jitterbug.zygosity = float(feat.split("=")[1]) + insert.support_info.support['zygosity'].value = float(feat.split("=")[1]) - insert.name = family+"|non-reference|"+sample_name+"|jitterbug|" + insert.name = family+"|non-reference|"+str(insert.support_info.support['zygosity'].value)+"|"+sample_name+"|jitterbug|" if sr: insert.name += "sr|" else: insert.name = "rp|" if ( - (insert.jitterbug.supporting_fwd_reads >= min_fwd_read_support) and - (insert.jitterbug.supporting_rev_reads >= min_rev_read_support) and - (insert.jitterbug.split_read_support >= min_sr_support) and - (insert.jitterbug.zygosity >= min_zygosity) + (insert.support_info.support['supporting_fwd_reads'].value >= min_fwd_read_support) and + (insert.support_info.support['supporting_rev_reads'].value >= min_rev_read_support) and + (insert.support_info.support['softclipped_support'].value >= min_sr_support) and + (insert.support_info.support['zygosity'].value >= min_zygosity) ): insertions.append(insert) diff --git a/scripts/jitterbug/jitterbug_run.py b/scripts/jitterbug/jitterbug_run.py index 6ca2c26..dadf9c1 100644 --- a/scripts/jitterbug/jitterbug_run.py +++ b/scripts/jitterbug/jitterbug_run.py @@ -1,9 +1,14 @@ import os import sys import subprocess +import traceback +import importlib.util as il +spec = il.spec_from_file_location("config", snakemake.params.config) +config = il.module_from_spec(spec) +sys.modules[spec.name] = config +spec.loader.exec_module(config) sys.path.append(snakemake.config['args']['mcc_path']) import scripts.mccutils as mccutils -import config.jitterbug.jitterbug_run as config def main(): reference_te_gff = snakemake.input.reference_tes @@ -13,33 +18,46 @@ def main(): script_dir = snakemake.params.script_dir sample_name = snakemake.params.sample_name log = snakemake.params.log + status_log = snakemake.params.status_log threads = snakemake.threads out = snakemake.output.out mccutils.log("jitterbug","Running jitterbug", log=log) - out_gff, config_file = run_jitterbug( - script_dir, - bam, - reference_te_gff, - sample_name, out_dir, - minmapq=config.RUN['MINMAPQ'], - min_cluster_size=config.RUN['MIN_CLUSTER_SIZE'], - threads=threads, - log=log - ) - - config_file = make_config( - config_file, - out_dir, - cluster_size=config.FILTER["CLUSTER_SIZE"], - span=config.FILTER['SPAN'], - int_size=config.FILTER['INT_SIZE'], - softclipped=config.FILTER['SOFTCLIPPED'], - pick_consistent=config.FILTER['PICK_CONSISTENT'] - ) - filter_jitterbug(script_dir, out_gff, config_file, sample_name, out, log=log) + try: + out_gff, config_file = run_jitterbug( + script_dir, + bam, + reference_te_gff, + sample_name, out_dir, + minmapq=config.RUN['MINMAPQ'], + min_cluster_size=config.RUN['MIN_CLUSTER_SIZE'], + threads=threads, + log=log + ) + + config_file = make_config( + config_file, + out_dir, + cluster_size=config.FILTER["CLUSTER_SIZE"], + span=config.FILTER['SPAN'], + int_size=config.FILTER['INT_SIZE'], + softclipped=config.FILTER['SOFTCLIPPED'], + pick_consistent=config.FILTER['PICK_CONSISTENT'] + ) + filter_jitterbug(script_dir, out_gff, config_file, sample_name, out, log=log) + + except Exception as e: + track = traceback.format_exc() + print(track, file=sys.stderr) + with open(log,"a") as l: + print(track, file=l) + mccutils.log("Jitterbug","Jitterbug run failed") + with open(status_log,"w") as l: + l.write("FAILED\n") + + mccutils.run_command(["touch", out]) def run_jitterbug(script_dir, bam, ref_te_gff, sample_name, out_dir, minmapq=15, min_cluster_size=2, threads=1, log=None): diff --git a/scripts/mccutils.py b/scripts/mccutils.py index 6b8381d..d4265e3 100644 --- a/scripts/mccutils.py +++ b/scripts/mccutils.py @@ -18,100 +18,6 @@ def __init__(self, message): pass -class Ngs_te_mapper: - def __init__(self): - self.support = -1 - -class Temp: - def __init__(self): - self.classification = "None" - self.support = -1 - self.frequency = -1 - self.junction1 = -1 - self.junction1Support = -1 - self.junction2 = -1 - self.junction2Support = -1 - self.fivePrimeSupport = -1 - self.threePrimeSupport = -1 - -class Telocate: - def __init__(self): - self.read_pair_support = -1 - -class Retroseq: - def __init__(self): - self.read_support = -1 - self.breakpoint_confidence = -1 - -class Relocate2: - def __init__(self): - self.left_support = -1 - self.right_support = -1 - self.left_junction = -1 - self.right_junction = -1 - -class Relocate: - def __init__(self): - self.left_support = -1 - self.right_support = -1 - -class Popoolationte2: - def __init__(self): - self.support_type = "" - self.frequency = -1 - self.added = False - -class Popoolationte: - def __init__(self): - self.support_type = "" - self.f_read_support = 0 - self.r_read_support = 0 - self.f_read_support_percent = 0 - self.r_read_support_percent = 0 - -class Tepid: - def __init__(self): - self.id = -1 - self.support = 0 - -class Teflon: - def __init__(self): - left_sc_support = False - right_sc_support = False - presence_reads = 0 - absence_reads = 0 - ambiguous_reads = 0 - allele_frequency = 0.0 - -class Jitterbug: - def __init__(self): - supporting_fwd_reads = 0 - supporting_rev_reads = 0 - split_read_support = 0 - zygosity = 0.0 - -class Insertion: - def __init__(self): - self.chromosome = "None" - self.start = -1 - self.end = -1 - self.name = "None" - self.type = "None" - self.strand = "." - self.family = "" - self.popoolationte = Popoolationte() - self.popoolationte2 = Popoolationte2() - self.relocate = Relocate() - self.relocate2 = Relocate2() - self.retroseq = Retroseq() - self.telocate = Telocate() - self.temp = Temp() - self.ngs_te_mapper = Ngs_te_mapper() - self.tepid = Tepid() - self.teflon = Teflon() - self.jitterbug = Jitterbug() - - def mkdir(indir, log=None): if os.path.isdir(indir) == False: os.mkdir(indir) @@ -369,192 +275,3 @@ def log(step, msg, log=None): lines += " &> " + log print(lines) - - -def make_redundant_bed(insertions, sample_name, out_dir, method="popoolationte"): - tmp_bed = out_dir+"/tmp.bed" - - insertion_dict = {} - out_inserts = [] - malformed_inserts = [] - properly_formed_inserts = [] - for insert in insertions: - if insert.start <= insert.end: - insertion_dict[ "_".join([insert.chromosome, str(insert.start-1), str(insert.end), insert.name, "0", insert.strand])] = insert - properly_formed_inserts.append(insert) - else: - malformed_inserts.append(insert) - - insertions = properly_formed_inserts - # write malformed predictions to separate bed file - if len(malformed_inserts) > 0: - malformed_bed = out_dir+"/"+sample_name+"_"+method+"_malformed.bed" - with open(malformed_bed,"w") as out: - for insert in malformed_inserts: - out_line = "\t".join([insert.chromosome, str(insert.start-1), str(insert.end), insert.name, "0", insert.strand]) - out.write(out_line+"\n") - - with open(tmp_bed, "w") as out: - for insert in insertions: - out_line = "\t".join([insert.chromosome, str(insert.start-1), str(insert.end), insert.name, "0", insert.strand]) - out.write(out_line+"\n") - - sorted_bed = out_dir+"/sorted.bed" - command = ["bedtools", "sort", "-i", tmp_bed] - run_command_stdout(command, sorted_bed) - - redundant_bed = out_dir+"/"+sample_name+"_"+method+"_redundant.bed" - with open(redundant_bed, "w") as outbed: - header = 'track name="'+sample_name+'_'+method+'" description="'+sample_name+'_'+method+'"\n' - outbed.write(header) - with open(sorted_bed, "r") as inbed: - for x, line in enumerate(inbed): - - # outputs inserts in sorted order with unique number added to name - key = line.replace("\t","_") - key = key.replace("\n","") - insert = insertion_dict[key] - insert.name += str(x+1) - out_inserts.append(insert) - - # write to bed with unique number added to name - split_line = line.split("\t") - split_line[3] += str(x+1) - line = "\t".join(split_line) - outbed.write(line) - - remove(tmp_bed) - remove(sorted_bed) - - return out_inserts - -def make_nonredundant_bed(insertions, sample_name, out_dir, method="popoolationte"): - uniq_inserts = {} - - for insert in insertions: - key = "_".join([insert.chromosome, str(insert.start), str(insert.end), insert.type]) - if key not in uniq_inserts.keys(): - uniq_inserts[key] = insert - else: - ## method specific way to determine which duplicate to keep - if method == "popoolationte": - if (uniq_inserts[key].popoolationte.f_read_support + uniq_inserts[key].popoolationte.r_read_support) > (insert.popoolationte.f_read_support + insert.popoolationte.r_read_support): - uniq_inserts[key] = insert - - elif method == "popoolationte2": - if (uniq_inserts[key].popoolationte2.frequency) > (insert.popoolationte2.frequency): - uniq_inserts[key] = insert - - elif method == "relocate": - if (uniq_inserts[key].relocate.left_support + uniq_inserts[key].relocate.right_support) < (insert.relocate.left_support + insert.relocate.right_support): - uniq_inserts[key] = insert - - elif method == "relocate2": - if (uniq_inserts[key].relocate2.left_support + uniq_inserts[key].relocate2.right_support) < (insert.relocate2.left_support + insert.relocate2.right_support): - uniq_inserts[key] = insert - - elif method == "retroseq": - if uniq_inserts[key].retroseq.read_support > insert.retroseq.read_support: - uniq_inserts[key] = insert - - elif method == "te-locate": - if uniq_inserts[key].telocate.read_pair_support > insert.telocate.read_pair_support: - uniq_inserts[key] = insert - - elif method == "temp": - if insert.temp.support > uniq_inserts[key].temp.support: - uniq_inserts[key] = insert - - elif method == "ngs_te_mapper": - if insert.ngs_te_mapper.support > uniq_inserts[key].ngs_te_mapper.support: - uniq_inserts[key] = insert - - elif method == "tepid": - if insert.tepid.support > uniq_inserts[key].tepid.support: - uniq_inserts[key] = insert - - elif method == "teflon": - if insert.teflon.presence_reads > uniq_inserts[key].teflon.presence_reads: - uniq_inserts[key] = insert - - elif method == "jitterbug": - if insert.jitterbug.split_read_support > uniq_inserts[key].jitterbug.split_read_support: - uniq_inserts[key] = insert - - tmp_bed = out_dir+"/tmp.bed" - with open(tmp_bed, "w") as outbed: - for key in uniq_inserts.keys(): - insert = uniq_inserts[key] - out_line = "\t".join([insert.chromosome, str(insert.start-1), str(insert.end), insert.name, "0", insert.strand]) - outbed.write(out_line+"\n") - - sorted_bed = out_dir+"/sorted.bed" - command = ["bedtools", "sort", "-i", tmp_bed] - run_command_stdout(command, sorted_bed) - - nonredundant_bed = out_dir+"/"+sample_name+"_"+method+"_nonredundant.bed" - with open(sorted_bed, "r") as inbed: - with open(nonredundant_bed, "w") as outbed: - header = 'track name="'+sample_name+'_'+method+'" description="'+sample_name+'_'+method+'"\n' - outbed.write(header) - for line in inbed: - outbed.write(line) - - out_inserts = [] - with open(nonredundant_bed, "r") as outbed: - for line in outbed: - if "track name=" not in line and "description=" not in line: - split_line = line.split("\t") - ref_type = split_line[3].split("|")[1] - key = "_".join([split_line[0], str(int(split_line[1])+1), split_line[2], ref_type]) - out_inserts.append(uniq_inserts[key]) - remove(tmp_bed) - remove(sorted_bed) - - return out_inserts - -def write_vcf(inserts, genome_fasta, sample_name, method, out_dir): - contig_lengths = {} - with open(genome_fasta+".fai","r") as fai: - for line in fai: - split_line = line.split("\t") - contig_lengths[split_line[0]] = split_line[1] - - - contigs_with_inserts = [] - for insert in inserts: - if insert.chromosome not in contigs_with_inserts: - contigs_with_inserts.append(insert.chromosome) - - - out_vcf = out_dir+"/"+sample_name+"_"+method+"_nonredundant_non-reference.vcf" - with open(out_vcf, "w") as vcf: - today = date.today() - today_date = today.strftime("%Y-%m-%d") - meta = [ - "##fileformat=VCFv4.2", - "##fileDate="+today_date, - "##source=McClintock", - "##reference="+genome_fasta - ] - for contig in contigs_with_inserts: - meta.append("##contig=") - - meta.append('##INFO=') - meta.append('##INFO=') - meta.append('##INFO=') - meta.append('##INFO=') - - for line in meta: - vcf.write(line+"\n") - header = "\t".join(["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO"]) - vcf.write(header+"\n") - - ##TODO get ref sequence - ref = "N" - for insert in inserts: - vals = [insert.chromosome, str(insert.start), ".", ref, "", ".", "."] - info = ["END="+str(insert.end),"SVTYPE=INS", "STRAND="+insert.strand, "FAMILY="+insert.family] - - out_line = ("\t".join(vals)) + (";".join(info)) - vcf.write(out_line+"\n") diff --git a/scripts/ngs_te_mapper/ngs_te_mapper_post.py b/scripts/ngs_te_mapper/ngs_te_mapper_post.py index 5a45e84..47d92e2 100644 --- a/scripts/ngs_te_mapper/ngs_te_mapper_post.py +++ b/scripts/ngs_te_mapper/ngs_te_mapper_post.py @@ -28,7 +28,7 @@ def main(): succeeded = mccutils.check_status_file(status_log) if succeeded: mccutils.log("ngs_te_mapper","processing ngs_te_mapper results", log=log) - insertions = read_insertions(raw_bed, chromosomes, sample_name, out_dir, min_read_cutoff=config.MIN_READ_SUPPORT) + insertions = read_insertions(raw_bed, chromosomes, sample_name, out_dir, min_read_cutoff=config.PARAMS['min_read_support']) if len(insertions) > 0: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="ngs_te_mapper") diff --git a/scripts/ngs_te_mapper/ngs_te_mapper_run.py b/scripts/ngs_te_mapper/ngs_te_mapper_run.py index 54cb754..8463796 100644 --- a/scripts/ngs_te_mapper/ngs_te_mapper_run.py +++ b/scripts/ngs_te_mapper/ngs_te_mapper_run.py @@ -43,7 +43,7 @@ def main(): if snakemake.params.raw_fq2 == "None": is_paired = False - command = ['Rscript', "--vanilla", script_dir+"/ngs_te_mapper.R", "genome="+reference_fasta, "teFile="+consensus_fasta, "tsd="+str(config.MAX_TSD), "thread="+str(threads), "output="+out_dir, "sourceCodeFolder="+script_dir] + command = ['Rscript', "--vanilla", script_dir+"/ngs_te_mapper.R", "genome="+reference_fasta, "teFile="+consensus_fasta, "tsd="+str(config.PARAMS["tsd="]), "thread="+str(threads), "output="+out_dir, "sourceCodeFolder="+script_dir] if is_paired: command.append("sample="+fastq1+";"+fastq2) diff --git a/scripts/ngs_te_mapper2/ngs_te_mapper2_run.py b/scripts/ngs_te_mapper2/ngs_te_mapper2_run.py index 3d09949..aa6c385 100644 --- a/scripts/ngs_te_mapper2/ngs_te_mapper2_run.py +++ b/scripts/ngs_te_mapper2/ngs_te_mapper2_run.py @@ -49,19 +49,18 @@ def main(): 'python', script_dir+"/ngs_te_mapper2.py", "-r", reference_fasta, "-l", consensus_fasta, - "-w", str(config.WINDOW), - "--min_mapq", str(config.MIN_MAPQ), - "--tsd_max", str(config.MAX_TSD), - "--gap_max", str(config.MAX_GAP), "-t", str(threads), "-o", out_dir, "--keep_files", "-p", sample_name, "-a", locations, - "--min_af", str(config.MIN_ALLELE_FREQUENCY), - "-f" ] + for key in config.PARAMS.keys(): + command.append(key) + command.append(str(config.PARAMS[key])) + + command.append("-f") if is_paired: command.append(fastq1+","+fastq2) else: diff --git a/scripts/output.py b/scripts/output.py index e03e774..bad049d 100644 --- a/scripts/output.py +++ b/scripts/output.py @@ -192,20 +192,41 @@ def __init__(self): self.redundancy_filter = RedundancyFilter(["presence_reads"], "value") -################################################# -## TODO convert to proper format for VCF creation -class Tepid: +class Tebreak: def __init__(self): - self.id = -1 - self.support = 0 + self.support = { + "five_p_elt_match" : Info("FIVE_P_ELT_MATCH", "Fraction of bases matched to reference for inserted sequence on insertion seqment of 5' supporting contig", 0.0, "Float"), + "three_p_elt_match" : Info("THREE_P_ELT_MATCH", "Fraction of bases matched to reference for inserted sequence on insertion seqment of 3' supporting contig", 0.0, "Float"), + "five_p_genome_match" : Info("FIVE_P_GENOME_MATCH", "Fraction of bases matched to reference genome on genomic segment of 5' supporting contig", 0.0, "Float"), + "three_p_genome_match" : Info("THREE_P_GENOME_MATCH", "Fraction of bases matched to reference genome on genomic segment of 3' supporting contig", 0.0, "Float"), + "split_reads_5prime" : Info("SPLIT_READS_5PRIME", "Number of split reads supporting 5' end of the insertion", 0, "Integer"), + "split_reads_3prime" : Info("SPLIT_READS_3PRIME", "Number of split reads supporting 3' end of the insertion", 0, "Integer"), + "remapped_discordant" : Info("REMAPPED_DISCORDANT", "Number of discordant read ends re-mappable to insertion reference sequence", 0, "Integer"), + "remap_disc_fraction" : Info("REMAP_DISC_FRACTION", "The proportion of remapped discordant reads mapping to the reference insertion sequence", 0.0, "Float"), + "remapped_splitreads" : Info("REMAPPED_SPLITREADS", "Number of split reads re-mappable to insertion reference sequence", 0, "Integer"), + "remap_split_fraction" : Info("REMAP_SPLIT_FRACTION", "The proportion of remapped split reads mapping to the reference insertion sequence", 0.0, "Float") + } + + self.redundancy_filter = RedundancyFilter(["split_reads_5prime", "split_reads_3prime"], "sum") class Jitterbug: def __init__(self): - supporting_fwd_reads = 0 - supporting_rev_reads = 0 - split_read_support = 0 - zygosity = 0.0 -################################################ + self.support = { + "supporting_fwd_reads" : Info("SUPPORTING_FWD_READS", "Reads supporting the forward junction of the insertion", 0, "Integer"), + "supporting_rev_reads" : Info("SUPPORTING_REV_READS", "Reads supporting the reverse junction of the insertion", 0, "Integer"), + "softclipped_support" : Info("SOFTCLIPPED_SUPPORT", "Number of softclipped reads that support the breakpoint", 0, "Integer"), + "zygosity" : Info("ZYGOSITY", "The ratio of clipped and properly mapped reads at the insertion site", 0.0, "Float") + } + + self.redundancy_filter = RedundancyFilter(["supporting_fwd_reads", "supporting_rev_reads"], "sum") + +class Tepid: + def __init__(self): + self.id = -1 + self.support = { + "supporting_reads" : Info("SUPPORTING_READS", "Number of reads supporting the insertion", 0, "Integer") + } + self.redundancy_filter = RedundancyFilter(["supporting_reads"], "value") def make_redundant_bed(insertions, sample_name, out_dir, method="popoolationte"): diff --git a/scripts/popoolationte/popoolationte_post.py b/scripts/popoolationte/popoolationte_post.py index 5a9104b..27e729e 100644 --- a/scripts/popoolationte/popoolationte_post.py +++ b/scripts/popoolationte/popoolationte_post.py @@ -25,7 +25,7 @@ def main(): succeeded = mccutils.check_status_file(status_log) if succeeded: - insertions = read_insertions(popoolationte_out, sample_name, chromosomes, require_both_end_support=config.REQUIRE_BOTH_END_SUPPORT, percent_read_support_threshold=config.PERCENT_READ_SUPPORT_THRESHOLD) + insertions = read_insertions(popoolationte_out, sample_name, chromosomes, require_both_end_support=config.PARAMS["require_both_end_support"], percent_read_support_threshold=config.PARAMS["percent_read_support_threshold"]) if len(insertions) >= 1: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="popoolationte") insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="popoolationte") diff --git a/scripts/popoolationte/popoolationte_run.py b/scripts/popoolationte/popoolationte_run.py index b2d11d9..9e59360 100644 --- a/scripts/popoolationte/popoolationte_run.py +++ b/scripts/popoolationte/popoolationte_run.py @@ -57,14 +57,9 @@ def main(): max_dist, known_inserts, script_dir, - out_dir, - log=log, - identify_min_count=config.IDENTIFY_TE_INSERTSITES["min-count"], - identify_min_qual=config.IDENTIFY_TE_INSERTSITES["min-map-qual"], - crosslink_site_shift=config.CROSSLINK_TE_SITES['single-site-shift'], - update_te_inserts_site_shift=config.UPDATE_TEINSERTS_WITH_KNOWNTES['single-site-shift'], - estimate_polymorphism_min_qual=config.ESTIMATE_POLYMORPHISM['min-map-qual'], - filter_min_count=config.FILTER['min-count']) + out_dir, + config.PARAMS, + log=log) mccutils.check_file_exists(snakemake.output[0]) @@ -145,14 +140,8 @@ def make_known_insert_file(gff, out): return out_file -def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, ref_inserts, script_dir, out_dir, - log=None, - identify_min_count=3, - identify_min_qual=15, - crosslink_site_shift=100, - update_te_inserts_site_shift=100, - estimate_polymorphism_min_qual=15, - filter_min_count=5): +def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, ref_inserts, script_dir, out_dir, params, + log=None): mccutils.log("popoolationte","identify-te-insertsites.pl") insert_sites = out_dir+"te-fwd-rev.txt" @@ -160,8 +149,14 @@ def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, re "--input", sam, "--te-hierarchy-file", taxon, "--te-hierarchy-level", "family", - "--narrow-range", str(read_len), "--min-count", str(identify_min_count), - "--min-map-qual", str(identify_min_qual), "--output", insert_sites, "--insert-distance", str(insert_size), "--read-length", str(read_len)] + "--narrow-range", str(read_len), + "--output", insert_sites, + "--insert-distance", str(insert_size), + "--read-length", str(read_len)] + + for param in params["identify-te-insertsites.pl"].keys(): + command.append(param) + command.append(str(params["identify-te-insertsites.pl"][param])) mccutils.run_command(command, log=log) mccutils.log("popoolationte","genomic-N-2gtf.pl") @@ -176,10 +171,13 @@ def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, re "--min-dist", str(read_len), "--max-dist", str(max_dist), "--output", crosslinked, - "--single-site-shift", str(crosslink_site_shift), "--poly-n", poly_n, "--te-hierarchy", taxon, "--te-hier-level", "family"] + + for param in params["crosslink-te-sites.pl"].keys(): + command.append(param) + command.append(str(params["crosslink-te-sites.pl"][param])) mccutils.run_command(command, log=log) mccutils.log("popoolationte","update-teinserts-with-knowntes.pl") @@ -190,8 +188,11 @@ def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, re "--te-hierarchy-file", taxon, "--te-hierarchy-level", "family", "--max-dist", str(max_dist), - "--te-insertions", crosslinked, - "--single-site-shift", str(update_te_inserts_site_shift)] + "--te-insertions", crosslinked] + + for param in params["update-teinserts-with-knowntes.pl"].keys(): + command.append(param) + command.append(str(params["update-teinserts-with-knowntes.pl"][param])) mccutils.run_command(command, log=log) mccutils.log("popoolationte","estimate-polymorphism.pl") @@ -201,8 +202,11 @@ def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, re "--te-insert-file", updated_inserts, "--te-hierarchy-file", taxon, "--te-hierarchy-level", "family", - "--min-map-qual", str(estimate_polymorphism_min_qual), "--output", te_polymorphism] + + for param in params["estimate-polymorphism.pl"].keys(): + command.append(param) + command.append(str(params["estimate-polymorphism.pl"][param])) mccutils.run_command(command, log=log) mccutils.log("popoolationte","filter-teinserts.pl") @@ -210,8 +214,11 @@ def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, re command = ["perl", script_dir+"filter-teinserts.pl", "--te-insertions", te_polymorphism, "--output", filtered, - "--discard-overlapping", - "--min-count", str(filter_min_count)] + "--discard-overlapping"] + + for param in params["filter-teinserts.pl"].keys(): + command.append(param) + command.append(str(params["filter-teinserts.pl"][param])) mccutils.run_command(command, log=log) if __name__ == "__main__": diff --git a/scripts/popoolationte2/popoolationte2_post.py b/scripts/popoolationte2/popoolationte2_post.py index 3c39320..cd721e4 100644 --- a/scripts/popoolationte2/popoolationte2_post.py +++ b/scripts/popoolationte2/popoolationte2_post.py @@ -29,7 +29,7 @@ def main(): if prev_step_succeeded: ref_tes = get_ref_tes(te_gff, taxonomy, chromosomes) - insertions = read_insertions(te_predictions, ref_tes, chromosomes, sample_name, both_end_support_needed=config.REQUIRE_BOTH_END_SUPPORT, support_threshold=config.FREQUENCY_THRESHOLD) + insertions = read_insertions(te_predictions, ref_tes, chromosomes, sample_name, both_end_support_needed=config.PARAMS["require_both_end_support"], support_threshold=config.PARAMS["frequency_threshold"]) if len(insertions) >= 1: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="popoolationte2") insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="popoolationte2") diff --git a/scripts/popoolationte2/popoolationte2_run.py b/scripts/popoolationte2/popoolationte2_run.py index 291cf7b..8b2905c 100644 --- a/scripts/popoolationte2/popoolationte2_run.py +++ b/scripts/popoolationte2/popoolationte2_run.py @@ -28,12 +28,12 @@ def main(): try: mccutils.mkdir(out_dir+"/tmp") taxonomy = format_taxonomy(taxonomy, out_dir) - ppileup = popoolationte2_ppileup(jar, config.ppileup, bam, taxonomy, out_dir, log=log) - ppileup = popoolationte2_subsample(jar, config.subsampleppileup, ppileup, out_dir, log=log) - signatures = popoolationte2_signatures(jar, config.identifySignatures, ppileup, out_dir, log=log) - signatures = popoolationte2_strand(jar, config.updateStrand, signatures, bam, taxonomy, out_dir, log=log) + ppileup = popoolationte2_ppileup(jar, config.PARAMS["ppileup"], bam, taxonomy, out_dir, log=log) + ppileup = popoolationte2_subsample(jar, config.PARAMS["subsampleppileup"], ppileup, out_dir, log=log) + signatures = popoolationte2_signatures(jar, config.PARAMS["identifySignatures"], ppileup, out_dir, log=log) + signatures = popoolationte2_strand(jar, config.PARAMS["updateStrand"], signatures, bam, taxonomy, out_dir, log=log) signatures = popoolationte2_frequency(jar, ppileup, signatures, out_dir, log=log) - te_insertions = popoolationte2_pairup(jar, config.pairupSignatures, signatures, ref_fasta, taxonomy, out_dir, log=log) + te_insertions = popoolationte2_pairup(jar, config.PARAMS["pairupSignatures"], signatures, ref_fasta, taxonomy, out_dir, log=log) mccutils.remove(out_dir+"/tmp") mccutils.check_file_exists(snakemake.output[0]) @@ -70,14 +70,18 @@ def format_taxonomy(taxon, out): def popoolationte2_ppileup(jar, params, bam, taxon, out, log=None): mccutils.log("popoolationte2","making physical pileup file", log=log) ppileup = out+"/output.ppileup.gz" - mccutils.run_command(['java', "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "ppileup", + command = ['java', "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "ppileup", "--bam", bam, "--hier", taxon, - "--map-qual", str(params["map-qual"]), - "--sr-mindist", str(params['sr-mindist']), - "--id-up-quant", str(params['id-up-quant']), - "--output", ppileup], log=log) + "--output", ppileup] + + for param in params.keys(): + command.append(param) + command.append(str(params[param])) + + mccutils.run_command(command, log=log) + return ppileup def popoolationte2_subsample(jar, params, ppileup, out, log=None): @@ -85,11 +89,15 @@ def popoolationte2_subsample(jar, params, ppileup, out, log=None): if params["run"]: mccutils.log("popoolationte2","subsampling physical pileup file to uniform coverage", log=log) command = ["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "subsampleppileup", - "--ppileup", ppileup, - "--target-coverage", str(params['target-coverage']), + "--ppileup", ppileup, "--output", out_ppileup] - if params['with-replace']: - command.append("--with-replace") + + for param in params.keys(): + if param == True and param != "run": + command.append(param) + elif param != False: + command.append(param) + command.append(str(params[param])) mccutils.run_command(command, log=log) else: @@ -101,29 +109,33 @@ def popoolationte2_subsample(jar, params, ppileup, out, log=None): def popoolationte2_signatures(jar, params, ppileup, out, log=None): mccutils.log("popoolationte2","identifying signatures of TE insertions", log=log) signatures = out+"/output.signatures" - mccutils.run_command(["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "identifySignatures", + command = ["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "identifySignatures", "--ppileup", ppileup, "--mode", "separate", - "--output", signatures, - "--signature-window", params['signature-window'], - "--min-valley", params['min-valley'], - "--chunk-distance", str(params['chunk-distance']), - "--min-count", str(params['min-count'])], log=log) + "--output", signatures] + + for param in params.keys(): + command.append(param) + command.append(str(params[param])) + + mccutils.run_command(command, log=log) return signatures def popoolationte2_strand(jar, params, signatures, bam, taxon, out, log=None): mccutils.log("popoolationte2", "estimating strand of TEs", log=log) out_sig = out+"/output.stranded.signatures" - mccutils.run_command(["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "updateStrand", + command = ["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "updateStrand", "--signature", signatures, "--output", out_sig, "--bam", bam, - "--hier", taxon, - "--max-disagreement", str(params["max-disagreement"]), - "--sr-mindist", str(params['sr-mindist']), - "--map-qual", str(params['map-qual']), - "--id-up-quant", str(params['id-up-quant'])], log=log) + "--hier", taxon] + + for param in params.keys(): + command.append(param) + command.append(str(params[param])) + + mccutils.run_command(command, log=log) return out_sig @@ -140,15 +152,18 @@ def popoolationte2_frequency(jar, ppileup, signatures, out, log=None): def popoolationte2_pairup(jar, params, signatures, ref, taxon, out, log=None): mccutils.log("popoolationte2","generating raw TE insertion predictions", log=log) te_insertions = out+"/teinsertions.txt" - mccutils.run_command(["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "pairupSignatures", + command = ["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "pairupSignatures", "--signature", signatures, "--ref-genome", ref, "--hier", taxon, "--output-detail", "medium", - "--min-distance", str(params['min-distance']), - "--max-distance", str(params['max-distance']), - "--max-freq-diff", str(params['max-freq-diff']), - "--output", te_insertions], log=log) + "--output", te_insertions] + + for param in params.keys(): + command.append(param) + command.append(str(params[param])) + + mccutils.run_command(command, log=log) return te_insertions diff --git a/scripts/preprocessing/map_reads.py b/scripts/preprocessing/map_reads.py index ae18f45..dde34f8 100644 --- a/scripts/preprocessing/map_reads.py +++ b/scripts/preprocessing/map_reads.py @@ -20,7 +20,7 @@ def main(): if eval(snakemake.config['args']['save_comments']): command.append("-C") - command += ["-t", str(snakemake.threads), "-R", "@RG\\tID:"+snakemake.params.sample+"\\tSM:"+snakemake.params.sample, snakemake.input.ref, snakemake.input.fq1] + command += ["-t", str(snakemake.threads), "-M", "-R", "@RG\\tID:"+snakemake.params.sample+"\\tSM:"+snakemake.params.sample, snakemake.input.ref, snakemake.input.fq1] if snakemake.config['in']['fq2'] != "None": command.append(snakemake.input.fq2) diff --git a/scripts/preprocessing/repeatmask.py b/scripts/preprocessing/repeatmask.py index ff572ca..ede21b1 100644 --- a/scripts/preprocessing/repeatmask.py +++ b/scripts/preprocessing/repeatmask.py @@ -17,9 +17,9 @@ def main(): output = snakemake.output.rm_out - mccutils.log("processing","Running RepeatMasker for RelocaTE2", log=log) + mccutils.log("processing","Running RepeatMasker", log=log) run_repeatmasker(reference, ref_name, te_seqs, threads, log, output, out_dir) - mccutils.log("processing","Repeatmasker for RelocaTE2 complete") + mccutils.log("processing","Repeatmasker complete") diff --git a/scripts/preprocessing/sam_to_bam.py b/scripts/preprocessing/sam_to_bam.py index f8ef6a4..9152439 100644 --- a/scripts/preprocessing/sam_to_bam.py +++ b/scripts/preprocessing/sam_to_bam.py @@ -29,9 +29,9 @@ def main(): try: - command = ["samtools", "sort", "-@", str(snakemake.threads), snakemake.output.tmp_bam, snakemake.output.bam.replace(".bam", "")] + command = ["samtools", "sort", "-@", str(snakemake.threads), snakemake.output.tmp_bam, snakemake.output.tmp2_bam.replace(".bam", "")] mccutils.run_command(command, log=log) - mccutils.check_file_exists(snakemake.output.bam) + mccutils.check_file_exists(snakemake.output.tmp2_bam) except Exception as e: track = traceback.format_exc() @@ -39,6 +39,27 @@ def main(): print("ERROR...falied to sort the bam file using samtools sort...bam file:", snakemake.output.tmp_bam, file=sys.stderr) sys.exit(1) + try: + command = ["samtools", "index", snakemake.output.tmp2_bam] + mccutils.run_command(command, log=log) + + except Exception as e: + track = traceback.format_exc() + print(track, file=sys.stderr) + print("ERROR...falied to index the bam file using samtools index...bam file:", snakemake.output.tmp2_bam, file=sys.stderr) + sys.exit(1) + + try: + command = ['picard', "MarkDuplicates", "-I", snakemake.output.tmp2_bam, "-O", snakemake.output.bam, "-M", snakemake.output.metrics] + mccutils.run_command(command, log=log) + mccutils.check_file_exists(snakemake.output.bam) + + except Exception as e: + track = traceback.format_exc() + print(track, file=sys.stderr) + print("ERROR...falied to mark duplicates with picard for the bam file ...bam file:", snakemake.output.tmp2_bam, file=sys.stderr) + sys.exit(1) + try: command = ["samtools", "index", snakemake.output.bam] mccutils.run_command(command, log=log) diff --git a/scripts/preprocessing/setup_reads.py b/scripts/preprocessing/setup_reads.py index 45cea45..95c36d3 100644 --- a/scripts/preprocessing/setup_reads.py +++ b/scripts/preprocessing/setup_reads.py @@ -48,11 +48,9 @@ def main(): if "trimgalore" in methods: mccutils.log("processing", "running trim_galore", log=log) if fq2 == "None": - flags = trimgalore.SINGLE_END_FLAGS - trimmedfq = run_trim_galore(fq1, run_id, log, mcc_out, cores=processors, flags=flags) + trimmedfq = run_trim_galore(fq1, run_id, log, mcc_out, cores=processors, params=trimgalore.PARAMS["single_end"]) else: - flags = trimgalore.PAIRED_END_FLAGS - trimmedfq, trimmedfq2 = run_trim_galore(fq1, run_id, log, mcc_out, fq2=fq2, cores=processors, flags=flags) + trimmedfq, trimmedfq2 = run_trim_galore(fq1, run_id, log, mcc_out, fq2=fq2, cores=processors, params=trimgalore.PARAMS["paired_end"]) run_multiqc(mcc_out+"/results/trimgalore/") @@ -138,13 +136,18 @@ def check_fastqs(fq1, fq2, out, min_length=30, log=None): has_valid_read_ids(fq1, fq2, log=log) -def run_trim_galore(fq1, run_id, log, out, fq2=None, cores=1, flags=[]): +def run_trim_galore(fq1, run_id, log, out, fq2=None, cores=1, params={}): mccutils.mkdir(out+"/results/") - command = ['trim_galore'] + flags + ["-j", str(cores), "-o", out+"/results/trimgalore"] + command = ['trim_galore', "-j", str(cores), "-o", out+"/results/trimgalore"] + + for param in params.keys(): + if params[param] == True: + command.append(param) + if fq2 is None: command.append(fq1) else: - command += ["--paired", fq1, fq2] + command += [fq1, fq2] mccutils.run_command(command, log=log) diff --git a/scripts/relocaTE/relocate_post.py b/scripts/relocaTE/relocate_post.py index 6db4f35..e365393 100644 --- a/scripts/relocaTE/relocate_post.py +++ b/scripts/relocaTE/relocate_post.py @@ -28,7 +28,15 @@ def main(): prev_step_succeeded = mccutils.check_status_file(status_log) if prev_step_succeeded: - insertions = get_insertions(relocate_gff, sample_name, chromosomes, ref_l_threshold=config.REF_LEFT_THRESHOLD, ref_r_threshold=config.REF_RIGHT_THRESHOLD, nonref_l_threshold=config.NONREF_LEFT_THRESHOLD, nonref_r_threshold=config.NONREF_RIGHT_THRESHOLD) + insertions = get_insertions( + relocate_gff, + sample_name, + chromosomes, + ref_l_threshold=config.PARAMS["ref_left_threshold"], + ref_r_threshold=config.PARAMS["ref_right_threshold"], + nonref_l_threshold=config.PARAMS["nonref_left_threshold"], + nonref_r_threshold=config.PARAMS["nonref_right_threshold"] + ) insertions = set_ref_orientations(insertions, te_gff) if len(insertions) >= 1: diff --git a/scripts/relocaTE/relocate_run.py b/scripts/relocaTE/relocate_run.py index 57037e0..3e61169 100644 --- a/scripts/relocaTE/relocate_run.py +++ b/scripts/relocaTE/relocate_run.py @@ -73,13 +73,11 @@ def main(): "-d", fq_dir, "-g", reference_fasta, "-o", ".", - "-r", annotation, - "-l", str(config.RELOCATE['l']), - "-m", str(config.RELOCATE['m']), - "-bm", str(config.RELOCATE['bm']), - "-bt", str(config.RELOCATE['bt']), - "-f", str(config.RELOCATE['f'])] + "-r", annotation] + for param in config.PARAMS.keys(): + command.append(param) + command.append(str(config.PARAMS[param])) if is_paired: command += ["-1", fq1_uniq_id, "-2", fq2_uniq_id] diff --git a/scripts/relocaTE2/relocate2_post.py b/scripts/relocaTE2/relocate2_post.py index 8b3b2d2..4822d70 100644 --- a/scripts/relocaTE2/relocate2_post.py +++ b/scripts/relocaTE2/relocate2_post.py @@ -33,19 +33,19 @@ def main(): sample_name, chromosomes, insert_type="ref", - l_support_threshold=config.REF_LEFT_SUPPORT_THRESHOLD, - r_support_threshold=config.REF_RIGHT_SUPPORT_THRESHOLD, - l_junction_threshold=config.REF_LEFT_JUNCTION_THRESHOLD, - r_junction_threshold=config.REF_RIGHT_JUNCTION_THRESHOLD) + l_support_threshold=config.PARAMS["ref_left_support_threshold"], + r_support_threshold=config.PARAMS["ref_right_support_threshold"], + l_junction_threshold=config.PARAMS["ref_left_junction_threshold"], + r_junction_threshold=config.PARAMS["ref_right_junction_threshold"]) nonref_insertions = get_insertions(nonref_gff, sample_name, chromosomes, insert_type="nonref", - l_support_threshold=config.NONREF_LEFT_SUPPORT_THRESHOLD, - r_support_threshold=config.NONREF_RIGHT_SUPPORT_THRESHOLD, - l_junction_threshold=config.NONREF_LEFT_JUNCTION_THRESHOLD, - r_junction_threshold=config.NONREF_RIGHT_JUNCTION_THRESHOLD) + l_support_threshold=config.PARAMS["nonref_left_support_threshold"], + r_support_threshold=config.PARAMS["nonref_right_support_threshold"], + l_junction_threshold=config.PARAMS["nonref_left_junction_threshold"], + r_junction_threshold=config.PARAMS["nonref_right_junction_threshold"]) ref_insertions = fix_ref_te_names(ref_insertions, rm_out, sample_name) diff --git a/scripts/relocaTE2/relocate2_run.py b/scripts/relocaTE2/relocate2_run.py index e3f60ab..9d3e1be 100644 --- a/scripts/relocaTE2/relocate2_run.py +++ b/scripts/relocaTE2/relocate2_run.py @@ -74,14 +74,13 @@ def main(): "--run", "-v", "4", "-c", str(threads), - "--aligner", config.RELOCATE2["aligner"], - "--len_cut_match", str(config.RELOCATE2["len_cut_match"]), - "--len_cut_trim", str(config.RELOCATE2["len_cut_trim"]), - "--mismatch", str(config.RELOCATE2["mismatch"]), - "--mismatch_junction", str(config.RELOCATE2["mismatch_junction"]), "-d", fq_dir ] + for param in config.PARAMS.keys(): + command.append(param) + command.append(str(config.PARAMS[param])) + if is_paired: command += ["-1", "_1", "-2", "_2"] diff --git a/scripts/retroseq/retroseq_post.py b/scripts/retroseq/retroseq_post.py index 95190db..85ce55c 100644 --- a/scripts/retroseq/retroseq_post.py +++ b/scripts/retroseq/retroseq_post.py @@ -26,7 +26,7 @@ def main(): prev_steps_succeeded = mccutils.check_status_file(status_log) if prev_steps_succeeded: - insertions = read_insertions(retroseq_out, sample_name, chromosomes, support_threshold=config.READ_SUPPORT_THRESHOLD, breakpoint_threshold=config.BREAKPOINT_CONFIDENCE_THRESHOLD) + insertions = read_insertions(retroseq_out, sample_name, chromosomes, support_threshold=config.PARAMS["read_support_threshold"], breakpoint_threshold=config.PARAMS["breakpoint_confidence_threshold"]) if len(insertions) >= 1: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="retroseq") insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="retroseq") diff --git a/scripts/retroseq/retroseq_run.py b/scripts/retroseq/retroseq_run.py index 5e62909..f8d00ce 100644 --- a/scripts/retroseq/retroseq_run.py +++ b/scripts/retroseq/retroseq_run.py @@ -46,7 +46,7 @@ def main(): bed_location_file = make_consensus_beds(elements, ref_name, ref_te_bed, taxonomy, out_dir) - run_retroseq(bam, bed_location_file, ref_fasta, script_dir, sample_name, out_dir, config.PARAMETERS, log=log) + run_retroseq(bam, bed_location_file, ref_fasta, script_dir, sample_name, out_dir, config.PARAMS, log=log) with open(status_log,"w") as l: l.write("COMPLETED\n") @@ -133,11 +133,31 @@ def make_consensus_beds(elements, ref_name, te_bed, taxon, out): def run_retroseq(bam, bed_locations, ref_fasta, script_dir, sample_name, out_dir, params, log=None): discovery_out = out_dir+"/"+sample_name+".discovery" - command = ["perl", script_dir+"/retroseq.pl", "-discover", "-bam", bam, "-refTEs", bed_locations, "-output", discovery_out, "-depth", str(params["depth"]), "-reads", str(params['reads']), "-q", str(params['q'])] + command = ["perl", script_dir+"/retroseq.pl", + "-discover", + "-bam", bam, + "-refTEs", bed_locations, + "-output", discovery_out] + + for param in params.keys(): + command.append(param) + command.append(str(params[param])) mccutils.run_command(command, log=log) call_out = out_dir+"/"+sample_name+".call" - command = ["perl", script_dir+"/retroseq.pl", "-call", "-bam", bam, "-input", discovery_out, "-filter", bed_locations, "-ref", ref_fasta, "-output", call_out, "-orientate", "yes", "-depth", str(params["depth"]), "-reads", str(params['reads']), "-q", str(params['q'])] + command = ["perl", script_dir+"/retroseq.pl", + "-call", + "-bam", bam, + "-input", discovery_out, + "-filter", bed_locations, + "-ref", ref_fasta, + "-output", call_out, + "-orientate", "yes"] + + for param in params.keys(): + command.append(param) + command.append(str(params[param])) + mccutils.run_command(command, log=log) diff --git a/scripts/tebreak/tebreak_post.py b/scripts/tebreak/tebreak_post.py new file mode 100644 index 0000000..c43ee33 --- /dev/null +++ b/scripts/tebreak/tebreak_post.py @@ -0,0 +1,102 @@ +import os +import sys +import subprocess +import importlib.util as il +spec = il.spec_from_file_location("config", snakemake.params.config) +config = il.module_from_spec(spec) +sys.modules[spec.name] = config +spec.loader.exec_module(config) +sys.path.append(snakemake.config['args']['mcc_path']) +import scripts.mccutils as mccutils +import scripts.output as output + + +def main(): + mccutils.log("tebreak","running tebreak post processing") + tebreak_out = snakemake.input.tebreak_out + ref_fasta = snakemake.input.ref_fasta + + out_dir = snakemake.params.out_dir + ref_name = snakemake.params.ref_name + sample_name = snakemake.params.sample_name + chromosomes = snakemake.params.chromosomes.split(",") + status_log = snakemake.params.status_log + + prev_steps_succeeded = mccutils.check_status_file(status_log) + if prev_steps_succeeded: + insertions = read_insertions(tebreak_out, sample_name, chromosomes, config) + + if len(insertions) > 0: + insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="tebreak") + insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="tebreak") + output.write_vcf(insertions, ref_fasta, sample_name, "tebreak", out_dir) + else: + mccutils.run_command(["touch", out_dir+"/"+sample_name+"_tebreak_redundant.bed"]) + mccutils.run_command(["touch", out_dir+"/"+sample_name+"_tebreak_nonredundant.bed"]) + else: + mccutils.run_command(["touch", out_dir+"/"+sample_name+"_tebreak_redundant.bed"]) + mccutils.run_command(["touch", out_dir+"/"+sample_name+"_tebreak_nonredundant.bed"]) + + mccutils.log("tebreak","tebreak postprocessing complete") + + +def read_insertions(tebreak_out, sample_name, chromosomes, config): + insertions = [] + header = {} + with open(tebreak_out, "r") as inf: + for ln,line in enumerate(inf): + line = line.replace("\n","") + split_line = line.split("\t") + if ln == 0: + for x,val in enumerate(split_line): + header[val] = x + else: + insert = output.Insertion(output.Tebreak()) + insert.chromosome = split_line[header['Chromosome']] + insert.start = int(split_line[header['3_Prime_End']])+1 + insert.end = int(split_line[header['5_Prime_End']]) + insert.family = split_line[header['Superfamily']] + insert.type = "non-reference" + if split_line[header['Orient_5p']] == split_line[header['Orient_3p']]: + insert.strand = split_line[header['Orient_5p']] + else: + insert.strand = "." + + if insert.strand == "-": + tmp = insert.start + insert.start = insert.end + insert.end = tmp + + insert.support_info.support["five_p_elt_match"].value = float(split_line[header['5p_Elt_Match']]) + insert.support_info.support["three_p_elt_match"].value = float(split_line[header['3p_Elt_Match']]) + insert.support_info.support["five_p_genome_match"].value = float(split_line[header['5p_Genome_Match']]) + insert.support_info.support["three_p_genome_match"].value = float(split_line[header['3p_Genome_Match']]) + insert.support_info.support["split_reads_5prime"].value = float(split_line[header['Split_reads_5prime']]) + insert.support_info.support["split_reads_3prime"].value = float(split_line[header['Split_reads_3prime']]) + insert.support_info.support["remapped_discordant"].value = float(split_line[header['Remapped_Discordant']]) + insert.support_info.support["remap_disc_fraction"].value = float(split_line[header['Remap_Disc_Fraction']]) + insert.support_info.support["remapped_splitreads"].value = float(split_line[header['Remapped_Splitreads']]) + insert.support_info.support["remap_split_fraction"].value = float(split_line[header['Remap_Split_Fraction']]) + + insert.name = insert.family+"|non-reference|NA|"+sample_name+"|tebreak|sr|" + + if ( + insert.chromosome in chromosomes and + insert.support_info.support["five_p_elt_match"].value >= config.MIN_5P_ELT_MATCH and + insert.support_info.support["three_p_elt_match"].value >= config.MIN_3P_ELT_MATCH and + insert.support_info.support["five_p_genome_match"].value >= config.MIN_5P_GENOME_MATCH and + insert.support_info.support["three_p_genome_match"].value >= config.MIN_3P_GENOME_MATCH and + insert.support_info.support["split_reads_5prime"].value >= config.MIN_SPLIT_READS_5P and + insert.support_info.support["split_reads_3prime"].value >= config.MIN_SPLIT_READS_3P and + insert.support_info.support["remapped_discordant"].value >= config.MIN_REMAPPED_DISCORDANT and + insert.support_info.support["remap_disc_fraction"].value >= config.MIN_REMAP_DISC_FRACTION and + insert.support_info.support["remapped_splitreads"].value >= config.MIN_REMAPPED_SPLITREADS and + insert.support_info.support["remap_split_fraction"].value >= config.MIN_REMAP_SPLIT_FRACTION + ): + insertions.append(insert) + + return insertions + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/tebreak/tebreak_run.py b/scripts/tebreak/tebreak_run.py new file mode 100644 index 0000000..2700cff --- /dev/null +++ b/scripts/tebreak/tebreak_run.py @@ -0,0 +1,91 @@ +import os +import sys +import subprocess +import traceback +import importlib.util as il +spec = il.spec_from_file_location("config", snakemake.params.config) +config = il.module_from_spec(spec) +sys.modules[spec.name] = config +spec.loader.exec_module(config) +sys.path.append(snakemake.config['args']['mcc_path']) +import scripts.mccutils as mccutils + + +def main(): + mccutils.log("tebreak","running tebreak") + consensus = snakemake.input.consensus_fasta + bam = snakemake.input.bam + reference = snakemake.input.ref_fasta + repeatmasker_out = snakemake.input.rm_out + + threads = snakemake.threads + + script_dir = snakemake.params.script_dir + out_dir = snakemake.params.out_dir + ref_name = snakemake.params.ref_name + sample_name = snakemake.params.sample_name + log = snakemake.params.log + status_log = snakemake.params.status_log + + try: + ref_tes = get_ref_tes(repeatmasker_out, out_dir) + run_tebreak(bam, consensus, reference, ref_tes, script_dir, out_dir, threads, config.PARAMS, log=log) + mccutils.check_file_exists(snakemake.output[0]) + except Exception as e: + track = traceback.format_exc() + print(track, file=sys.stderr) + with open(log,"a") as l: + print(track, file=l) + mccutils.log("tebreak","tebreak run failed") + with open(status_log,"w") as l: + l.write("FAILED\n") + + mccutils.run_command(["touch", snakemake.output[0]]) + + mccutils.log("tebreak","tebreak run complete") + +def get_ref_tes(rm_out, out): + ref_te_file = out+"/ref_tes.txt" + with open(rm_out, "r") as inf, open(ref_te_file, "w") as outf: + lines = [] + for ln,line in enumerate(inf): + if ln > 2: + parsed_line = [] + split_line = line.split(" ") + for val in split_line: + if val != "": + parsed_line.append(val) + lines.append(parsed_line) + + for split_line in lines: + chrom = split_line[4] + start = split_line[5] + end = split_line[6] + te_type = split_line[10] + te_fam = split_line[9] + strand = split_line[8] + + if strand == "C": + strand = "-" + + out_line = "\t".join([chrom, start, end, te_type, te_fam, strand]) + outf.write(out_line+"\n") + + return ref_te_file + +def run_tebreak(bam, consensus, reference, ref_tes, script_dir, out_dir, threads, params, log=None): + os.chdir(out_dir) + command = ["samtools", "index", bam] + mccutils.run_command(command, log=log) + command = [script_dir+"/tebreak", "-b", bam, '-r', reference, '-p', str(threads), '-d', ref_tes, '-i', consensus] + for key in params.keys(): + if params[key] != False: + if params[key] == True: + command.append(key) + else: + command.append(key) + command.append(params[key]) + mccutils.run_command(command, log=log) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/teflon/teflon_post.py b/scripts/teflon/teflon_post.py index c98d23a..c2f63d6 100644 --- a/scripts/teflon/teflon_post.py +++ b/scripts/teflon/teflon_post.py @@ -34,11 +34,11 @@ def main(): chromosomes, sample_name, ref_tes, - min_presence=config.PARAMETERS['min_presence_reads'], - max_absence=config.PARAMETERS['max_absence_reads'], - min_presence_fraction=config.PARAMETERS['min_presence_fraction'], - require_tsd=config.PARAMETERS['require_tsd'], - require_both_breakpoints=config.PARAMETERS['require_both_breakpoints'] + min_presence=config.PARAMS['min_presence_reads'], + max_absence=config.PARAMS['max_absence_reads'], + min_presence_fraction=config.PARAMS['min_presence_fraction'], + require_tsd=config.PARAMS['require_tsd'], + require_both_breakpoints=config.PARAMS['require_both_breakpoints'] ) if len(insertions) >= 1: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="teflon") diff --git a/scripts/teflon/teflon_run.py b/scripts/teflon/teflon_run.py index 46d991a..7cecd38 100644 --- a/scripts/teflon/teflon_run.py +++ b/scripts/teflon/teflon_run.py @@ -37,13 +37,13 @@ def main(): sample_table, threads=threads, log=log, - quality_threshold=config.PARAMETERS['q'], - stdev=config.PARAMETERS['sd'], - cov=config.PARAMETERS['cov'], - te_support1=config.PARAMETERS['n1'], - te_support2=config.PARAMETERS['n2'], - read_count_lower_threshold=config.PARAMETERS['lt'], - read_count_higher_threshold=config.PARAMETERS['ht'] + quality_threshold=config.PARAMS['-q'], + stdev=config.PARAMS['-sd'], + cov=config.PARAMS['-cov'], + te_support1=config.PARAMS['-n1'], + te_support2=config.PARAMS['-n2'], + read_count_lower_threshold=config.PARAMS['-lt'], + read_count_higher_threshold=config.PARAMS['-ht'] ) mccutils.check_file_exists(snakemake.output[0]) diff --git a/scripts/telocate/telocate_post.py b/scripts/telocate/telocate_post.py index 8ca193e..e018c19 100644 --- a/scripts/telocate/telocate_post.py +++ b/scripts/telocate/telocate_post.py @@ -25,7 +25,7 @@ def main(): prev_steps_succeeded = mccutils.check_status_file(status_log) if prev_steps_succeeded: - insertions = read_insertions(telocate_raw, sample_name, chromosomes, rp_threshold=config.READ_PAIR_SUPPORT_THRESHOLD) + insertions = read_insertions(telocate_raw, sample_name, chromosomes, rp_threshold=config.PARAMS['read_pair_support_threshold']) insertions = filter_by_reference(insertions, te_gff) if len(insertions) > 0: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="telocate") diff --git a/scripts/telocate/telocate_run.py b/scripts/telocate/telocate_run.py index 107a4c3..91e371e 100644 --- a/scripts/telocate/telocate_run.py +++ b/scripts/telocate/telocate_run.py @@ -46,9 +46,9 @@ def main(): median_insert_size = mccutils.get_median_insert_size(median_insert_size_file) - distance = (median_insert_size * config.MIN_DISTANCE) + distance = (median_insert_size * config.PARAMS["min_distance"]) - command = ["perl", telocate, str(config.MAX_MEM), sam_dir, te_gff, ref_fasta, out_dir, str(distance), str(config.MIN_SUPPORT_READS), str(config.MIN_SUPPORT_INDIVIDUALS)] + command = ["perl", telocate, str(config.PARAMS["max_mem"]), sam_dir, te_gff, ref_fasta, out_dir, str(distance), str(config.PARAMS["min_support_reads"]), str(config.PARAMS["min_support_individuals"])] mccutils.run_command(command, log=log) diff --git a/scripts/temp2/temp2_post.py b/scripts/temp2/temp2_post.py index 7d94948..031400a 100644 --- a/scripts/temp2/temp2_post.py +++ b/scripts/temp2/temp2_post.py @@ -78,8 +78,8 @@ def read_insertions(insert_bed, sample_name, chromosomes, config): if ( insert.chromosome in chromosomes and - insert.support_info.support["frequency"].value >= config.FREQUENCY_THRESHOLD and - insert.support_info.support["class"].value in config.ACCEPTABLE_INSERTION_SUPPORT_CLASSES + insert.support_info.support["frequency"].value >= config.PARAMS["frequency_threshold"] and + insert.support_info.support["class"].value in config.PARAMS["acceptable_insertion_support_classes"] ): insertions.append(insert) diff --git a/scripts/temp2/temp2_run.py b/scripts/temp2/temp2_run.py index a732fae..62e9c1c 100644 --- a/scripts/temp2/temp2_run.py +++ b/scripts/temp2/temp2_run.py @@ -89,21 +89,15 @@ def run_temp2_insertion(fq1, fq2, bam, insert_size, reference, scripts, consensu "-t", te_bed, "-c", str(threads), "-f", str(insert_size), - "-o", out, - "-M", str(config.GENOME_MISMATCH_PCT), - "-m", str(config.TE_MISMATCH_PCT), - "-U", str(config.RATIO), - "-N", str(config.FILTER_WINDOW), + "-o", out ] - if config.TRUNCATED: - command.append("-T") - - if config.LOOSE_FILTER: - command.append("-L") - - if config.SKIP_INS_LEN_CHECK: - command.append("-S") + for param in config.PARAMS['insertion'].keys(): + if config.PARAMS['insertion'][param] == True: + command.append(param) + elif config.PARAMS['insertion'][param] != False: + command.append(param) + command.append(str(config.PARAMS['insertion'][param])) mccutils.run_command(command, log=log) @@ -116,10 +110,13 @@ def run_temp2_absence(scripts, bam, reference_2bit, te_bed, insert_size, threads "-t", reference_2bit, "-f", str(insert_size), "-c", str(threads), - "-o", out, - "-x", str(config.UNIQ_MAP_SCORE) + "-o", out ] + for param in config.PARAMS['absence'].keys(): + command.append(param) + command.append(str(config.PARAMS['absence'])) + mccutils.run_command(command, log=log) if __name__ == "__main__": diff --git a/scripts/tepid/tepid_post.py b/scripts/tepid/tepid_post.py index fae13a1..2477a56 100644 --- a/scripts/tepid/tepid_post.py +++ b/scripts/tepid/tepid_post.py @@ -2,10 +2,14 @@ import sys import subprocess import traceback +import importlib.util as il +spec = il.spec_from_file_location("config", snakemake.params.config) +config = il.module_from_spec(spec) +sys.modules[spec.name] = config +spec.loader.exec_module(config) sys.path.append(snakemake.config['args']['mcc_path']) import scripts.mccutils as mccutils -import config.tepid.tepid_post as config - +import scripts.output as output def main(): insertions_bed = snakemake.input.insertions_bed @@ -14,30 +18,45 @@ def main(): deletions_support = snakemake.input.deletions_support te_gff = snakemake.input.te_gff te_taxonomy = snakemake.input.te_taxonomy + reference_fasta = snakemake.input.reference_fasta + chromosomes = snakemake.params.chromosomes.split(",") + status_log = snakemake.params.status_log sample_name = snakemake.params.sample_name out_dir = snakemake.params.out_dir + + prev_steps_succeeded = mccutils.check_status_file(status_log) + mccutils.log("tepid","running TEPID post processing") - te_to_family = get_te_family_map(te_taxonomy) - te_pos_to_family = get_te_pos_family_map(te_gff, te_to_family) - insertions = read_insertions(insertions_bed, te_to_family, sample_name, te_pos_to_family, chromosomes, reference=False) - insertions = add_support(insertions, insertions_support, threshold=config.READ_SUPPORT_THRESHOLD) - - deletions = read_insertions(deletions_bed, te_to_family, sample_name, te_pos_to_family, chromosomes, reference=True) - deletions = add_support(deletions, deletions_support, threshold=config.READ_SUPPORT_THRESHOLD) - non_abs_ref_insertions = get_non_absent_ref_tes(deletions, te_gff, te_to_family, sample_name) - - insertions += non_abs_ref_insertions - if len(insertions) > 0: - mccutils.make_redundant_bed(insertions, sample_name, out_dir, method="tepid") - mccutils.make_nonredundant_bed(insertions, sample_name, out_dir, method="tepid") + + if prev_steps_succeeded: + te_to_family = get_te_family_map(te_taxonomy) + te_pos_to_family = get_te_pos_family_map(te_gff, te_to_family) + insertions = read_insertions(insertions_bed, te_to_family, sample_name, te_pos_to_family, chromosomes, reference=False) + insertions = add_support(insertions, insertions_support, threshold=config.READ_SUPPORT_THRESHOLD) + + deletions = read_insertions(deletions_bed, te_to_family, sample_name, te_pos_to_family, chromosomes, reference=True) + deletions = add_support(deletions, deletions_support, threshold=config.READ_SUPPORT_THRESHOLD) + non_abs_ref_insertions = get_non_absent_ref_tes(deletions, te_gff, te_to_family, sample_name) + + insertions += non_abs_ref_insertions + if len(insertions) > 0: + insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="tepid") + insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="tepid") + output.write_vcf(insertions, reference_fasta, sample_name, "tepid", out_dir) + else: + mccutils.run_command(["touch",out_dir+"/"+sample_name+"_tepid_redundant.bed"]) + mccutils.run_command(["touch",out_dir+"/"+sample_name+"_tepid_nonredundant.bed"]) + + else: - mccutils.run_command(["touch",out_dir+"/"+sample_name+"_tepid_redundant.bed"]) - mccutils.run_command(["touch",out_dir+"/"+sample_name+"_tepid_nonredundant.bed"]) - + mccutils.run_command(["touch",out_dir+"/"+sample_name+"_tepid_redundant.bed"]) + mccutils.run_command(["touch",out_dir+"/"+sample_name+"_tepid_nonredundant.bed"]) + mccutils.log("tepid","TEPID post processing complete") + def get_te_family_map(taxonomy): te_to_family = {} @@ -71,7 +90,7 @@ def read_insertions(bed, te_to_family, sample_name, te_pos_to_family, chromosome inserts = [] with open(bed,"r") as b: for line in b: - insert = mccutils.Insertion() + insert = output.Insertion(output.Tepid()) split_line = line.split("\t") insert.chromosome = split_line[0] if insert.chromosome in chromosomes: @@ -83,16 +102,16 @@ def read_insertions(bed, te_to_family, sample_name, te_pos_to_family, chromosome insert.family = te_to_family[te_name] insert.strand = split_line[3] insert.type = "reference" - insert.name = insert.family+"|reference|"+sample_name+"|tepid|nonab|" + insert.name = insert.family+"|reference|NA|"+sample_name+"|tepid|nonab|" else: te_chrom = split_line[3] te_start = split_line[4] te_end = split_line[5] insert.family = te_pos_to_family[te_chrom+"_"+te_start+"_"+te_end] insert.type = "non-reference" - insert.name = insert.family+"|non-reference|"+sample_name+"|tepid|" + insert.name = insert.family+"|non-reference|NA|"+sample_name+"|tepid|sr|" - insert.tepid.id = split_line[-1].replace("\n","") + insert.support_info.id = split_line[-1].replace("\n","") inserts.append(insert) return inserts @@ -110,8 +129,8 @@ def add_support(inserts, support_file, threshold=0): support[support_id] = support_val for insert in inserts: - insert.tepid.support = support[insert.tepid.id] - if insert.tepid.support >= threshold: + insert.support_info.support['supporting_reads'].value = support[insert.support_info.id] + if insert.support_info.support['supporting_reads'].value >= threshold: filtered_inserts.append(insert) @@ -122,7 +141,7 @@ def get_non_absent_ref_tes(deletions, te_gff, te_to_family, sample_name): ref_tes = [] with open(te_gff, "r") as gff: for line in gff: - ref_te = mccutils.Insertion() + ref_te = output.Insertion(output.Tepid()) split_line = line.split("\t") ref_te.chromosome = split_line[0] ref_te.start = int(split_line[3]) @@ -137,7 +156,7 @@ def get_non_absent_ref_tes(deletions, te_gff, te_to_family, sample_name): ref_te.family = te_to_family[te_id] ref_te.type = "reference" - ref_te.name = ref_te.family+"|reference|"+sample_name+"|tepid|nonab|" + ref_te.name = ref_te.family+"|reference|NA|"+sample_name+"|tepid|nonab|" ref_tes.append(ref_te) absent = [] diff --git a/scripts/tepid/tepid_run.py b/scripts/tepid/tepid_run.py index 50ff6f0..bf2fe80 100644 --- a/scripts/tepid/tepid_run.py +++ b/scripts/tepid/tepid_run.py @@ -2,6 +2,11 @@ import sys import subprocess import traceback +import importlib.util as il +spec = il.spec_from_file_location("config", snakemake.params.config) +config = il.module_from_spec(spec) +sys.modules[spec.name] = config +spec.loader.exec_module(config) sys.path.append(snakemake.config['args']['mcc_path']) import scripts.mccutils as mccutils @@ -19,35 +24,52 @@ def main(): ref_name = snakemake.params.ref_name out_dir = snakemake.params.out_dir log = snakemake.params.log + status_log = snakemake.params.status_log - # ensures intermediate files from previous runs are removed - for f in os.listdir(out_dir): - mccutils.remove(out_dir+"/"+f) - median_insert_size = "" - with open(median_insert_size_file,"r") as median_insert: - for line in median_insert: - line = line.split("=")[1].replace("\n","") - median_insert_size = line + try: + # ensures intermediate files from previous runs are removed + for f in os.listdir(out_dir): + mccutils.remove(out_dir+"/"+f) + + median_insert_size = "" + with open(median_insert_size_file,"r") as median_insert: + for line in median_insert: + line = line.split("=")[1].replace("\n","") + median_insert_size = line + + is_paired = True + if raw_fq2 == "False": + is_paired = False + + mccutils.log("tepid","indexing reference", log=log) + index_ref(ref_fasta, ref_name, out_dir, log=log) + mccutils.log("tepid","making TEPID reference TE bed") + te_bed = make_te_bed(te_gff, te_taxonomy, out_dir) + + mccutils.log("tepid","mapping reads", log=log) + if is_paired: + bam, split_bam = map_reads(fq1, fq2, ref_name, median_insert_size, out_dir, threads=threads, paired=True, log=log) + else: + bam, split_bam = map_reads(fq1, fq2, ref_name, median_insert_size, out_dir, threads=threads, paired=False, log=log) - is_paired = True - if raw_fq2 == "False": - is_paired = False + mccutils.log("tepid","discovering variants", log=log) + discover_variants(ref_name, bam, split_bam, te_bed, out_dir, threads=threads, log=log) + mccutils.log("tepid","TEPID run complete") - mccutils.log("tepid","indexing reference", log=log) - index_ref(ref_fasta, ref_name, out_dir, log=log) - mccutils.log("tepid","making TEPID reference TE bed") - te_bed = make_te_bed(te_gff, te_taxonomy, out_dir) - - mccutils.log("tepid","mapping reads", log=log) - if is_paired: - bam, split_bam = map_reads(fq1, fq2, ref_name, median_insert_size, out_dir, threads=threads, paired=True, log=log) - else: - bam, split_bam = map_reads(fq1, fq2, ref_name, median_insert_size, out_dir, threads=threads, paired=False, log=log) + except Exception as e: + track = traceback.format_exc() + print(track, file=sys.stderr) + with open(log,"a") as l: + print(track, file=l) + mccutils.log("tepid","TEPID run failed") + with open(status_log,"w") as l: + l.write("FAILED\n") - mccutils.log("tepid","discovering variants", log=log) - discover_variants(ref_name, bam, split_bam, te_bed, out_dir, threads=threads, log=log) - mccutils.log("tepid","TEPID run complete") + mccutils.run_command(["touch", snakemake.output[0]]) + mccutils.run_command(["touch", snakemake.output[1]]) + mccutils.run_command(["touch", snakemake.output.insertions_support]) + mccutils.run_command(["touch", snakemake.output.deletions_support]) def index_ref(fasta, ref_name, out, log=None): try: @@ -97,64 +119,50 @@ def make_te_bed(gff, taxonomy, out): return te_bed def map_reads(fq1, fq2, ref_name, median_insert_size, out, threads=1, paired=True, log=None): - try: - os.chdir(out) - if paired: - command = ["tepid-map", - "-x", out+"/"+ref_name, - "-y", out+"/"+ref_name+".X15_01_65525S", - "-p", str(threads), - "-s", median_insert_size, - "-n", ref_name, - "-1", fq1, - "-2", fq2 ] - else: - command = ["tepid-map-se", - "-x", out+"/"+ref_name, - "-y", out+"/"+ref_name+".X15_01_65525S", - "-p", str(threads), - "-n", ref_name, - "-q", fq1] + os.chdir(out) + if paired: + command = ["tepid-map", + "-x", out+"/"+ref_name, + "-y", out+"/"+ref_name+".X15_01_65525S", + "-p", str(threads), + "-s", median_insert_size, + "-n", ref_name, + "-1", fq1, + "-2", fq2 ] + else: + command = ["tepid-map-se", + "-x", out+"/"+ref_name, + "-y", out+"/"+ref_name+".X15_01_65525S", + "-p", str(threads), + "-n", ref_name, + "-q", fq1] - mccutils.run_command(command, log=log) + mccutils.run_command(command, log=log) - bam = out+"/"+ref_name+".bam" - split_bam = out+"/"+ref_name+".split.bam" - mccutils.check_file_exists(bam) - mccutils.check_file_exists(split_bam) - - except Exception as e: - track = traceback.format_exc() - print(track, file=sys.stderr) - print("ERROR...Failed to run TEPID mapping step...exiting...", file=sys.stderr) - sys.exit(1) + bam = out+"/"+ref_name+".bam" + split_bam = out+"/"+ref_name+".split.bam" + mccutils.check_file_exists(bam) + mccutils.check_file_exists(split_bam) return bam, split_bam def discover_variants(ref_name, bam, split_bam, te_bed, out, threads=1, log=None): - try: - os.chdir(out) - command = ["tepid-discover", - "-p", str(threads), - "-n", ref_name, - "-c", bam, - "-s", split_bam, - "-t", te_bed] - - mccutils.run_command(command, log=log) - - if not os.path.exists(snakemake.output[0]): - mccutils.run_command(["touch", snakemake.output[0]]) - if not os.path.exists(snakemake.output[2]): - mccutils.run_command(["touch", snakemake.output[2]]) - mccutils.check_file_exists(snakemake.output[1]) - - except Exception as e: - track = traceback.format_exc() - print(track, file=sys.stderr) - print("ERROR...Failed to run TEPID discover step...exiting...", file=sys.stderr) - sys.exit(1) + os.chdir(out) + command = ["tepid-discover", + "-p", str(threads), + "-n", ref_name, + "-c", bam, + "-s", split_bam, + "-t", te_bed] + + mccutils.run_command(command, log=log) + + if not os.path.exists(snakemake.output[0]): + mccutils.run_command(["touch", snakemake.output[0]]) + if not os.path.exists(snakemake.output[2]): + mccutils.run_command(["touch", snakemake.output[2]]) + mccutils.check_file_exists(snakemake.output[1]) if __name__ == "__main__": main() \ No newline at end of file diff --git a/snakefiles/jitterbug.snakefile b/snakefiles/jitterbug.snakefile index 4332d59..29db847 100644 --- a/snakefiles/jitterbug.snakefile +++ b/snakefiles/jitterbug.snakefile @@ -11,7 +11,9 @@ rule jitterbug_run: out_dir = config['args']['out']+"results/jitterbug/unfiltered/", script_dir = config['args']['mcc_path']+"/install/tools/jitterbug/", sample_name = config['args']['sample_name'], - log=config['args']['log_dir']+"jitterbug.log" + log=config['args']['log_dir']+"jitterbug.log", + config = config['config']['jitterbug']['files'][0], + status_log = config['status']['jitterbug'] output: out = config['args']['out']+"results/jitterbug/unfiltered/"+config['args']['sample_name']+".TE_insertions_paired_clusters.filtered.gff3" @@ -22,7 +24,8 @@ rule jitterbug_run: rule jitterbug_post: input: jitterbug_out = config['args']['out']+"results/jitterbug/unfiltered/"+config['args']['sample_name']+".TE_insertions_paired_clusters.filtered.gff3", - taxonomy = config['mcc']['taxonomy'] + taxonomy = config['mcc']['taxonomy'], + reference_fasta = config['mcc']['reference'] threads: 1 @@ -32,7 +35,9 @@ rule jitterbug_post: out_dir = config['args']['out']+"results/jitterbug/", log=config['args']['log_dir']+"jitterbug.log", sample_name = config['args']['sample_name'], - chromosomes = config['args']['chromosomes'] + chromosomes = config['args']['chromosomes'], + config = config['config']['jitterbug']['files'][1], + status_log = config['status']['jitterbug'] output: out = config['out']['jitterbug'] diff --git a/snakefiles/tebreak.snakefile b/snakefiles/tebreak.snakefile new file mode 100644 index 0000000..72d6da8 --- /dev/null +++ b/snakefiles/tebreak.snakefile @@ -0,0 +1,48 @@ +rule tebreak_run: + input: + consensus_fasta = config['mcc']['consensus'], + bam = config['mcc']['bam'], + ref_fasta = config['mcc']['reference'], + rm_out = config['mcc']['repeatmasker_out'] + + threads: config['args']['max_threads_per_rule'] + + conda: config['envs']['tebreak'] + + params: + script_dir = config['args']['mcc_path']+"/install/tools/tebreak/tebreak/", + out_dir = config['outdir']['tebreak']+"unfiltered/", + ref_name=config['args']['ref_name'], + sample_name=config['args']['sample_name'], + log = config['args']['log_dir']+"tebreak.log", + config = config['config']['tebreak']['files'][0], + status_log = config['status']['tebreak'] + + output: + config['outdir']['tebreak']+"unfiltered/"+config['args']['sample_name']+".sorted.tebreak.table.txt" + + script: + config['args']['mcc_path']+"/scripts/tebreak/tebreak_run.py" + +rule tebreak_post: + input: + tebreak_out = config['outdir']['tebreak']+"unfiltered/"+config['args']['sample_name']+".sorted.tebreak.table.txt", + ref_fasta = config['mcc']['reference'] + + threads: 1 + + conda: config['envs']['processing'] + + params: + out_dir = config['outdir']['tebreak'], + ref_name=config['args']['ref_name'], + sample_name=config['args']['sample_name'], + chromosomes = config['args']['chromosomes'], + config = config['config']['tebreak']['files'][1], + status_log = config['status']['tebreak'] + + output: + config['out']['tebreak'] + + script: + config['args']['mcc_path']+"/scripts/tebreak/tebreak_post.py" \ No newline at end of file diff --git a/snakefiles/tepid.snakefile b/snakefiles/tepid.snakefile index a877429..54d2134 100644 --- a/snakefiles/tepid.snakefile +++ b/snakefiles/tepid.snakefile @@ -15,7 +15,9 @@ rule tepid_run: raw_fq2 = config['in']['fq2'], ref_name = config['args']['ref_name'], out_dir = config['args']['out']+"/results/tepid/unfiltered/", - log = config['args']['log_dir']+"tepid.log" + log = config['args']['log_dir']+"tepid.log", + config = config['config']['tepid']['files'][0], + status_log = config['status']['tepid'] output: config['args']['out']+"results/tepid/unfiltered/insertions_"+config['args']['ref_name']+".bed", @@ -33,7 +35,8 @@ rule tepid_post: insertions_support = config['args']['out']+"results/tepid/unfiltered/insertion_reads_"+config['args']['ref_name']+".txt", deletions_support = config['args']['out']+"results/tepid/unfiltered/deletion_reads_"+config['args']['ref_name']+".txt", te_gff = config['mcc']['locations'], - te_taxonomy = config['mcc']['taxonomy'] + te_taxonomy = config['mcc']['taxonomy'], + reference_fasta = config['mcc']['reference'] threads: 1 @@ -42,7 +45,9 @@ rule tepid_post: params: sample_name = config['args']['sample_name'], out_dir = config['args']['out']+"/results/tepid/", - chromosomes = config['args']['chromosomes'] + chromosomes = config['args']['chromosomes'], + config = config['config']['tepid']['files'][1], + status_log = config['status']['tepid'] output: config['out']['tepid']