From 2df245693696687efe8845af01af4d40c995f3cd Mon Sep 17 00:00:00 2001 From: pbasting Date: Thu, 13 May 2021 16:42:23 -0400 Subject: [PATCH 01/13] basic installation and functionality of tebreak added --- Snakefile | 5 +- config/tebreak/tebreak_post.py | 0 config/tebreak/tebreak_run.py | 0 install/Snakefile | 15 +++++ install/envs/mcc_map_reads.yml | 1 + install/envs/mcc_tebreak.yml | 20 +++++++ install/scripts/tebreak.py | 43 ++++++++++++++ internal/install.py | 13 +++-- internal/sysconfig.py | 26 ++++++--- scripts/output.py | 17 ++++++ scripts/preprocessing/map_reads.py | 2 +- scripts/preprocessing/repeatmask.py | 4 +- scripts/preprocessing/sam_to_bam.py | 25 +++++++- scripts/tebreak/tebreak_post.py | 90 +++++++++++++++++++++++++++++ scripts/tebreak/tebreak_run.py | 84 +++++++++++++++++++++++++++ snakefiles/tebreak.snakefile | 48 +++++++++++++++ 16 files changed, 376 insertions(+), 17 deletions(-) create mode 100644 config/tebreak/tebreak_post.py create mode 100644 config/tebreak/tebreak_run.py create mode 100644 install/envs/mcc_tebreak.yml create mode 100644 install/scripts/tebreak.py create mode 100644 scripts/tebreak/tebreak_post.py create mode 100644 scripts/tebreak/tebreak_run.py create mode 100644 snakefiles/tebreak.snakefile diff --git a/Snakefile b/Snakefile index d35dfd8..d12e3b5 100644 --- a/Snakefile +++ b/Snakefile @@ -11,6 +11,7 @@ include: config['args']['mcc_path']+"/snakefiles/retroseq.snakefile" include: config['args']['mcc_path']+"/snakefiles/tepid.snakefile" include: config['args']['mcc_path']+"/snakefiles/teflon.snakefile" include: config['args']['mcc_path']+"/snakefiles/jitterbug.snakefile" +include: config['args']['mcc_path']+"/snakefiles/tebreak.snakefile" rule setup_reads: input: @@ -223,7 +224,9 @@ rule sam_to_bam: output: bam = config['mcc']['bam'], flagstat = config['mcc']['flagstat'], - tmp_bam = temp(config['mcc']['mcc_files']+config['args']['run_id']+".tmp.bam") + metrics = config['mcc']['mcc_files']+config['args']['run_id']+".metrics", + tmp_bam = temp(config['mcc']['mcc_files']+config['args']['run_id']+".tmp.bam"), + tmp2_bam = temp(config['mcc']['mcc_files']+config['args']['run_id']+".tmp2.bam") script: config['args']['mcc_path']+"/scripts/preprocessing/sam_to_bam.py" diff --git a/config/tebreak/tebreak_post.py b/config/tebreak/tebreak_post.py new file mode 100644 index 0000000..e69de29 diff --git a/config/tebreak/tebreak_run.py b/config/tebreak/tebreak_run.py new file mode 100644 index 0000000..e69de29 diff --git a/install/Snakefile b/install/Snakefile index 022372f..4b7c346 100644 --- a/install/Snakefile +++ b/install/Snakefile @@ -221,6 +221,21 @@ rule jitterbug: script: config['paths']['install']+"scripts/jitterbug.py" +rule tebreak: + params: + url = config['URLs']['tebreak'], + md5 = config['MD5s']['tebreak'], + zipfile = config['paths']['install']+"tools/tebreak/tebreak.zip", + log = config['paths']['log_dir']+"install.log", + + output: + config['output']['tebreak'] + + conda: config['ENVs']['tebreak'] + + script: + config['paths']['install']+"scripts/tebreak.py" + rule install_all: input: config['output']['te-locate'], diff --git a/install/envs/mcc_map_reads.yml b/install/envs/mcc_map_reads.yml index e8ca860..144bbc2 100644 --- a/install/envs/mcc_map_reads.yml +++ b/install/envs/mcc_map_reads.yml @@ -12,3 +12,4 @@ dependencies: - perl-bioperl-run=1.006900 - bwa=0.7.4 - jinja2=2.10.1 + - picard=2.25.4 diff --git a/install/envs/mcc_tebreak.yml b/install/envs/mcc_tebreak.yml new file mode 100644 index 0000000..76af78b --- /dev/null +++ b/install/envs/mcc_tebreak.yml @@ -0,0 +1,20 @@ +name: tebreak +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - python=3.8 + - bwa=0.7.17 + - pysam=0.16.0.1 + - scipy=1.6.3 + - bx-python=0.8.11 + - scikit-bio=0.5.6 + - last=1238 + - htslib=1.12 + - samtools=1.12 + - bcftools=1.12 + - minia=3.2.4 + - exonerate=2.4.0 + - lockfile=0.12.2 + - ipython_genutils=0.2.0 \ No newline at end of file diff --git a/install/scripts/tebreak.py b/install/scripts/tebreak.py new file mode 100644 index 0000000..a35e3c3 --- /dev/null +++ b/install/scripts/tebreak.py @@ -0,0 +1,43 @@ +import sys +import os +sys.path.append(snakemake.config['paths']['mcc_path']) +import scripts.mccutils as mccutils + +def main(): + install_path = snakemake.config['paths']['install']+"/tools/" + + raw_name="tebreak-3f00badda822dcb6390bbea5a6a1e233cff3e99c" + method_name = "tebreak" + + mccutils.remove(snakemake.params.zipfile) + download_success = mccutils.download(snakemake.params.url, snakemake.params.zipfile, md5=snakemake.params.md5, max_attempts=3) + + if not download_success: + print(method_name+" download failed... exiting...") + print("try running --install with --clean for clean installation") + sys.exit(1) + + mccutils.remove(snakemake.config['paths']['install']+raw_name) + command = ["unzip", snakemake.params.zipfile] + mccutils.run_command(command, log=snakemake.params.log) + + mccutils.remove(install_path+raw_name) + command = ["mv", snakemake.config['paths']['install']+raw_name, install_path] + mccutils.run_command(command, log=snakemake.params.log) + + mccutils.remove(install_path+method_name) + mccutils.mkdir(install_path+method_name) + for f in os.listdir(install_path+raw_name): + command = ["mv", install_path+raw_name+"/"+f, install_path+method_name] + mccutils.run_command(command, log=snakemake.params.log) + + + mccutils.remove(install_path+raw_name) + mccutils.remove(snakemake.params.zipfile) + + # write version to file + with open(install_path+"/"+method_name+"/version.log","w") as version: + version.write(snakemake.params.md5) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/internal/install.py b/internal/install.py index 916d094..8e3edad 100644 --- a/internal/install.py +++ b/internal/install.py @@ -11,7 +11,8 @@ "relocate2" : "https://github.com/stajichlab/RelocaTE2/archive/v2.0.1.zip", "tepid" : "https://github.com/ListerLab/TEPID/archive/ad46d65b5c41bf8a9171215d49b3ffaecdceaab0.zip", "teflon": "https://github.com/jradrion/TEFLoN/archive/3e2d67886b70644fd1f7d79263b3c8dbed639e46.zip", - "jitterbug": "https://github.com/elzbth/jitterbug/archive/b6b3f9c7ee4af042d4410137269f174a1399b752.zip" + "jitterbug": "https://github.com/elzbth/jitterbug/archive/b6b3f9c7ee4af042d4410137269f174a1399b752.zip", + "tebreak": "https://github.com/adamewing/tebreak/archive/3f00badda822dcb6390bbea5a6a1e233cff3e99c.zip" } MD5 = { @@ -27,7 +28,8 @@ "relocate2" : "63eb7dbc09ffeb4063aef4857d198b06", "tepid" : "51f61acf138b07842074f054aba780d5", "teflon" : "a29d2e7411fd06778186ba322f51fdc9", - "jitterbug": "05f165fb6a79a7a611d09981a355d27e" + "jitterbug": "05f165fb6a79a7a611d09981a355d27e", + "tebreak": "ac475d3cc4b0435f0a333c8d86f57d3f" } @@ -50,7 +52,8 @@ "setup_reads": ENV_PATH+"mcc_trimgalore.yml", "processing": ENV_PATH+"mcc_map_reads.yml", "map_reads": ENV_PATH+"mcc_map_reads.yml", - "trimgalore": ENV_PATH+"mcc_trimgalore.yml" + "trimgalore": ENV_PATH+"mcc_trimgalore.yml", + "tebreak": ENV_PATH+"mcc_tebreak.yml" } @@ -71,5 +74,7 @@ "teflon" : INSTALL_PATH+"tools/teflon/teflon.v0.4.py", "jitterbug": INSTALL_PATH+"tools/jitterbug/jitterbug.py", "map_reads" : INSTALL_PATH+"tools/map_reads/map_reads.log", - "trimgalore" : INSTALL_PATH+"tools/trimgalore/trimgalore.log" + "trimgalore" : INSTALL_PATH+"tools/trimgalore/trimgalore.log", + "tebreak" : INSTALL_PATH+"tools/tebreak/tebreak/tebreak" + } diff --git a/internal/sysconfig.py b/internal/sysconfig.py index 91104f3..8a9a91e 100644 --- a/internal/sysconfig.py +++ b/internal/sysconfig.py @@ -1,6 +1,6 @@ -ALL_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "relocate2", "temp", "temp2", "retroseq", "popoolationte", "popoolationte2", "te-locate", "teflon", "coverage", "trimgalore","map_reads"] #"ngs_te_mapper2", -SINGLE_END_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "coverage", "trimgalore", "map_reads"] #"ngs_te_mapper2", -MULTI_THREAD_METHODS = ["coverage", "temp", "temp2", "relocate2", "ngs_te_mapper", "ngs_te_mapper2", "popoolationte", "teflon", "trimgalore"] #"ngs_te_mapper2", +ALL_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "relocate2", "temp", "temp2", "retroseq", "popoolationte", "popoolationte2", "te-locate", "teflon", "coverage", "trimgalore","map_reads", "tebreak"] +SINGLE_END_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "coverage", "trimgalore", "map_reads", "tebreak"] +MULTI_THREAD_METHODS = ["coverage", "temp", "temp2", "relocate2", "ngs_te_mapper", "ngs_te_mapper2", "popoolationte", "teflon", "trimgalore", "tebreak"] NO_INSTALL_METHODS = ["trimgalore", "map_reads", "coverage", "relocate2"] # no source code to install for these methods, just envs INPUT_DIR = "{{indir}}" @@ -23,7 +23,8 @@ "teflon": ["teflon/teflon_run.py", "teflon/teflon_post.py"], "te-locate": ["telocate/telocate_run.py", "telocate/telocate_post.py"], "temp": ["temp/temp_run.py", "temp/temp_post.py"], - "temp2": ["temp2/temp2_run.py", "temp2/temp2_post.py"] + "temp2": ["temp2/temp2_run.py", "temp2/temp2_post.py"], + "tebreak": ["tebreak/tebreak_run.py", "tebreak/tebreak_post.py"] } # rules to re-run if specific config files change @@ -40,7 +41,8 @@ "teflon": ["teflon_run", "teflon_post"], "te-locate": ["telocate_run", "telocate_post"], "temp": ["run_temp", "process_temp"], - "temp2": ["run_temp2", "process_temp2"] + "temp2": ["run_temp2", "process_temp2"], + "tebreak": ["tebreak_run", "tebreak_post"] } LOG_DIR = "{{logdir}}" @@ -57,6 +59,7 @@ "te-locate": LOG_DIR+"status/telocate.status", "temp": LOG_DIR+"status/temp.status", "temp2": LOG_DIR+"status/temp2.status", + "tebreak": LOG_DIR+"status/tebreak.status" } @@ -86,6 +89,7 @@ 'fq2' : SAM_DIR+"intermediate/fastq/"+SAMPLE_NAME+"_2.fq", 'sam' : SAM_DIR+"intermediate/mapped_reads/"+SAMPLE_NAME+".sam", 'bam' : SAM_DIR+"intermediate/mapped_reads/"+SAMPLE_NAME+".sorted.bam", + 'duplicate_maked_bam' : SAM_DIR+"intermediate/mapped_reads/"+SAMPLE_NAME+".sorted.duplicate_marked.bam", 'telocate_sam' : SAM_DIR+"intermediate/mapped_reads/"+SAMPLE_NAME+".telocate.sam", 'flagstat' : SAM_DIR+"intermediate/mapped_reads/"+SAMPLE_NAME+".bam.flagstat", 'median_insert_size' : SAM_DIR+"intermediate/mapped_reads/median_insert.size", @@ -112,7 +116,8 @@ 'relocate2': RESULTS_DIR+"relocate2/", 'tepid': RESULTS_DIR+"tepid/", 'teflon': RESULTS_DIR+"teflon/", - 'jitterbug': RESULTS_DIR+"jitterbug/" + 'jitterbug': RESULTS_DIR+"jitterbug/", + 'tebreak': RESULTS_DIR+"tebreak/" } METHOD_DIR = "{{method}}" @@ -133,7 +138,8 @@ 'relocate2': METHOD_DIR+SAMPLE_NAME+"_relocate2_nonredundant.bed", 'tepid': METHOD_DIR+SAMPLE_NAME+"_tepid_nonredundant.bed", 'teflon': METHOD_DIR+SAMPLE_NAME+"_teflon_nonredundant.bed", - 'jitterbug': METHOD_DIR+SAMPLE_NAME+"_jitterbug_nonredundant.bed" + 'jitterbug': METHOD_DIR+SAMPLE_NAME+"_jitterbug_nonredundant.bed", + 'tebreak': METHOD_DIR+SAMPLE_NAME+"_tebreak_nonredundant.bed" } ESSENTIAL_PATHS = { @@ -257,5 +263,11 @@ METHOD_DIR+SAMPLE_NAME+"_jitterbug_redundant.bed", METHOD_DIR+SAMPLE_NAME+"_jitterbug_malformed.bed", METHOD_DIR+"unfiltered/"+SAMPLE_NAME+".TE_insertions_paired_clusters.filtered.gff3" + ], + + 'tebreak': [ + METHOD_DIR+SAMPLE_NAME+"_tebreak_nonredundant.bed", + METHOD_DIR+SAMPLE_NAME+"_tebreak_redundant.bed", + METHOD_DIR+"unfiltered/"+SAMPLE_NAME+".tebreak.table.txt" ] } \ No newline at end of file diff --git a/scripts/output.py b/scripts/output.py index e03e774..da807cb 100644 --- a/scripts/output.py +++ b/scripts/output.py @@ -192,6 +192,23 @@ def __init__(self): self.redundancy_filter = RedundancyFilter(["presence_reads"], "value") +class Tebreak: + def __init__(self): + self.support = { + "five_p_elt_match" : Info("FIVE_P_ELT_MATCH", "Fraction of bases matched to reference for inserted sequence on insertion seqment of 5' supporting contig", 0.0, "Float"), + "three_p_elt_match" : Info("THREE_P_ELT_MATCH", "Fraction of bases matched to reference for inserted sequence on insertion seqment of 3' supporting contig", 0.0, "Float"), + "five_p_genome_match" : Info("FIVE_P_GENOME_MATCH", "Fraction of bases matched to reference genome on genomic segment of 5' supporting contig", 0.0, "Float"), + "three_p_genome_match" : Info("THREE_P_GENOME_MATCH", "Fraction of bases matched to reference genome on genomic segment of 3' supporting contig", 0.0, "Float"), + "split_reads_5prime" : Info("SPLIT_READS_5PRIME", "Number of split reads supporting 5' end of the insertion", 0, "Integer"), + "split_reads_3prime" : Info("SPLIT_READS_3PRIME", "Number of split reads supporting 3' end of the insertion", 0, "Integer"), + "remapped_discordant" : Info("REMAPPED_DISCORDANT", "Number of discordant read ends re-mappable to insertion reference sequence", 0, "Integer"), + "remap_disc_fraction" : Info("REMAP_DISC_FRACTION", "The proportion of remapped discordant reads mapping to the reference insertion sequence", 0.0, "Float"), + "remapped_splitreads" : Info("REMAPPED_SPLITREADS", "Number of split reads re-mappable to insertion reference sequence", 0, "Integer"), + "remap_split_fraction" : Info("REMAP_SPLIT_FRACTION", "The proportion of remapped split reads mapping to the reference insertion sequence", 0.0, "Float") + } + + self.redundancy_filter = RedundancyFilter(["split_reads_5prime", "split_reads_3prime"], "sum") + ################################################# ## TODO convert to proper format for VCF creation class Tepid: diff --git a/scripts/preprocessing/map_reads.py b/scripts/preprocessing/map_reads.py index ae18f45..dde34f8 100644 --- a/scripts/preprocessing/map_reads.py +++ b/scripts/preprocessing/map_reads.py @@ -20,7 +20,7 @@ def main(): if eval(snakemake.config['args']['save_comments']): command.append("-C") - command += ["-t", str(snakemake.threads), "-R", "@RG\\tID:"+snakemake.params.sample+"\\tSM:"+snakemake.params.sample, snakemake.input.ref, snakemake.input.fq1] + command += ["-t", str(snakemake.threads), "-M", "-R", "@RG\\tID:"+snakemake.params.sample+"\\tSM:"+snakemake.params.sample, snakemake.input.ref, snakemake.input.fq1] if snakemake.config['in']['fq2'] != "None": command.append(snakemake.input.fq2) diff --git a/scripts/preprocessing/repeatmask.py b/scripts/preprocessing/repeatmask.py index ff572ca..ede21b1 100644 --- a/scripts/preprocessing/repeatmask.py +++ b/scripts/preprocessing/repeatmask.py @@ -17,9 +17,9 @@ def main(): output = snakemake.output.rm_out - mccutils.log("processing","Running RepeatMasker for RelocaTE2", log=log) + mccutils.log("processing","Running RepeatMasker", log=log) run_repeatmasker(reference, ref_name, te_seqs, threads, log, output, out_dir) - mccutils.log("processing","Repeatmasker for RelocaTE2 complete") + mccutils.log("processing","Repeatmasker complete") diff --git a/scripts/preprocessing/sam_to_bam.py b/scripts/preprocessing/sam_to_bam.py index f8ef6a4..9152439 100644 --- a/scripts/preprocessing/sam_to_bam.py +++ b/scripts/preprocessing/sam_to_bam.py @@ -29,9 +29,9 @@ def main(): try: - command = ["samtools", "sort", "-@", str(snakemake.threads), snakemake.output.tmp_bam, snakemake.output.bam.replace(".bam", "")] + command = ["samtools", "sort", "-@", str(snakemake.threads), snakemake.output.tmp_bam, snakemake.output.tmp2_bam.replace(".bam", "")] mccutils.run_command(command, log=log) - mccutils.check_file_exists(snakemake.output.bam) + mccutils.check_file_exists(snakemake.output.tmp2_bam) except Exception as e: track = traceback.format_exc() @@ -39,6 +39,27 @@ def main(): print("ERROR...falied to sort the bam file using samtools sort...bam file:", snakemake.output.tmp_bam, file=sys.stderr) sys.exit(1) + try: + command = ["samtools", "index", snakemake.output.tmp2_bam] + mccutils.run_command(command, log=log) + + except Exception as e: + track = traceback.format_exc() + print(track, file=sys.stderr) + print("ERROR...falied to index the bam file using samtools index...bam file:", snakemake.output.tmp2_bam, file=sys.stderr) + sys.exit(1) + + try: + command = ['picard', "MarkDuplicates", "-I", snakemake.output.tmp2_bam, "-O", snakemake.output.bam, "-M", snakemake.output.metrics] + mccutils.run_command(command, log=log) + mccutils.check_file_exists(snakemake.output.bam) + + except Exception as e: + track = traceback.format_exc() + print(track, file=sys.stderr) + print("ERROR...falied to mark duplicates with picard for the bam file ...bam file:", snakemake.output.tmp2_bam, file=sys.stderr) + sys.exit(1) + try: command = ["samtools", "index", snakemake.output.bam] mccutils.run_command(command, log=log) diff --git a/scripts/tebreak/tebreak_post.py b/scripts/tebreak/tebreak_post.py new file mode 100644 index 0000000..dfd993f --- /dev/null +++ b/scripts/tebreak/tebreak_post.py @@ -0,0 +1,90 @@ +import os +import sys +import subprocess +import importlib.util as il +spec = il.spec_from_file_location("config", snakemake.params.config) +config = il.module_from_spec(spec) +sys.modules[spec.name] = config +spec.loader.exec_module(config) +sys.path.append(snakemake.config['args']['mcc_path']) +import scripts.mccutils as mccutils +import scripts.output as output + + +def main(): + mccutils.log("tebreak","running tebreak post processing") + tebreak_out = snakemake.input.tebreak_out + ref_fasta = snakemake.input.ref_fasta + + out_dir = snakemake.params.out_dir + ref_name = snakemake.params.ref_name + sample_name = snakemake.params.sample_name + chromosomes = snakemake.params.chromosomes.split(",") + status_log = snakemake.params.status_log + + prev_steps_succeeded = mccutils.check_status_file(status_log) + if prev_steps_succeeded: + insertions = read_insertions(tebreak_out, sample_name, chromosomes, config) + + if len(insertions) > 0: + insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="tebreak") + insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="tebreak") + output.write_vcf(insertions, ref_fasta, sample_name, "tebreak", out_dir) + else: + mccutils.run_command(["touch", out_dir+"/"+sample_name+"_tebreak_redundant.bed"]) + mccutils.run_command(["touch", out_dir+"/"+sample_name+"_tebreak_nonredundant.bed"]) + else: + mccutils.run_command(["touch", out_dir+"/"+sample_name+"_tebreak_redundant.bed"]) + mccutils.run_command(["touch", out_dir+"/"+sample_name+"_tebreak_nonredundant.bed"]) + + mccutils.log("tebreak","tebreak postprocessing complete") + + +def read_insertions(tebreak_out, sample_name, chromosomes, config): + insertions = [] + header = {} + with open(tebreak_out, "r") as inf: + for ln,line in enumerate(inf): + line = line.replace("\n","") + split_line = line.split("\t") + if ln == 0: + for x,val in enumerate(split_line): + header[val] = x + else: + insert = output.Insertion(output.Tebreak()) + insert.chromosome = split_line[header['Chromosome']] + insert.start = int(split_line[header['3_Prime_End']]) + insert.end = int(split_line[header['5_Prime_End']]) + insert.family = split_line[header['Superfamily']] + insert.type = "non-reference" + if split_line[header['Orient_5p']] == split_line[header['Orient_3p']]: + insert.strand = split_line[header['Orient_5p']] + else: + insert.strand = "." + + if insert.strand == "-": + tmp = insert.start + insert.start = insert.end + insert.end = tmp + + insert.support_info.support["five_p_elt_match"].value = float(split_line[header['5p_Elt_Match']]) + insert.support_info.support["three_p_elt_match"].value = float(split_line[header['3p_Elt_Match']]) + insert.support_info.support["five_p_genome_match"].value = float(split_line[header['5p_Genome_Match']]) + insert.support_info.support["three_p_genome_match"].value = float(split_line[header['3p_Genome_Match']]) + insert.support_info.support["split_reads_5prime"].value = float(split_line[header['Split_reads_5prime']]) + insert.support_info.support["split_reads_3prime"].value = float(split_line[header['Split_reads_3prime']]) + insert.support_info.support["remapped_discordant"].value = float(split_line[header['Remapped_Discordant']]) + insert.support_info.support["remap_disc_fraction"].value = float(split_line[header['Remap_Disc_Fraction']]) + insert.support_info.support["remapped_splitreads"].value = float(split_line[header['Remapped_Splitreads']]) + insert.support_info.support["remap_split_fraction"].value = float(split_line[header['Remap_Split_Fraction']]) + + insert.name = insert.family+"|non-reference|NA|"+sample_name+"|tebreak|sr|" + + if insert.chromosome in chromosomes: + insertions.append(insert) + + return insertions + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/tebreak/tebreak_run.py b/scripts/tebreak/tebreak_run.py new file mode 100644 index 0000000..c4a0ce2 --- /dev/null +++ b/scripts/tebreak/tebreak_run.py @@ -0,0 +1,84 @@ +import os +import sys +import subprocess +import traceback +import importlib.util as il +spec = il.spec_from_file_location("config", snakemake.params.config) +config = il.module_from_spec(spec) +sys.modules[spec.name] = config +spec.loader.exec_module(config) +sys.path.append(snakemake.config['args']['mcc_path']) +import scripts.mccutils as mccutils + + +def main(): + mccutils.log("tebreak","running tebreak") + consensus = snakemake.input.consensus_fasta + bam = snakemake.input.bam + reference = snakemake.input.ref_fasta + repeatmasker_out = snakemake.input.rm_out + + threads = snakemake.threads + + script_dir = snakemake.params.script_dir + out_dir = snakemake.params.out_dir + ref_name = snakemake.params.ref_name + sample_name = snakemake.params.sample_name + log = snakemake.params.log + status_log = snakemake.params.status_log + + try: + ref_tes = get_ref_tes(repeatmasker_out, out_dir) + run_tebreak(bam, consensus, reference, ref_tes, script_dir, out_dir, threads, log=log) + mccutils.check_file_exists(snakemake.output[0]) + except Exception as e: + track = traceback.format_exc() + print(track, file=sys.stderr) + with open(log,"a") as l: + print(track, file=l) + mccutils.log("tebreak","tebreak run failed") + with open(status_log,"w") as l: + l.write("FAILED\n") + + mccutils.run_command(["touch", snakemake.output[0]]) + + mccutils.log("tebreak","tebreak run complete") + +def get_ref_tes(rm_out, out): + ref_te_file = out+"/ref_tes.txt" + with open(rm_out, "r") as inf, open(ref_te_file, "w") as outf: + lines = [] + for ln,line in enumerate(inf): + if ln > 2: + parsed_line = [] + split_line = line.split(" ") + for val in split_line: + if val != "": + parsed_line.append(val) + lines.append(parsed_line) + + for split_line in lines: + chrom = split_line[4] + start = split_line[5] + end = split_line[6] + te_type = split_line[10] + te_fam = split_line[9] + strand = split_line[8] + + if strand == "C": + strand = "-" + + out_line = "\t".join([chrom, start, end, te_type, te_fam, strand]) + outf.write(out_line+"\n") + + return ref_te_file + +def run_tebreak(bam, consensus, reference, ref_tes, script_dir, out_dir, threads, log=None): + os.chdir(out_dir) + command = ["samtools", "index", bam] + mccutils.run_command(command, log=log) + command = [script_dir+"/tebreak", "-b", bam, '-r', reference, '-p', str(threads), '-d', ref_tes, '-i', consensus, "--min_chr_len", "0"] + mccutils.run_command(command, log=log) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/snakefiles/tebreak.snakefile b/snakefiles/tebreak.snakefile new file mode 100644 index 0000000..72d6da8 --- /dev/null +++ b/snakefiles/tebreak.snakefile @@ -0,0 +1,48 @@ +rule tebreak_run: + input: + consensus_fasta = config['mcc']['consensus'], + bam = config['mcc']['bam'], + ref_fasta = config['mcc']['reference'], + rm_out = config['mcc']['repeatmasker_out'] + + threads: config['args']['max_threads_per_rule'] + + conda: config['envs']['tebreak'] + + params: + script_dir = config['args']['mcc_path']+"/install/tools/tebreak/tebreak/", + out_dir = config['outdir']['tebreak']+"unfiltered/", + ref_name=config['args']['ref_name'], + sample_name=config['args']['sample_name'], + log = config['args']['log_dir']+"tebreak.log", + config = config['config']['tebreak']['files'][0], + status_log = config['status']['tebreak'] + + output: + config['outdir']['tebreak']+"unfiltered/"+config['args']['sample_name']+".sorted.tebreak.table.txt" + + script: + config['args']['mcc_path']+"/scripts/tebreak/tebreak_run.py" + +rule tebreak_post: + input: + tebreak_out = config['outdir']['tebreak']+"unfiltered/"+config['args']['sample_name']+".sorted.tebreak.table.txt", + ref_fasta = config['mcc']['reference'] + + threads: 1 + + conda: config['envs']['processing'] + + params: + out_dir = config['outdir']['tebreak'], + ref_name=config['args']['ref_name'], + sample_name=config['args']['sample_name'], + chromosomes = config['args']['chromosomes'], + config = config['config']['tebreak']['files'][1], + status_log = config['status']['tebreak'] + + output: + config['out']['tebreak'] + + script: + config['args']['mcc_path']+"/scripts/tebreak/tebreak_post.py" \ No newline at end of file From d400e7d385b76cbb9c07513ed51bddd890202253 Mon Sep 17 00:00:00 2001 From: pbasting Date: Thu, 13 May 2021 23:31:52 -0400 Subject: [PATCH 02/13] adding config files for tebreak --- config/tebreak/tebreak_post.py | 29 +++++++++++++++++++++++ config/tebreak/tebreak_run.py | 42 +++++++++++++++++++++++++++++++++ scripts/tebreak/tebreak_post.py | 14 ++++++++++- scripts/tebreak/tebreak_run.py | 13 +++++++--- 4 files changed, 94 insertions(+), 4 deletions(-) diff --git a/config/tebreak/tebreak_post.py b/config/tebreak/tebreak_post.py index e69de29..61e2afb 100644 --- a/config/tebreak/tebreak_post.py +++ b/config/tebreak/tebreak_post.py @@ -0,0 +1,29 @@ +# Minimum fraction of bases matched to reference for inserted sequence on insertion seqment of 5' supporting contig +MIN_5P_ELT_MATCH = 0.0 + +# Minimum fraction of bases matched to reference for inserted sequence on insertion seqment of 3' supporting contig +MIN_3P_ELT_MATCH = 0.0 + +# Minimum fraction of bases matched to reference genome on genomic segment of 5' supporting contig +MIN_5P_GENOME_MATCH = 0.0 + +# Minimum fraction of bases matched to reference genome on genomic segment of 3' supporting contig +MIN_3P_GENOME_MATCH = 0.0 + +# Minimum number of split reads supporting 5' end of the insertion +MIN_SPLIT_READS_5P = 0 + +# Minimum number of split reads supporting 3' end of the insertion +MIN_SPLIT_READS_3P = 0 + +# Minimum number of discordant read ends re-mappable to insertion reference sequence +MIN_REMAPPED_DISCORDANT = 0 + +# Minimum proportion of remapped discordant reads mapping to the reference insertion sequence +MIN_REMAP_DISC_FRACTION = 0.0 + +# Minimum number of split reads re-mappable to insertion reference sequence +MIN_REMAPPED_SPLITREADS = 0 + +# Minimum proportion of remapped split reads mapping to the reference insertion sequence +MIN_REMAP_SPLIT_FRACTION = 0.0 \ No newline at end of file diff --git a/config/tebreak/tebreak_run.py b/config/tebreak/tebreak_run.py index e69de29..d6fa9d6 100644 --- a/config/tebreak/tebreak_run.py +++ b/config/tebreak/tebreak_run.py @@ -0,0 +1,42 @@ +PARAMS = { + "--minMWP": "0.01", + "--min_minclip" : "3", + "--min_maxclip" : "10", + "--min_sr_per_break" : "1", + "--min_consensus_score" : "0.9", + "--min_chr_len" : "0", + "--max_ins_reads" : "1000", + "--min_split_reads" : "4", + "--min_prox_mapq" : "10", + "--max_N_consensus" : "4", + "--max_disc_fetch" : "50", + "--min_disc_reads" : "4", + "--sr_density" : "2.0", + "--min_ins_match" : "0.90", + "--min_ref_match" : "0.98", + "--min_cons_len" : "250", + "--keep_all_tmp_bams" : False, + "--skip_final_filter" : False, + "--debug": False +} + + +# --minMWP MINMWP minimum Mann-Whitney P-value for split qualities (default = 0.01) +# --min_minclip MIN_MINCLIP min. shortest clipped bases per cluster (default = 3) +# --min_maxclip MIN_MAXCLIP min. longest clipped bases per cluster (default = 10) +# --min_sr_per_break MIN_SR_PER_BREAK minimum split reads per breakend (default = 1) +# --min_consensus_score MIN_CONSENSUS_SCORE quality of consensus alignment (default = 0.9) +# --min_chr_len MIN_CHR_LEN minimum chromosome length to consider in discordant site search (default = 0) +# --max_ins_reads MAX_INS_READS maximum number of reads to use per insertion call (default = 1000) +# --min_split_reads MIN_SPLIT_READS minimum total split reads per insertion call (default = 4) +# --min_prox_mapq MIN_PROX_MAPQ minimum map quality for proximal subread (default = 10) +# --max_N_consensus MAX_N_CONSENSUS exclude breakend seqs with > this number of N bases (default = 4) +# --max_disc_fetch MAX_DISC_FETCH maximum number of discordant reads to fetch per insertion site per BAM (default = 50; 0 = disable fetch) +# --min_disc_reads MIN_DISC_READS if using -d/--disco_target, minimum number of discordant reads to trigger a call (default = 4) +# --sr_density SR_DENSITY maximum split read density in chunk (default = 2.0) +# --min_ins_match MIN_INS_MATCH (output) minumum match to insertion library (default 0.90) +# --min_ref_match MIN_REF_MATCH (output) minimum match to reference genome (default 0.98) +# --min_cons_len MIN_CONS_LEN (output) min total consensus length (default=250) +# --keep_all_tmp_bams leave ALL temporary BAMs (warning: lots of files!) +# --skip_final_filter do not apply final filters or fix for orientation +# --debug \ No newline at end of file diff --git a/scripts/tebreak/tebreak_post.py b/scripts/tebreak/tebreak_post.py index dfd993f..72cbec1 100644 --- a/scripts/tebreak/tebreak_post.py +++ b/scripts/tebreak/tebreak_post.py @@ -80,7 +80,19 @@ def read_insertions(tebreak_out, sample_name, chromosomes, config): insert.name = insert.family+"|non-reference|NA|"+sample_name+"|tebreak|sr|" - if insert.chromosome in chromosomes: + if ( + insert.chromosome in chromosomes and + insert.support_info.support["five_p_elt_match"].value >= config.MIN_5P_ELT_MATCH and + insert.support_info.support["three_p_elt_match"].value >= config.MIN_3P_ELT_MATCH and + insert.support_info.support["five_p_genome_match"].value >= config.MIN_5P_GENOME_MATCH and + insert.support_info.support["three_p_genome_match"].value >= config.MIN_3P_GENOME_MATCH and + insert.support_info.support["split_reads_5prime"].value >= config.MIN_SPLIT_READS_5P and + insert.support_info.support["split_reads_3prime"].value >= config.MIN_SPLIT_READS_3P and + insert.support_info.support["remapped_discordant"].value >= config.MIN_REMAPPED_DISCORDANT and + insert.support_info.support["remap_disc_fraction"].value >= config.MIN_REMAP_DISC_FRACTION and + insert.support_info.support["remapped_splitreads"].value >= config.MIN_REMAPPED_SPLITREADS and + insert.support_info.support["remap_split_fraction"].value >= config.MIN_REMAP_SPLIT_FRACTION + ): insertions.append(insert) return insertions diff --git a/scripts/tebreak/tebreak_run.py b/scripts/tebreak/tebreak_run.py index c4a0ce2..2700cff 100644 --- a/scripts/tebreak/tebreak_run.py +++ b/scripts/tebreak/tebreak_run.py @@ -29,7 +29,7 @@ def main(): try: ref_tes = get_ref_tes(repeatmasker_out, out_dir) - run_tebreak(bam, consensus, reference, ref_tes, script_dir, out_dir, threads, log=log) + run_tebreak(bam, consensus, reference, ref_tes, script_dir, out_dir, threads, config.PARAMS, log=log) mccutils.check_file_exists(snakemake.output[0]) except Exception as e: track = traceback.format_exc() @@ -73,11 +73,18 @@ def get_ref_tes(rm_out, out): return ref_te_file -def run_tebreak(bam, consensus, reference, ref_tes, script_dir, out_dir, threads, log=None): +def run_tebreak(bam, consensus, reference, ref_tes, script_dir, out_dir, threads, params, log=None): os.chdir(out_dir) command = ["samtools", "index", bam] mccutils.run_command(command, log=log) - command = [script_dir+"/tebreak", "-b", bam, '-r', reference, '-p', str(threads), '-d', ref_tes, '-i', consensus, "--min_chr_len", "0"] + command = [script_dir+"/tebreak", "-b", bam, '-r', reference, '-p', str(threads), '-d', ref_tes, '-i', consensus] + for key in params.keys(): + if params[key] != False: + if params[key] == True: + command.append(key) + else: + command.append(key) + command.append(params[key]) mccutils.run_command(command, log=log) if __name__ == "__main__": From 6ebf290ed68769f9e58cae3c9d5034bf6f4fb926 Mon Sep 17 00:00:00 2001 From: pbasting Date: Wed, 19 May 2021 17:28:30 -0400 Subject: [PATCH 03/13] updating config files to follow a standard formatting --- config/coverage/coverage.py | 19 +++-- config/ngs_te_mapper/ngs_te_mapper_post.py | 5 +- config/ngs_te_mapper/ngs_te_mapper_run.py | 7 +- config/ngs_te_mapper2/ngs_te_mapper2_run.py | 28 ++++--- config/popoolationte/popoolationte_post.py | 12 +-- config/popoolationte/popoolationte_run.py | 51 +++++------- config/popoolationte2/popoolationte2_post.py | 11 +-- config/popoolationte2/popoolationte2_run.py | 64 ++++++++------- config/relocate/relocate_post.py | 11 +-- config/relocate/relocate_run.py | 12 +-- config/relocate2/relocate2_post.py | 18 +++-- config/relocate2/relocate2_run.py | 16 ++-- config/retroseq/retroseq_post.py | 17 ++-- config/retroseq/retroseq_run.py | 12 +-- config/teflon/teflon_post.py | 17 ++-- config/teflon/teflon_run.py | 23 +++--- config/telocate/telocate_post.py | 6 +- config/telocate/telocate_run.py | 19 +++-- config/temp/temp_post.py | 11 +-- config/temp/temp_run.py | 22 ++---- config/temp2/temp2_post.py | 11 +-- config/temp2/temp2_run.py | 31 ++++---- config/trimgalore/trimgalore.py | 13 +++- docs/getting_started/quick_start.rst | 60 +++++++++++++++ docs/index.rst | 21 +---- docs/methods/PoPoolationTE.rst | 8 -- docs/methods/PoPoolationTE2.rst | 5 -- docs/methods/RelocaTE.rst | 4 - docs/methods/RelocaTE2.rst | 5 -- docs/methods/RetroSeq.rst | 6 -- docs/methods/TE-locate.rst | 5 -- docs/methods/TEFLoN.rst | 5 -- docs/methods/TEMP.rst | 5 -- docs/methods/coverage.rst | 5 -- docs/methods/mapreads.rst | 5 -- docs/methods/ngs_te_mapper.rst | 5 -- docs/methods/trimgalore.rst | 5 -- docs/running/config.rst | 39 ++++++++++ scripts/TEMP/temp_post.py | 2 +- scripts/TEMP/temp_run.py | 6 +- scripts/coverage/coverage.py | 6 +- scripts/ngs_te_mapper/ngs_te_mapper_post.py | 2 +- scripts/ngs_te_mapper/ngs_te_mapper_run.py | 2 +- scripts/ngs_te_mapper2/ngs_te_mapper2_run.py | 11 ++- scripts/popoolationte/popoolationte_post.py | 2 +- scripts/popoolationte/popoolationte_run.py | 55 +++++++------ scripts/popoolationte2/popoolationte2_post.py | 2 +- scripts/popoolationte2/popoolationte2_run.py | 77 +++++++++++-------- scripts/preprocessing/setup_reads.py | 17 ++-- scripts/relocaTE/relocate_post.py | 10 ++- scripts/relocaTE/relocate_run.py | 10 +-- scripts/relocaTE2/relocate2_post.py | 16 ++-- scripts/relocaTE2/relocate2_run.py | 9 +-- scripts/retroseq/retroseq_post.py | 2 +- scripts/retroseq/retroseq_run.py | 26 ++++++- scripts/teflon/teflon_post.py | 10 +-- scripts/teflon/teflon_run.py | 14 ++-- scripts/telocate/telocate_post.py | 2 +- scripts/telocate/telocate_run.py | 4 +- scripts/temp2/temp2_post.py | 4 +- scripts/temp2/temp2_run.py | 27 +++---- 61 files changed, 507 insertions(+), 428 deletions(-) create mode 100644 docs/getting_started/quick_start.rst delete mode 100644 docs/methods/PoPoolationTE.rst delete mode 100644 docs/methods/PoPoolationTE2.rst delete mode 100644 docs/methods/RelocaTE.rst delete mode 100644 docs/methods/RelocaTE2.rst delete mode 100644 docs/methods/RetroSeq.rst delete mode 100644 docs/methods/TE-locate.rst delete mode 100644 docs/methods/TEFLoN.rst delete mode 100644 docs/methods/TEMP.rst delete mode 100644 docs/methods/coverage.rst delete mode 100644 docs/methods/mapreads.rst delete mode 100644 docs/methods/ngs_te_mapper.rst delete mode 100644 docs/methods/trimgalore.rst diff --git a/config/coverage/coverage.py b/config/coverage/coverage.py index 36a8045..15b0687 100644 --- a/config/coverage/coverage.py +++ b/config/coverage/coverage.py @@ -1,10 +1,9 @@ - -# omits the edges of the TE sequence coverage when calculating average depth across the element -OMIT_EDGES = True - -# If OMIT_EDGES = True, use the read length as the length of the edges to omit -OMIT_EDGES_READ_LENGTH = True - -# IF OMIT_EDGES = True and OMIT_EDGES_READ_LENGTH = False -# use this value as the length of the edges to omit -OMIT_EDGES_LENGTH = 300 +PARAMS = { + # omits the edges of the TE sequence coverage when calculating average depth across the element + "omit_edges": True, + # If OMIT_EDGES = True, use the read length as the length of the edges to omit + "omit_edges_read_length" : True, + # IF OMIT_EDGES = True and OMIT_EDGES_READ_LENGTH = False + # use this value as the length of the edges to omit + "omit_edges_length" : 300 +} diff --git a/config/ngs_te_mapper/ngs_te_mapper_post.py b/config/ngs_te_mapper/ngs_te_mapper_post.py index 94feca1..fb5bec2 100644 --- a/config/ngs_te_mapper/ngs_te_mapper_post.py +++ b/config/ngs_te_mapper/ngs_te_mapper_post.py @@ -1,2 +1,3 @@ - -MIN_READ_SUPPORT = 0 \ No newline at end of file +PARAMS = { + "min_read_support" : 0 +} diff --git a/config/ngs_te_mapper/ngs_te_mapper_run.py b/config/ngs_te_mapper/ngs_te_mapper_run.py index f755379..4638df5 100644 --- a/config/ngs_te_mapper/ngs_te_mapper_run.py +++ b/config/ngs_te_mapper/ngs_te_mapper_run.py @@ -1,3 +1,4 @@ - -# max length of junction read overlap to consider a target site duplication -MAX_TSD = 20 \ No newline at end of file +PARAMS = { + # max length of junction read overlap to consider a target site duplication + "tsd=" : 20 +} diff --git a/config/ngs_te_mapper2/ngs_te_mapper2_run.py b/config/ngs_te_mapper2/ngs_te_mapper2_run.py index 274f823..9103a80 100644 --- a/config/ngs_te_mapper2/ngs_te_mapper2_run.py +++ b/config/ngs_te_mapper2/ngs_te_mapper2_run.py @@ -1,17 +1,15 @@ - -# merge window for identifying TE clusters -WINDOW = 10 - -# minimum mapping quality of alignment -MIN_MAPQ = 20 - -# minimum allele frequency -MIN_ALLELE_FREQUENCY = 0.1 - -# max length of junction read overlap to consider a target site duplication -MAX_TSD = 25 - -# maximum gap size -MAX_GAP = 5 +PARAMS = { + "--window" : 10, + "--min_mapq" : 20, + "--min_af" : 0.1, + "--tsd_max" : 25, + "--gap_max" : 5 +} + +# --window WINDOW merge window for identifying TE clusters (default = 10) +# --min_mapq MIN_MAPQ minimum mapping quality of alignment (default = 20) +# --min_af MIN_AF minimum allele frequency (default = 0.1) +# --tsd_max TSD_MAX maximum TSD size (default = 25) +# --gap_max GAP_MAX maximum gap size (default = 5) diff --git a/config/popoolationte/popoolationte_post.py b/config/popoolationte/popoolationte_post.py index f421a86..e24f433 100644 --- a/config/popoolationte/popoolationte_post.py +++ b/config/popoolationte/popoolationte_post.py @@ -1,6 +1,6 @@ - -# requires that final results have support on both ends of prediction -REQUIRE_BOTH_END_SUPPORT=True - -# threshold for the minimum acceptable fraction of the reads supporting the prediction -PERCENT_READ_SUPPORT_THRESHOLD=0.1 \ No newline at end of file +PARAMS = { + # requires that final results have support on both ends of prediction + "require_both_end_support" : True, + # threshold for the minimum acceptable fraction of the reads supporting the prediction + "percent_read_support_threshold" : 0.1 +} \ No newline at end of file diff --git a/config/popoolationte/popoolationte_run.py b/config/popoolationte/popoolationte_run.py index 18ff8f4..306b44f 100644 --- a/config/popoolationte/popoolationte_run.py +++ b/config/popoolationte/popoolationte_run.py @@ -1,3 +1,23 @@ + +PARAMS = { + "identify-te-insertsites.pl" : { + "--min-count" : 3, + "--min-map-qual" : 15 + }, + "crosslink-te-sites.pl" : { + "--single-site-shift": 100 + }, + "update-teinserts-with-knowntes.pl" : { + "--single-site-shift": 100 + }, + "estimate-polymorphism.pl" : { + "--min-map-qual": 15 + }, + "filter-teinserts.pl" : { + "--min-count": 5 + } +} + ''' NAME perl identify-te-insertsites.pl - Identifies TE insertion sites (forward @@ -11,27 +31,7 @@ --min-map-qual the minimum mapping quality; this will only apply to reads mapping to a reference contig. -''' -IDENTIFY_TE_INSERTSITES = { - "min-count" : 3, - "min-map-qual" : 15 -} - - - - -CROSSLINK_TE_SITES = { - "single-site-shift": 100 -} - - -UPDATE_TEINSERTS_WITH_KNOWNTES = { - "single-site-shift": 100 -} - - -''' NAME perl estimate-polymorphism.pl - Estimte the insertion frequencies for a given set of TE insertions @@ -43,13 +43,4 @@ --min-map-qual the minimum mapping quality -''' - -ESTIMATE_POLYMORPHISM = { - "min-map-qual": 15 -} - - -FILTER = { - "min-count": 5 -} \ No newline at end of file +''' \ No newline at end of file diff --git a/config/popoolationte2/popoolationte2_post.py b/config/popoolationte2/popoolationte2_post.py index e077e47..477755e 100644 --- a/config/popoolationte2/popoolationte2_post.py +++ b/config/popoolationte2/popoolationte2_post.py @@ -1,6 +1,7 @@ +PARAMS = { + # require that the TE prediction have support from both junctions + "require_both_end_support" : True, -# require that the TE prediction have support from both junctions -REQUIRE_BOTH_END_SUPPORT = True - -# threshold for the minimum acceptable (average physical coverage supporting the insertion of the given TE) / (average physical coverage) -FREQUENCY_THRESHOLD=0.1 \ No newline at end of file + # threshold for the minimum acceptable (average physical coverage supporting the insertion of the given TE) / (average physical coverage) + "frequency_threshold" : 0.1 +} \ No newline at end of file diff --git a/config/popoolationte2/popoolationte2_run.py b/config/popoolationte2/popoolationte2_run.py index af718b8..8097a69 100644 --- a/config/popoolationte2/popoolationte2_run.py +++ b/config/popoolationte2/popoolationte2_run.py @@ -1,4 +1,39 @@ +PARAMS = { + "ppileup" : { + "--map-qual": 15, + "--sr-mindist" : 10000, + "--id-up-quant": 0.01 + }, + + "subsampleppileup" : { + "run" : False, + "--target-coverage": 100, + "--with-replace": False + }, + + "identifySignatures" : { + "--min-count": 2.0, + "--signature-window": "median", + "--min-valley": "median", + "--chunk-distance": 5 + }, + + "updateStrand" : { + "--map-qual": 15, + "--max-disagreement": 0.1, + "--sr-mindist": 10000, + "--id-up-quant": 0.01 + }, + + "pairupSignatures" : { + "--min-distance": -100, + "--max-distance": 500, + "--max-freq-diff": 1.0 + } + +} + #ppileup # create a physical pileup file from one or multiple bam files @@ -11,11 +46,6 @@ # --id-up-quant paired end fragments with an insert size in the upper # quantile will be ignored [fraction]; default=0.01 -ppileup = { - "map-qual": 15, - "sr-mindist" : 10000, - "id-up-quant": 0.01 -} # subsampleppileup @@ -28,11 +58,7 @@ # == Parameters for fine tuning == # --with-replace use sampling with replacement instead of without replacement -subsampleppileup = { - "run" : False, - "target-coverage": 100, - "with-replace": False -} + # identifySignatures @@ -51,12 +77,7 @@ # --chunk-distance minimum distance between chromosomal chunks in multiples # of --min-valley [int]; default=5 -identifySignatures = { - "min-count": 2.0, - "signature-window": "median", - "min-valley": "median", - "chunk-distance": 5 -} + # updateStrand @@ -73,12 +94,6 @@ # --id-up-quant paired end fragments with an insert size in the upper # quantile will be ignored [fraction]; default=0.01 -updateStrand = { - "map-qual": 15, - "max-disagreement": 0.1, - "sr-mindist": 10000, - "id-up-quant": 0.01 -} # pairupSignatures @@ -92,8 +107,3 @@ # --max-freq-diff the maximum frequency difference between signatures; # default=1.0 -pairupSignatures = { - "min-distance": -100, - "max-distance": 500, - "max-freq-diff": 1.0 -} diff --git a/config/relocate/relocate_post.py b/config/relocate/relocate_post.py index 41b2f93..be2b462 100644 --- a/config/relocate/relocate_post.py +++ b/config/relocate/relocate_post.py @@ -1,5 +1,6 @@ -REF_LEFT_THRESHOLD = 0 -REF_RIGHT_THRESHOLD = 0 - -NONREF_LEFT_THRESHOLD = 0 -NONREF_RIGHT_THRESHOLD = 0 \ No newline at end of file +PARAMS = { + "ref_left_threshold" : 0, + "ref_right_threshold" : 0, + "nonref_left_threshold" : 0, + "nonref_right_threshold" : 0 +} \ No newline at end of file diff --git a/config/relocate/relocate_run.py b/config/relocate/relocate_run.py index 7b2e9a1..2173d29 100644 --- a/config/relocate/relocate_run.py +++ b/config/relocate/relocate_run.py @@ -53,10 +53,10 @@ ''' -RELOCATE = { - 'l' : 10, - 'm' : 0.0, - 'bm' : 10, - 'bt' : 7, - 'f' : 100 +PARAMS = { + '-l' : 10, + '-m' : 0.0, + '-bm' : 10, + '-bt' : 7, + '-f' : 100 } \ No newline at end of file diff --git a/config/relocate2/relocate2_post.py b/config/relocate2/relocate2_post.py index 234a45f..bddbe93 100644 --- a/config/relocate2/relocate2_post.py +++ b/config/relocate2/relocate2_post.py @@ -1,9 +1,11 @@ -REF_LEFT_SUPPORT_THRESHOLD = 0 -REF_RIGHT_SUPPORT_THRESHOLD = 0 -REF_LEFT_JUNCTION_THRESHOLD = 0 -REF_RIGHT_JUNCTION_THRESHOLD = 0 +PARAMS = { + "ref_left_support_threshold" : 0, + "ref_right_support_threshold" : 0, + "ref_left_junction_threshold" : 0, + "ref_right_junction_threshold" : 0, -NONREF_LEFT_SUPPORT_THRESHOLD = 0 -NONREF_RIGHT_SUPPORT_THRESHOLD = 0 -NONREF_LEFT_JUNCTION_THRESHOLD = 0 -NONREF_RIGHT_JUNCTION_THRESHOLD = 0 \ No newline at end of file + "nonref_left_support_threshold" : 0, + "nonref_right_support_threshold" : 0, + "nonref_left_junction_threshold" : 0, + "nonref_right_junction_threshold" : 0 +} \ No newline at end of file diff --git a/config/relocate2/relocate2_run.py b/config/relocate2/relocate2_run.py index 18ba0a4..c90d960 100644 --- a/config/relocate2/relocate2_run.py +++ b/config/relocate2/relocate2_run.py @@ -1,3 +1,10 @@ +PARAMS = { + '--aligner' : "blat", + '--len_cut_match' : 10, + '--len_cut_trim' : 10, + '--mismatch' : 2, + '--mismatch_junction' : 2 +} ''' python scripts/relocaTE2.py --help @@ -28,12 +35,3 @@ ''' - - -RELOCATE2 = { - 'aligner' : "blat", - 'len_cut_match' : 10, - 'len_cut_trim' : 10, - 'mismatch' : 2, - 'mismatch_junction' : 2 -} \ No newline at end of file diff --git a/config/retroseq/retroseq_post.py b/config/retroseq/retroseq_post.py index 82f1116..0b0d4d3 100644 --- a/config/retroseq/retroseq_post.py +++ b/config/retroseq/retroseq_post.py @@ -1,11 +1,12 @@ # https://github.com/tk2/RetroSeq/wiki/RetroSeq-Tutorial -''' -Number of correctly mapped read pairs spanning breakpoint -''' -READ_SUPPORT_THRESHOLD = 0 +PARAMS = { + "read_support_threshold" : 0, + "breakpoint_confidence_threshold" : 6 +} ''' -The FL tag ranges from 1-8 and gives information on the breakpoint with 8 being the most confident calls and -lower values indicating calls that don’t meet the breakpoint criteria for reasons such as lack of 5’ or 3’ reads -''' -BREAKPOINT_CONFIDENCE_THRESHOLD = 6 \ No newline at end of file +read_support_threshold Number of correctly mapped read pairs spanning breakpoint + +breakpoint_confidence_threshold The FL tag ranges from 1-8 and gives information on the breakpoint with 8 being the most confident calls and + lower values indicating calls that don’t meet the breakpoint criteria for reasons such as lack of 5’ or 3’ reads +''' \ No newline at end of file diff --git a/config/retroseq/retroseq_run.py b/config/retroseq/retroseq_run.py index a1b622c..556cad9 100644 --- a/config/retroseq/retroseq_run.py +++ b/config/retroseq/retroseq_run.py @@ -1,11 +1,11 @@ +PARAMS = { + "-depth" : 200, + "-reads" : 10, + "-q": 20 +} + ''' [-depth Max average depth of a region to be considered for calling. Default is 200.] [-reads It is the minimum number of reads required to make a call. Default is 5. Parameter is optional.] [-q Minimum mapping quality for a read mate that anchors the insertion call. Default is 30. Parameter is optional.] ''' - -PARAMETERS = { - "depth" : 200, - "reads" : 10, - "q": 20 -} \ No newline at end of file diff --git a/config/teflon/teflon_post.py b/config/teflon/teflon_post.py index 8636aa6..b50e4ca 100644 --- a/config/teflon/teflon_post.py +++ b/config/teflon/teflon_post.py @@ -1,15 +1,16 @@ -''' -C10: read count for "presence reads" -C11: read count for "absence reads" -C13: genotype for every TE (allele frequency for pooled data, present/absent for haploid, present/absent/heterozygous for diploid) #Note: haploid/diploid caller is under construction, use "pooled" for presence/absence read counts -''' - -PARAMETERS = { +PARAMS = { "min_presence_reads" : 3, "max_absence_reads" : None, "min_presence_fraction" : 0.1, # presence/(absence+presence), "require_tsd" : True, # non-ref predictions must have a target site duplication "require_both_breakpoints" : True # non-ref predictions must have both breakpoints predicted -} \ No newline at end of file +} + +''' +C10: read count for "presence reads" +C11: read count for "absence reads" +C13: genotype for every TE (allele frequency for pooled data, present/absent for haploid, present/absent/heterozygous for diploid) #Note: haploid/diploid caller is under construction, use "pooled" for presence/absence read counts +''' + diff --git a/config/teflon/teflon_run.py b/config/teflon/teflon_run.py index 5e1c5da..0d0659a 100644 --- a/config/teflon/teflon_run.py +++ b/config/teflon/teflon_run.py @@ -1,3 +1,15 @@ + +# When params: sd, cov, or ht are set to None, they are determined by TEFLoN, Can be set to a specific INT value +PARAMS = { + "-q": 20, + "-sd" : None, + "-cov" : None, + "-n1" : 1, + "-n2" : 1, + "-lt" : 1, + "-ht" : None +} + ''' usage: python usr/local/teflon.v0.4.py [optional] -q #NOTE: Mapped reads with map qualities lower than this number will be discarded @@ -17,14 +29,3 @@ -lt [sites genotyped as -9 if adjusted read counts lower than this threshold, default=1] -ht [sites genotyped as -9 if adjusted read counts higher than this threshold, default=mean_coverage + 2*STDEV] ''' - -# When params: sd, cov, or ht are set to None, they are determined by TEFLoN, Can be set to a specific INT value -PARAMETERS = { - "q": 20, - "sd" : None, - "cov" : None, - "n1" : 1, - "n2" : 1, - "lt" : 1, - "ht" : None -} diff --git a/config/telocate/telocate_post.py b/config/telocate/telocate_post.py index d7bd58c..7be6453 100644 --- a/config/telocate/telocate_post.py +++ b/config/telocate/telocate_post.py @@ -1,3 +1,3 @@ - - -READ_PAIR_SUPPORT_THRESHOLD = 0 \ No newline at end of file +PARAMS = { + "read_pair_support_threshold" : 0 +} \ No newline at end of file diff --git a/config/telocate/telocate_run.py b/config/telocate/telocate_run.py index d9d2886..71bb76a 100644 --- a/config/telocate/telocate_run.py +++ b/config/telocate/telocate_run.py @@ -1,12 +1,15 @@ -# Max memory to use (GB) -MAX_MEM = 4 +PARAMS = { + # Max memory to use (GB) + "max_mem" : 4, -# Multiplied by median insert size -MIN_DISTANCE = 5 + # Multiplied by median insert size + "min_distance" : 5, -# minimal supporting reads -MIN_SUPPORT_READS = 3 + # minimal supporting reads + "min_support_reads" : 3, + + # minimal supporting individuals + "min_support_individuals" : 1 +} -# minimal supporting individuals -MIN_SUPPORT_INDIVIDUALS = 1 diff --git a/config/temp/temp_post.py b/config/temp/temp_post.py index fd6719d..6d98c71 100644 --- a/config/temp/temp_post.py +++ b/config/temp/temp_post.py @@ -1,6 +1,7 @@ - # post processing constants for TEMP module - -ACCEPTABLE_INSERTION_SUPPORT_CLASSES = ["1p1"] - -FREQUENCY_THRESHOLD = 0.1 \ No newline at end of file +PARAMS = { + # valid options: 1p1, 2p, singleton + "acceptable_insertion_support_classes" : ["1p1"], + # minimum frequency of the inserted transposon + "frequency_threshold" : 0.1 +} \ No newline at end of file diff --git a/config/temp/temp_run.py b/config/temp/temp_run.py index b943535..a767215 100644 --- a/config/temp/temp_run.py +++ b/config/temp/temp_run.py @@ -1,22 +1,12 @@ # TEMP_Insertion.sh +PARAMS = { + '-x' : 30, + '-m' : 1 +} + ''' Options: - -i Input file in bam format with full path. Please sort and index the file before calling this program. - Sorting and indexing can be done by 'samtools sort' and 'samtools index' - -s Directory where all the scripts are - -o Path to output directory. Default is current directory - -r Transposon sequence database in fasta format with full path - -t Annotated TEs in BED6 format with full path. Detected insertions that overlap with annoated TEs will be filtered. - -u TE families annotations. If supplied detected insertions overlap with annotated TE of the same family will be filtered. Only use with -t. -m Number of mismatch allowed when mapping to TE concensus sequences. Default is 3 -x The minimum score difference between the best hit and the second best hit for considering a read as uniquely mapped. For BWA mem. - -f An integer specifying the length of the fragments (inserts) of the library. Default is 500 - -c An integer specifying the number of CUPs used. Default is 8 - -h Show help message -''' - -TEMP_Insertion = { - 'x' : 30, - 'm' : 1 -} +''' \ No newline at end of file diff --git a/config/temp2/temp2_post.py b/config/temp2/temp2_post.py index 2b9ffe0..d6dd030 100644 --- a/config/temp2/temp2_post.py +++ b/config/temp2/temp2_post.py @@ -1,5 +1,6 @@ -# valid options: 1p1, 2p, singleton -ACCEPTABLE_INSERTION_SUPPORT_CLASSES = ["1p1"] - -# minimum frequency of the inserted transposon. It generally means what fraction of sequenced genome present this insertion. -FREQUENCY_THRESHOLD = 0.1 \ No newline at end of file +PARAMS = { + # valid options: 1p1, 2p, singleton + "acceptable_insertion_support_classes" : ["1p1"], + # minimum frequency of the inserted transposon. It generally means what fraction of sequenced genome present this insertion. + "frequency_threshold" : 0.1 +} \ No newline at end of file diff --git a/config/temp2/temp2_run.py b/config/temp2/temp2_run.py index 7d47f75..1e567bd 100644 --- a/config/temp2/temp2_run.py +++ b/config/temp2/temp2_run.py @@ -1,33 +1,34 @@ +PARAMS ={ + "insertion" : { + "-M" : 2, + "-m" : 5, + "-U" : 0.8, + "-N" : 300, + "-T" : False, + "-L" : False, + "-S" : False + }, + + "absence" : { + "-x" : 0 + } +} ############################## # TEMP2 INSERTION PARAMETERS # ############################## # -M mismatch% Percentage of mismatch allowed when anchor to genome. Default is 2. -GENOME_MISMATCH_PCT = 2 - # -m mismatch% Percentage of mismatch allowed when mapping to TEs. Default is 5. -TE_MISMATCH_PCT = 5 - # -U ratio The ratio between the second best alignment and the best alignment to judge if a read is uniquely mapped. Default is 0.8. -RATIO = 0.8 - # -N reference_filter_window window sizea (+-n) for filtering insertions overlapping reference insertions. Default is 300. -FILTER_WINDOW = 300 - # -T Set this parameter to allow truncated de novo insertions; For default, only full-length de novo insertions are allowed. -TRUNCATED = False - # -L Set this parameter to use a looser criteria to filter reference annotated copy overlapped insertions; Default not allowed. -LOOSE_FILTER = False - # -S Set this parameter to skip insertion length checking; Default is to remove those insertions that are not full length of shorter than 500bp. -SKIP_INS_LEN_CHECK = False ############################ # TEMP2 ABSENCE PARAMETERS # ############################ -# -x The minimum score difference between the best hit and the second best hit for considering a read as uniquely mapped. For BWA MEM. -UNIQ_MAP_SCORE = 0 \ No newline at end of file +# -x The minimum score difference between the best hit and the second best hit for considering a read as uniquely mapped. For BWA MEM. \ No newline at end of file diff --git a/config/trimgalore/trimgalore.py b/config/trimgalore/trimgalore.py index a224069..4e392a4 100644 --- a/config/trimgalore/trimgalore.py +++ b/config/trimgalore/trimgalore.py @@ -1,3 +1,12 @@ # options used for adapter trimming with the trimgalore module -SINGLE_END_FLAGS = ["--fastqc"] -PAIRED_END_FLAGS = ["--paired", "--fastqc"] +PARAMS = { + "single_end": { + "--fastqc": True, + }, + + "paired_end": { + "--fastqc": True, + "--paired": True + } +} + diff --git a/docs/getting_started/quick_start.rst b/docs/getting_started/quick_start.rst new file mode 100644 index 0000000..79392ae --- /dev/null +++ b/docs/getting_started/quick_start.rst @@ -0,0 +1,60 @@ +========== +McClintock +========== + +* `McClintock Github Repository `_ + +Many methods have been developed to detect transposable element (TE) insertions from whole genome shotgun next-generation sequencing (NGS) data, each of which has different dependencies, run interfaces, and output formats. Here, we have developed a meta-pipeline to install and run multiple methods for detecting TE insertions in NGS data, which generates output in the UCSC Browser extensible data (BED) format as well as the Variant Call Format (VCF). + +.. seealso:: + + `Nelson, Linheiro and Bergman (2017) G3 7:2763-2778 `_ + Contains a detailed description of the original McClintock pipeline (v0.1) and evaluation of the original six McClintock component methods on the yeast genome + +--------------------------------- +TE Dectection Software Components +--------------------------------- + +.. csv-table:: + :header: "Method", "Citation" + :widths: 10, 15 + + `ngs_te_mapper `_, `Linheiro and Bergman (2012) `_ + `ngs_te_mapper2 `_, Unpublished + `RelocaTE `_, `Robb et al. (2013) `_ + `RelocaTE2 `_, `Chen et al. (2017) `_ + `TEMP `_, `Zhuang et al. (2014) `_ + `TEMP2 `_, `Yu et al. (2021) `_ + `RetroSeq `_, `Keane et al. (2012) `_ + `PoPoolationTE `_, `Kofler et al. (2012) `_ + `PoPoolationTE2 `_, `Kofler et al. (2016) `_ + `TE-locate `_, `Platzer et al. (2012) `_ + `TEFLoN `_, `Adrion et al. (2017) `_ + + +----------- +Quick Start +----------- + +.. code:: bash + + # INSTALL (Requires Conda) + git clone git@github.com:bergmanlab/mcclintock.git + cd mcclintock + conda env create -f install/envs/mcclintock.yml --name mcclintock + conda activate mcclintock + python3 mcclintock.py --install + + # DOWNLOAD TEST DATA + python3 test/download_test_data.py + + # RUN ON TEST DATA + python3 mcclintock.py \ + -r test/sacCer2.fasta \ + -c test/sac_cer_TE_seqs.fasta \ + -g test/reference_TE_locations.gff \ + -t test/sac_cer_te_families.tsv \ + -1 test/SRR800842_1.fastq.gz \ + -2 test/SRR800842_2.fastq.gz \ + -p 4 \ + -o /path/to/output/directory \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 8a04190..f8d371e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,9 +25,11 @@ TE Dectection Software Components :widths: 10, 15 `ngs_te_mapper `_, `Linheiro and Bergman (2012) `_ + `ngs_te_mapper2 `_, Unpublished `RelocaTE `_, `Robb et al. (2013) `_ `RelocaTE2 `_, `Chen et al. (2017) `_ `TEMP `_, `Zhuang et al. (2014) `_ + `TEMP2 `_, `Yu et al. (2021) `_ `RetroSeq `_, `Keane et al. (2012) `_ `PoPoolationTE `_, `Kofler et al. (2012) `_ `PoPoolationTE2 `_, `Kofler et al. (2016) `_ @@ -71,26 +73,9 @@ Quick Start :caption: Getting Started :hidden: + getting_started/quick_start getting_started/install -.. toctree:: - :maxdepth: 2 - :caption: Component methods - :hidden: - - methods/trimgalore - methods/mapreads - methods/coverage - methods/ngs_te_mapper - methods/RelocaTE - methods/RelocaTE2 - methods/TEMP - methods/RetroSeq - methods/PoPoolationTE - methods/PoPoolationTE2 - methods/TE-locate - methods/TEFLoN - .. toctree:: :maxdepth: 2 diff --git a/docs/methods/PoPoolationTE.rst b/docs/methods/PoPoolationTE.rst deleted file mode 100644 index a3bad2d..0000000 --- a/docs/methods/PoPoolationTE.rst +++ /dev/null @@ -1,8 +0,0 @@ - -============= -PoPoolationTE -============= - - - - diff --git a/docs/methods/PoPoolationTE2.rst b/docs/methods/PoPoolationTE2.rst deleted file mode 100644 index 96a7fa3..0000000 --- a/docs/methods/PoPoolationTE2.rst +++ /dev/null @@ -1,5 +0,0 @@ - - -============== -PoPoolationTE2 -============== diff --git a/docs/methods/RelocaTE.rst b/docs/methods/RelocaTE.rst deleted file mode 100644 index 4224712..0000000 --- a/docs/methods/RelocaTE.rst +++ /dev/null @@ -1,4 +0,0 @@ - -======== -RelocaTE -======== diff --git a/docs/methods/RelocaTE2.rst b/docs/methods/RelocaTE2.rst deleted file mode 100644 index d2a24e5..0000000 --- a/docs/methods/RelocaTE2.rst +++ /dev/null @@ -1,5 +0,0 @@ - -========= -RelocaTE2 -========= - diff --git a/docs/methods/RetroSeq.rst b/docs/methods/RetroSeq.rst deleted file mode 100644 index a2b346c..0000000 --- a/docs/methods/RetroSeq.rst +++ /dev/null @@ -1,6 +0,0 @@ - - -======== -RetroSeq -======== - diff --git a/docs/methods/TE-locate.rst b/docs/methods/TE-locate.rst deleted file mode 100644 index 8541c80..0000000 --- a/docs/methods/TE-locate.rst +++ /dev/null @@ -1,5 +0,0 @@ - -========= -TE-locate -========= - diff --git a/docs/methods/TEFLoN.rst b/docs/methods/TEFLoN.rst deleted file mode 100644 index 32615eb..0000000 --- a/docs/methods/TEFLoN.rst +++ /dev/null @@ -1,5 +0,0 @@ - -====== -TEFLoN -====== - diff --git a/docs/methods/TEMP.rst b/docs/methods/TEMP.rst deleted file mode 100644 index 766225e..0000000 --- a/docs/methods/TEMP.rst +++ /dev/null @@ -1,5 +0,0 @@ - -==== -TEMP -==== - diff --git a/docs/methods/coverage.rst b/docs/methods/coverage.rst deleted file mode 100644 index b7f8f00..0000000 --- a/docs/methods/coverage.rst +++ /dev/null @@ -1,5 +0,0 @@ - -======== -Coverage -======== - diff --git a/docs/methods/mapreads.rst b/docs/methods/mapreads.rst deleted file mode 100644 index 574f2b0..0000000 --- a/docs/methods/mapreads.rst +++ /dev/null @@ -1,5 +0,0 @@ - -========= -map_reads -========= - diff --git a/docs/methods/ngs_te_mapper.rst b/docs/methods/ngs_te_mapper.rst deleted file mode 100644 index 2770b47..0000000 --- a/docs/methods/ngs_te_mapper.rst +++ /dev/null @@ -1,5 +0,0 @@ - -============= -ngs_te_mapper -============= - diff --git a/docs/methods/trimgalore.rst b/docs/methods/trimgalore.rst deleted file mode 100644 index 415c4ef..0000000 --- a/docs/methods/trimgalore.rst +++ /dev/null @@ -1,5 +0,0 @@ - -========== -TrimGalore -========== - diff --git a/docs/running/config.rst b/docs/running/config.rst index 38fdb3d..3a2e194 100644 --- a/docs/running/config.rst +++ b/docs/running/config.rst @@ -3,3 +3,42 @@ Method config files =================== +Each component method has associated config file(s) that enable the modification of run parameters and filtering parameters. TE detection components have separate run and post processing config files. These files are located in the `config directory `_ of mcclintock :code:`/path/to/mcclintock/config`. + +Example run config file +""""""""""""""""""""""" +.. code:: python + + PARAMS = { + '-l' : 10, + '-m' : 0.0, + '-bm' : 10, + '-bt' : 7, + '-f' : 100 + } + +Each config file contains a python :code:`dict` named :code:`PARAMS` which contains :code:`key : value` pairs with the :code:`key` being the flag or name of the parameter, and the :code:`value` being the value associated with the parameter. Most of the :code:`values` are set to the default parameters for each tool. The :code:`values` can be modified to better fit the data being used. + + +Coverage config +""""""""""""""" +This config files contains the parameters that can be modified for the :code:`coverage` component method. + +.. code:: python + + PARAMS = { + "omit_edges": True, + "omit_edges_read_length" : True, + "omit_edges_length" : 300 + } + +:code:`/path/to/mcclintock/config/coverage/coverage.py` : `GitHub Link `_ + +omit_edges + * Omits the edges of the TE sequence coverage when calculating average depth across the element. + +omit_edges_read_length + * If :code:`omit_edges: True` and :code:`omit_edges_read_length True`, then the read length will be used as the length of the edges to omit + +omit_edges_length + * If :code:`omit_edges: True` and :code:`omit_edges_read_length False`, the value of :code:`omit_edges_length` will be used as the length of the edges to omit \ No newline at end of file diff --git a/scripts/TEMP/temp_post.py b/scripts/TEMP/temp_post.py index e000e4e..09a1d30 100644 --- a/scripts/TEMP/temp_post.py +++ b/scripts/TEMP/temp_post.py @@ -30,7 +30,7 @@ def main(): absence_bed = make_absence_bed(absence_summary, sample_name, out_dir) non_absent_ref_insertions = get_non_absent_ref_tes(te_gff, absence_bed, sample_name, out_dir, log) insertions += non_absent_ref_insertions - insertions = filter_insertions(insertions, chromosomes, acceptable_classes=config.ACCEPTABLE_INSERTION_SUPPORT_CLASSES, frequency_theshold=config.FREQUENCY_THRESHOLD) + insertions = filter_insertions(insertions, chromosomes, acceptable_classes=config.PARAMS["acceptable_insertion_support_classes"], frequency_theshold=config.PARAMS["frequency_threshold"]) if len(insertions) > 0: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="temp") insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="temp") diff --git a/scripts/TEMP/temp_run.py b/scripts/TEMP/temp_run.py index 7d93153..3cd520f 100644 --- a/scripts/TEMP/temp_run.py +++ b/scripts/TEMP/temp_run.py @@ -81,18 +81,20 @@ def run_temp_insertion(bam, scripts, consensus, te_bed, taxonomy, median_insert_ mccutils.log("temp","running TEMP non-reference insertion prediction", log=log) command = [ "bash", scripts+"TEMP_Insertion.sh", - "-x", str(config.TEMP_Insertion['x']), "-i", bam, "-s", scripts, "-r", consensus, "-t", te_bed, "-u", taxonomy, - "-m", str(config.TEMP_Insertion['m']), "-f", str(median_insert_size), "-c", str(threads), "-o", out ] + for param in config.PARAMS.keys(): + command.append(param) + command.append(str(config.PARAMS[param])) + mccutils.run_command(command, log=log) diff --git a/scripts/coverage/coverage.py b/scripts/coverage/coverage.py index 44f3b64..da5d617 100644 --- a/scripts/coverage/coverage.py +++ b/scripts/coverage/coverage.py @@ -57,11 +57,11 @@ def main(): genome_depth = get_genome_depth(nonte_bed, bam, run_id, coverage_out, log) edge_trim = 0 - if config.OMIT_EDGES: - if config.OMIT_EDGES_READ_LENGTH: + if config.PARAMS["omit_edges"]: + if config.PARAMS["omit_edges_read_length"]: edge_trim = mccutils.estimate_read_length(snakemake.input.fq1) else: - edge_trim = config.OMIT_EDGES_LENGTH + edge_trim = config.PARAMS["omit_edges_length"] te_names, all_coverage_files, uniq_coverage_files, avg_norm_te_depths = make_depth_table(te_seqs, bam, genome_depth, run_id, coverage_out, snakemake.output[0], log, trim_edges=edge_trim) make_plots(te_names, all_coverage_files, uniq_coverage_files, avg_norm_te_depths, genome_depth, snakemake.params.sample, coverage_out, trim_edges=edge_trim) diff --git a/scripts/ngs_te_mapper/ngs_te_mapper_post.py b/scripts/ngs_te_mapper/ngs_te_mapper_post.py index 5a45e84..47d92e2 100644 --- a/scripts/ngs_te_mapper/ngs_te_mapper_post.py +++ b/scripts/ngs_te_mapper/ngs_te_mapper_post.py @@ -28,7 +28,7 @@ def main(): succeeded = mccutils.check_status_file(status_log) if succeeded: mccutils.log("ngs_te_mapper","processing ngs_te_mapper results", log=log) - insertions = read_insertions(raw_bed, chromosomes, sample_name, out_dir, min_read_cutoff=config.MIN_READ_SUPPORT) + insertions = read_insertions(raw_bed, chromosomes, sample_name, out_dir, min_read_cutoff=config.PARAMS['min_read_support']) if len(insertions) > 0: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="ngs_te_mapper") diff --git a/scripts/ngs_te_mapper/ngs_te_mapper_run.py b/scripts/ngs_te_mapper/ngs_te_mapper_run.py index 54cb754..8463796 100644 --- a/scripts/ngs_te_mapper/ngs_te_mapper_run.py +++ b/scripts/ngs_te_mapper/ngs_te_mapper_run.py @@ -43,7 +43,7 @@ def main(): if snakemake.params.raw_fq2 == "None": is_paired = False - command = ['Rscript', "--vanilla", script_dir+"/ngs_te_mapper.R", "genome="+reference_fasta, "teFile="+consensus_fasta, "tsd="+str(config.MAX_TSD), "thread="+str(threads), "output="+out_dir, "sourceCodeFolder="+script_dir] + command = ['Rscript', "--vanilla", script_dir+"/ngs_te_mapper.R", "genome="+reference_fasta, "teFile="+consensus_fasta, "tsd="+str(config.PARAMS["tsd="]), "thread="+str(threads), "output="+out_dir, "sourceCodeFolder="+script_dir] if is_paired: command.append("sample="+fastq1+";"+fastq2) diff --git a/scripts/ngs_te_mapper2/ngs_te_mapper2_run.py b/scripts/ngs_te_mapper2/ngs_te_mapper2_run.py index 3d09949..aa6c385 100644 --- a/scripts/ngs_te_mapper2/ngs_te_mapper2_run.py +++ b/scripts/ngs_te_mapper2/ngs_te_mapper2_run.py @@ -49,19 +49,18 @@ def main(): 'python', script_dir+"/ngs_te_mapper2.py", "-r", reference_fasta, "-l", consensus_fasta, - "-w", str(config.WINDOW), - "--min_mapq", str(config.MIN_MAPQ), - "--tsd_max", str(config.MAX_TSD), - "--gap_max", str(config.MAX_GAP), "-t", str(threads), "-o", out_dir, "--keep_files", "-p", sample_name, "-a", locations, - "--min_af", str(config.MIN_ALLELE_FREQUENCY), - "-f" ] + for key in config.PARAMS.keys(): + command.append(key) + command.append(str(config.PARAMS[key])) + + command.append("-f") if is_paired: command.append(fastq1+","+fastq2) else: diff --git a/scripts/popoolationte/popoolationte_post.py b/scripts/popoolationte/popoolationte_post.py index 5a9104b..27e729e 100644 --- a/scripts/popoolationte/popoolationte_post.py +++ b/scripts/popoolationte/popoolationte_post.py @@ -25,7 +25,7 @@ def main(): succeeded = mccutils.check_status_file(status_log) if succeeded: - insertions = read_insertions(popoolationte_out, sample_name, chromosomes, require_both_end_support=config.REQUIRE_BOTH_END_SUPPORT, percent_read_support_threshold=config.PERCENT_READ_SUPPORT_THRESHOLD) + insertions = read_insertions(popoolationte_out, sample_name, chromosomes, require_both_end_support=config.PARAMS["require_both_end_support"], percent_read_support_threshold=config.PARAMS["percent_read_support_threshold"]) if len(insertions) >= 1: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="popoolationte") insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="popoolationte") diff --git a/scripts/popoolationte/popoolationte_run.py b/scripts/popoolationte/popoolationte_run.py index b2d11d9..9e59360 100644 --- a/scripts/popoolationte/popoolationte_run.py +++ b/scripts/popoolationte/popoolationte_run.py @@ -57,14 +57,9 @@ def main(): max_dist, known_inserts, script_dir, - out_dir, - log=log, - identify_min_count=config.IDENTIFY_TE_INSERTSITES["min-count"], - identify_min_qual=config.IDENTIFY_TE_INSERTSITES["min-map-qual"], - crosslink_site_shift=config.CROSSLINK_TE_SITES['single-site-shift'], - update_te_inserts_site_shift=config.UPDATE_TEINSERTS_WITH_KNOWNTES['single-site-shift'], - estimate_polymorphism_min_qual=config.ESTIMATE_POLYMORPHISM['min-map-qual'], - filter_min_count=config.FILTER['min-count']) + out_dir, + config.PARAMS, + log=log) mccutils.check_file_exists(snakemake.output[0]) @@ -145,14 +140,8 @@ def make_known_insert_file(gff, out): return out_file -def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, ref_inserts, script_dir, out_dir, - log=None, - identify_min_count=3, - identify_min_qual=15, - crosslink_site_shift=100, - update_te_inserts_site_shift=100, - estimate_polymorphism_min_qual=15, - filter_min_count=5): +def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, ref_inserts, script_dir, out_dir, params, + log=None): mccutils.log("popoolationte","identify-te-insertsites.pl") insert_sites = out_dir+"te-fwd-rev.txt" @@ -160,8 +149,14 @@ def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, re "--input", sam, "--te-hierarchy-file", taxon, "--te-hierarchy-level", "family", - "--narrow-range", str(read_len), "--min-count", str(identify_min_count), - "--min-map-qual", str(identify_min_qual), "--output", insert_sites, "--insert-distance", str(insert_size), "--read-length", str(read_len)] + "--narrow-range", str(read_len), + "--output", insert_sites, + "--insert-distance", str(insert_size), + "--read-length", str(read_len)] + + for param in params["identify-te-insertsites.pl"].keys(): + command.append(param) + command.append(str(params["identify-te-insertsites.pl"][param])) mccutils.run_command(command, log=log) mccutils.log("popoolationte","genomic-N-2gtf.pl") @@ -176,10 +171,13 @@ def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, re "--min-dist", str(read_len), "--max-dist", str(max_dist), "--output", crosslinked, - "--single-site-shift", str(crosslink_site_shift), "--poly-n", poly_n, "--te-hierarchy", taxon, "--te-hier-level", "family"] + + for param in params["crosslink-te-sites.pl"].keys(): + command.append(param) + command.append(str(params["crosslink-te-sites.pl"][param])) mccutils.run_command(command, log=log) mccutils.log("popoolationte","update-teinserts-with-knowntes.pl") @@ -190,8 +188,11 @@ def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, re "--te-hierarchy-file", taxon, "--te-hierarchy-level", "family", "--max-dist", str(max_dist), - "--te-insertions", crosslinked, - "--single-site-shift", str(update_te_inserts_site_shift)] + "--te-insertions", crosslinked] + + for param in params["update-teinserts-with-knowntes.pl"].keys(): + command.append(param) + command.append(str(params["update-teinserts-with-knowntes.pl"][param])) mccutils.run_command(command, log=log) mccutils.log("popoolationte","estimate-polymorphism.pl") @@ -201,8 +202,11 @@ def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, re "--te-insert-file", updated_inserts, "--te-hierarchy-file", taxon, "--te-hierarchy-level", "family", - "--min-map-qual", str(estimate_polymorphism_min_qual), "--output", te_polymorphism] + + for param in params["estimate-polymorphism.pl"].keys(): + command.append(param) + command.append(str(params["estimate-polymorphism.pl"][param])) mccutils.run_command(command, log=log) mccutils.log("popoolationte","filter-teinserts.pl") @@ -210,8 +214,11 @@ def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, re command = ["perl", script_dir+"filter-teinserts.pl", "--te-insertions", te_polymorphism, "--output", filtered, - "--discard-overlapping", - "--min-count", str(filter_min_count)] + "--discard-overlapping"] + + for param in params["filter-teinserts.pl"].keys(): + command.append(param) + command.append(str(params["filter-teinserts.pl"][param])) mccutils.run_command(command, log=log) if __name__ == "__main__": diff --git a/scripts/popoolationte2/popoolationte2_post.py b/scripts/popoolationte2/popoolationte2_post.py index 3c39320..cd721e4 100644 --- a/scripts/popoolationte2/popoolationte2_post.py +++ b/scripts/popoolationte2/popoolationte2_post.py @@ -29,7 +29,7 @@ def main(): if prev_step_succeeded: ref_tes = get_ref_tes(te_gff, taxonomy, chromosomes) - insertions = read_insertions(te_predictions, ref_tes, chromosomes, sample_name, both_end_support_needed=config.REQUIRE_BOTH_END_SUPPORT, support_threshold=config.FREQUENCY_THRESHOLD) + insertions = read_insertions(te_predictions, ref_tes, chromosomes, sample_name, both_end_support_needed=config.PARAMS["require_both_end_support"], support_threshold=config.PARAMS["frequency_threshold"]) if len(insertions) >= 1: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="popoolationte2") insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="popoolationte2") diff --git a/scripts/popoolationte2/popoolationte2_run.py b/scripts/popoolationte2/popoolationte2_run.py index 291cf7b..8b2905c 100644 --- a/scripts/popoolationte2/popoolationte2_run.py +++ b/scripts/popoolationte2/popoolationte2_run.py @@ -28,12 +28,12 @@ def main(): try: mccutils.mkdir(out_dir+"/tmp") taxonomy = format_taxonomy(taxonomy, out_dir) - ppileup = popoolationte2_ppileup(jar, config.ppileup, bam, taxonomy, out_dir, log=log) - ppileup = popoolationte2_subsample(jar, config.subsampleppileup, ppileup, out_dir, log=log) - signatures = popoolationte2_signatures(jar, config.identifySignatures, ppileup, out_dir, log=log) - signatures = popoolationte2_strand(jar, config.updateStrand, signatures, bam, taxonomy, out_dir, log=log) + ppileup = popoolationte2_ppileup(jar, config.PARAMS["ppileup"], bam, taxonomy, out_dir, log=log) + ppileup = popoolationte2_subsample(jar, config.PARAMS["subsampleppileup"], ppileup, out_dir, log=log) + signatures = popoolationte2_signatures(jar, config.PARAMS["identifySignatures"], ppileup, out_dir, log=log) + signatures = popoolationte2_strand(jar, config.PARAMS["updateStrand"], signatures, bam, taxonomy, out_dir, log=log) signatures = popoolationte2_frequency(jar, ppileup, signatures, out_dir, log=log) - te_insertions = popoolationte2_pairup(jar, config.pairupSignatures, signatures, ref_fasta, taxonomy, out_dir, log=log) + te_insertions = popoolationte2_pairup(jar, config.PARAMS["pairupSignatures"], signatures, ref_fasta, taxonomy, out_dir, log=log) mccutils.remove(out_dir+"/tmp") mccutils.check_file_exists(snakemake.output[0]) @@ -70,14 +70,18 @@ def format_taxonomy(taxon, out): def popoolationte2_ppileup(jar, params, bam, taxon, out, log=None): mccutils.log("popoolationte2","making physical pileup file", log=log) ppileup = out+"/output.ppileup.gz" - mccutils.run_command(['java', "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "ppileup", + command = ['java', "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "ppileup", "--bam", bam, "--hier", taxon, - "--map-qual", str(params["map-qual"]), - "--sr-mindist", str(params['sr-mindist']), - "--id-up-quant", str(params['id-up-quant']), - "--output", ppileup], log=log) + "--output", ppileup] + + for param in params.keys(): + command.append(param) + command.append(str(params[param])) + + mccutils.run_command(command, log=log) + return ppileup def popoolationte2_subsample(jar, params, ppileup, out, log=None): @@ -85,11 +89,15 @@ def popoolationte2_subsample(jar, params, ppileup, out, log=None): if params["run"]: mccutils.log("popoolationte2","subsampling physical pileup file to uniform coverage", log=log) command = ["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "subsampleppileup", - "--ppileup", ppileup, - "--target-coverage", str(params['target-coverage']), + "--ppileup", ppileup, "--output", out_ppileup] - if params['with-replace']: - command.append("--with-replace") + + for param in params.keys(): + if param == True and param != "run": + command.append(param) + elif param != False: + command.append(param) + command.append(str(params[param])) mccutils.run_command(command, log=log) else: @@ -101,29 +109,33 @@ def popoolationte2_subsample(jar, params, ppileup, out, log=None): def popoolationte2_signatures(jar, params, ppileup, out, log=None): mccutils.log("popoolationte2","identifying signatures of TE insertions", log=log) signatures = out+"/output.signatures" - mccutils.run_command(["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "identifySignatures", + command = ["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "identifySignatures", "--ppileup", ppileup, "--mode", "separate", - "--output", signatures, - "--signature-window", params['signature-window'], - "--min-valley", params['min-valley'], - "--chunk-distance", str(params['chunk-distance']), - "--min-count", str(params['min-count'])], log=log) + "--output", signatures] + + for param in params.keys(): + command.append(param) + command.append(str(params[param])) + + mccutils.run_command(command, log=log) return signatures def popoolationte2_strand(jar, params, signatures, bam, taxon, out, log=None): mccutils.log("popoolationte2", "estimating strand of TEs", log=log) out_sig = out+"/output.stranded.signatures" - mccutils.run_command(["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "updateStrand", + command = ["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "updateStrand", "--signature", signatures, "--output", out_sig, "--bam", bam, - "--hier", taxon, - "--max-disagreement", str(params["max-disagreement"]), - "--sr-mindist", str(params['sr-mindist']), - "--map-qual", str(params['map-qual']), - "--id-up-quant", str(params['id-up-quant'])], log=log) + "--hier", taxon] + + for param in params.keys(): + command.append(param) + command.append(str(params[param])) + + mccutils.run_command(command, log=log) return out_sig @@ -140,15 +152,18 @@ def popoolationte2_frequency(jar, ppileup, signatures, out, log=None): def popoolationte2_pairup(jar, params, signatures, ref, taxon, out, log=None): mccutils.log("popoolationte2","generating raw TE insertion predictions", log=log) te_insertions = out+"/teinsertions.txt" - mccutils.run_command(["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "pairupSignatures", + command = ["java", "-Djava.io.tmpdir="+out+"/tmp", "-jar", jar, "pairupSignatures", "--signature", signatures, "--ref-genome", ref, "--hier", taxon, "--output-detail", "medium", - "--min-distance", str(params['min-distance']), - "--max-distance", str(params['max-distance']), - "--max-freq-diff", str(params['max-freq-diff']), - "--output", te_insertions], log=log) + "--output", te_insertions] + + for param in params.keys(): + command.append(param) + command.append(str(params[param])) + + mccutils.run_command(command, log=log) return te_insertions diff --git a/scripts/preprocessing/setup_reads.py b/scripts/preprocessing/setup_reads.py index 45cea45..95c36d3 100644 --- a/scripts/preprocessing/setup_reads.py +++ b/scripts/preprocessing/setup_reads.py @@ -48,11 +48,9 @@ def main(): if "trimgalore" in methods: mccutils.log("processing", "running trim_galore", log=log) if fq2 == "None": - flags = trimgalore.SINGLE_END_FLAGS - trimmedfq = run_trim_galore(fq1, run_id, log, mcc_out, cores=processors, flags=flags) + trimmedfq = run_trim_galore(fq1, run_id, log, mcc_out, cores=processors, params=trimgalore.PARAMS["single_end"]) else: - flags = trimgalore.PAIRED_END_FLAGS - trimmedfq, trimmedfq2 = run_trim_galore(fq1, run_id, log, mcc_out, fq2=fq2, cores=processors, flags=flags) + trimmedfq, trimmedfq2 = run_trim_galore(fq1, run_id, log, mcc_out, fq2=fq2, cores=processors, params=trimgalore.PARAMS["paired_end"]) run_multiqc(mcc_out+"/results/trimgalore/") @@ -138,13 +136,18 @@ def check_fastqs(fq1, fq2, out, min_length=30, log=None): has_valid_read_ids(fq1, fq2, log=log) -def run_trim_galore(fq1, run_id, log, out, fq2=None, cores=1, flags=[]): +def run_trim_galore(fq1, run_id, log, out, fq2=None, cores=1, params={}): mccutils.mkdir(out+"/results/") - command = ['trim_galore'] + flags + ["-j", str(cores), "-o", out+"/results/trimgalore"] + command = ['trim_galore', "-j", str(cores), "-o", out+"/results/trimgalore"] + + for param in params.keys(): + if params[param] == True: + command.append(param) + if fq2 is None: command.append(fq1) else: - command += ["--paired", fq1, fq2] + command += [fq1, fq2] mccutils.run_command(command, log=log) diff --git a/scripts/relocaTE/relocate_post.py b/scripts/relocaTE/relocate_post.py index 6db4f35..e365393 100644 --- a/scripts/relocaTE/relocate_post.py +++ b/scripts/relocaTE/relocate_post.py @@ -28,7 +28,15 @@ def main(): prev_step_succeeded = mccutils.check_status_file(status_log) if prev_step_succeeded: - insertions = get_insertions(relocate_gff, sample_name, chromosomes, ref_l_threshold=config.REF_LEFT_THRESHOLD, ref_r_threshold=config.REF_RIGHT_THRESHOLD, nonref_l_threshold=config.NONREF_LEFT_THRESHOLD, nonref_r_threshold=config.NONREF_RIGHT_THRESHOLD) + insertions = get_insertions( + relocate_gff, + sample_name, + chromosomes, + ref_l_threshold=config.PARAMS["ref_left_threshold"], + ref_r_threshold=config.PARAMS["ref_right_threshold"], + nonref_l_threshold=config.PARAMS["nonref_left_threshold"], + nonref_r_threshold=config.PARAMS["nonref_right_threshold"] + ) insertions = set_ref_orientations(insertions, te_gff) if len(insertions) >= 1: diff --git a/scripts/relocaTE/relocate_run.py b/scripts/relocaTE/relocate_run.py index 57037e0..3e61169 100644 --- a/scripts/relocaTE/relocate_run.py +++ b/scripts/relocaTE/relocate_run.py @@ -73,13 +73,11 @@ def main(): "-d", fq_dir, "-g", reference_fasta, "-o", ".", - "-r", annotation, - "-l", str(config.RELOCATE['l']), - "-m", str(config.RELOCATE['m']), - "-bm", str(config.RELOCATE['bm']), - "-bt", str(config.RELOCATE['bt']), - "-f", str(config.RELOCATE['f'])] + "-r", annotation] + for param in config.PARAMS.keys(): + command.append(param) + command.append(str(config.PARAMS[param])) if is_paired: command += ["-1", fq1_uniq_id, "-2", fq2_uniq_id] diff --git a/scripts/relocaTE2/relocate2_post.py b/scripts/relocaTE2/relocate2_post.py index 8b3b2d2..4822d70 100644 --- a/scripts/relocaTE2/relocate2_post.py +++ b/scripts/relocaTE2/relocate2_post.py @@ -33,19 +33,19 @@ def main(): sample_name, chromosomes, insert_type="ref", - l_support_threshold=config.REF_LEFT_SUPPORT_THRESHOLD, - r_support_threshold=config.REF_RIGHT_SUPPORT_THRESHOLD, - l_junction_threshold=config.REF_LEFT_JUNCTION_THRESHOLD, - r_junction_threshold=config.REF_RIGHT_JUNCTION_THRESHOLD) + l_support_threshold=config.PARAMS["ref_left_support_threshold"], + r_support_threshold=config.PARAMS["ref_right_support_threshold"], + l_junction_threshold=config.PARAMS["ref_left_junction_threshold"], + r_junction_threshold=config.PARAMS["ref_right_junction_threshold"]) nonref_insertions = get_insertions(nonref_gff, sample_name, chromosomes, insert_type="nonref", - l_support_threshold=config.NONREF_LEFT_SUPPORT_THRESHOLD, - r_support_threshold=config.NONREF_RIGHT_SUPPORT_THRESHOLD, - l_junction_threshold=config.NONREF_LEFT_JUNCTION_THRESHOLD, - r_junction_threshold=config.NONREF_RIGHT_JUNCTION_THRESHOLD) + l_support_threshold=config.PARAMS["nonref_left_support_threshold"], + r_support_threshold=config.PARAMS["nonref_right_support_threshold"], + l_junction_threshold=config.PARAMS["nonref_left_junction_threshold"], + r_junction_threshold=config.PARAMS["nonref_right_junction_threshold"]) ref_insertions = fix_ref_te_names(ref_insertions, rm_out, sample_name) diff --git a/scripts/relocaTE2/relocate2_run.py b/scripts/relocaTE2/relocate2_run.py index e3f60ab..9d3e1be 100644 --- a/scripts/relocaTE2/relocate2_run.py +++ b/scripts/relocaTE2/relocate2_run.py @@ -74,14 +74,13 @@ def main(): "--run", "-v", "4", "-c", str(threads), - "--aligner", config.RELOCATE2["aligner"], - "--len_cut_match", str(config.RELOCATE2["len_cut_match"]), - "--len_cut_trim", str(config.RELOCATE2["len_cut_trim"]), - "--mismatch", str(config.RELOCATE2["mismatch"]), - "--mismatch_junction", str(config.RELOCATE2["mismatch_junction"]), "-d", fq_dir ] + for param in config.PARAMS.keys(): + command.append(param) + command.append(str(config.PARAMS[param])) + if is_paired: command += ["-1", "_1", "-2", "_2"] diff --git a/scripts/retroseq/retroseq_post.py b/scripts/retroseq/retroseq_post.py index 95190db..85ce55c 100644 --- a/scripts/retroseq/retroseq_post.py +++ b/scripts/retroseq/retroseq_post.py @@ -26,7 +26,7 @@ def main(): prev_steps_succeeded = mccutils.check_status_file(status_log) if prev_steps_succeeded: - insertions = read_insertions(retroseq_out, sample_name, chromosomes, support_threshold=config.READ_SUPPORT_THRESHOLD, breakpoint_threshold=config.BREAKPOINT_CONFIDENCE_THRESHOLD) + insertions = read_insertions(retroseq_out, sample_name, chromosomes, support_threshold=config.PARAMS["read_support_threshold"], breakpoint_threshold=config.PARAMS["breakpoint_confidence_threshold"]) if len(insertions) >= 1: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="retroseq") insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="retroseq") diff --git a/scripts/retroseq/retroseq_run.py b/scripts/retroseq/retroseq_run.py index 5e62909..f8d00ce 100644 --- a/scripts/retroseq/retroseq_run.py +++ b/scripts/retroseq/retroseq_run.py @@ -46,7 +46,7 @@ def main(): bed_location_file = make_consensus_beds(elements, ref_name, ref_te_bed, taxonomy, out_dir) - run_retroseq(bam, bed_location_file, ref_fasta, script_dir, sample_name, out_dir, config.PARAMETERS, log=log) + run_retroseq(bam, bed_location_file, ref_fasta, script_dir, sample_name, out_dir, config.PARAMS, log=log) with open(status_log,"w") as l: l.write("COMPLETED\n") @@ -133,11 +133,31 @@ def make_consensus_beds(elements, ref_name, te_bed, taxon, out): def run_retroseq(bam, bed_locations, ref_fasta, script_dir, sample_name, out_dir, params, log=None): discovery_out = out_dir+"/"+sample_name+".discovery" - command = ["perl", script_dir+"/retroseq.pl", "-discover", "-bam", bam, "-refTEs", bed_locations, "-output", discovery_out, "-depth", str(params["depth"]), "-reads", str(params['reads']), "-q", str(params['q'])] + command = ["perl", script_dir+"/retroseq.pl", + "-discover", + "-bam", bam, + "-refTEs", bed_locations, + "-output", discovery_out] + + for param in params.keys(): + command.append(param) + command.append(str(params[param])) mccutils.run_command(command, log=log) call_out = out_dir+"/"+sample_name+".call" - command = ["perl", script_dir+"/retroseq.pl", "-call", "-bam", bam, "-input", discovery_out, "-filter", bed_locations, "-ref", ref_fasta, "-output", call_out, "-orientate", "yes", "-depth", str(params["depth"]), "-reads", str(params['reads']), "-q", str(params['q'])] + command = ["perl", script_dir+"/retroseq.pl", + "-call", + "-bam", bam, + "-input", discovery_out, + "-filter", bed_locations, + "-ref", ref_fasta, + "-output", call_out, + "-orientate", "yes"] + + for param in params.keys(): + command.append(param) + command.append(str(params[param])) + mccutils.run_command(command, log=log) diff --git a/scripts/teflon/teflon_post.py b/scripts/teflon/teflon_post.py index c98d23a..c2f63d6 100644 --- a/scripts/teflon/teflon_post.py +++ b/scripts/teflon/teflon_post.py @@ -34,11 +34,11 @@ def main(): chromosomes, sample_name, ref_tes, - min_presence=config.PARAMETERS['min_presence_reads'], - max_absence=config.PARAMETERS['max_absence_reads'], - min_presence_fraction=config.PARAMETERS['min_presence_fraction'], - require_tsd=config.PARAMETERS['require_tsd'], - require_both_breakpoints=config.PARAMETERS['require_both_breakpoints'] + min_presence=config.PARAMS['min_presence_reads'], + max_absence=config.PARAMS['max_absence_reads'], + min_presence_fraction=config.PARAMS['min_presence_fraction'], + require_tsd=config.PARAMS['require_tsd'], + require_both_breakpoints=config.PARAMS['require_both_breakpoints'] ) if len(insertions) >= 1: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="teflon") diff --git a/scripts/teflon/teflon_run.py b/scripts/teflon/teflon_run.py index 46d991a..7cecd38 100644 --- a/scripts/teflon/teflon_run.py +++ b/scripts/teflon/teflon_run.py @@ -37,13 +37,13 @@ def main(): sample_table, threads=threads, log=log, - quality_threshold=config.PARAMETERS['q'], - stdev=config.PARAMETERS['sd'], - cov=config.PARAMETERS['cov'], - te_support1=config.PARAMETERS['n1'], - te_support2=config.PARAMETERS['n2'], - read_count_lower_threshold=config.PARAMETERS['lt'], - read_count_higher_threshold=config.PARAMETERS['ht'] + quality_threshold=config.PARAMS['-q'], + stdev=config.PARAMS['-sd'], + cov=config.PARAMS['-cov'], + te_support1=config.PARAMS['-n1'], + te_support2=config.PARAMS['-n2'], + read_count_lower_threshold=config.PARAMS['-lt'], + read_count_higher_threshold=config.PARAMS['-ht'] ) mccutils.check_file_exists(snakemake.output[0]) diff --git a/scripts/telocate/telocate_post.py b/scripts/telocate/telocate_post.py index 8ca193e..e018c19 100644 --- a/scripts/telocate/telocate_post.py +++ b/scripts/telocate/telocate_post.py @@ -25,7 +25,7 @@ def main(): prev_steps_succeeded = mccutils.check_status_file(status_log) if prev_steps_succeeded: - insertions = read_insertions(telocate_raw, sample_name, chromosomes, rp_threshold=config.READ_PAIR_SUPPORT_THRESHOLD) + insertions = read_insertions(telocate_raw, sample_name, chromosomes, rp_threshold=config.PARAMS['read_pair_support_threshold']) insertions = filter_by_reference(insertions, te_gff) if len(insertions) > 0: insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="telocate") diff --git a/scripts/telocate/telocate_run.py b/scripts/telocate/telocate_run.py index 107a4c3..91e371e 100644 --- a/scripts/telocate/telocate_run.py +++ b/scripts/telocate/telocate_run.py @@ -46,9 +46,9 @@ def main(): median_insert_size = mccutils.get_median_insert_size(median_insert_size_file) - distance = (median_insert_size * config.MIN_DISTANCE) + distance = (median_insert_size * config.PARAMS["min_distance"]) - command = ["perl", telocate, str(config.MAX_MEM), sam_dir, te_gff, ref_fasta, out_dir, str(distance), str(config.MIN_SUPPORT_READS), str(config.MIN_SUPPORT_INDIVIDUALS)] + command = ["perl", telocate, str(config.PARAMS["max_mem"]), sam_dir, te_gff, ref_fasta, out_dir, str(distance), str(config.PARAMS["min_support_reads"]), str(config.PARAMS["min_support_individuals"])] mccutils.run_command(command, log=log) diff --git a/scripts/temp2/temp2_post.py b/scripts/temp2/temp2_post.py index 7d94948..031400a 100644 --- a/scripts/temp2/temp2_post.py +++ b/scripts/temp2/temp2_post.py @@ -78,8 +78,8 @@ def read_insertions(insert_bed, sample_name, chromosomes, config): if ( insert.chromosome in chromosomes and - insert.support_info.support["frequency"].value >= config.FREQUENCY_THRESHOLD and - insert.support_info.support["class"].value in config.ACCEPTABLE_INSERTION_SUPPORT_CLASSES + insert.support_info.support["frequency"].value >= config.PARAMS["frequency_threshold"] and + insert.support_info.support["class"].value in config.PARAMS["acceptable_insertion_support_classes"] ): insertions.append(insert) diff --git a/scripts/temp2/temp2_run.py b/scripts/temp2/temp2_run.py index a732fae..62e9c1c 100644 --- a/scripts/temp2/temp2_run.py +++ b/scripts/temp2/temp2_run.py @@ -89,21 +89,15 @@ def run_temp2_insertion(fq1, fq2, bam, insert_size, reference, scripts, consensu "-t", te_bed, "-c", str(threads), "-f", str(insert_size), - "-o", out, - "-M", str(config.GENOME_MISMATCH_PCT), - "-m", str(config.TE_MISMATCH_PCT), - "-U", str(config.RATIO), - "-N", str(config.FILTER_WINDOW), + "-o", out ] - if config.TRUNCATED: - command.append("-T") - - if config.LOOSE_FILTER: - command.append("-L") - - if config.SKIP_INS_LEN_CHECK: - command.append("-S") + for param in config.PARAMS['insertion'].keys(): + if config.PARAMS['insertion'][param] == True: + command.append(param) + elif config.PARAMS['insertion'][param] != False: + command.append(param) + command.append(str(config.PARAMS['insertion'][param])) mccutils.run_command(command, log=log) @@ -116,10 +110,13 @@ def run_temp2_absence(scripts, bam, reference_2bit, te_bed, insert_size, threads "-t", reference_2bit, "-f", str(insert_size), "-c", str(threads), - "-o", out, - "-x", str(config.UNIQ_MAP_SCORE) + "-o", out ] + for param in config.PARAMS['absence'].keys(): + command.append(param) + command.append(str(config.PARAMS['absence'])) + mccutils.run_command(command, log=log) if __name__ == "__main__": From e2dc8cb3f416d8e7756b9edd2765e8e08242c6e4 Mon Sep 17 00:00:00 2001 From: pbasting Date: Wed, 19 May 2021 17:49:06 -0400 Subject: [PATCH 04/13] updating readthedocs for the config files --- docs/running/config.rst | 53 ++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/docs/running/config.rst b/docs/running/config.rst index 3a2e194..4e17401 100644 --- a/docs/running/config.rst +++ b/docs/running/config.rst @@ -1,12 +1,13 @@ -=================== +################### Method config files -=================== +################### Each component method has associated config file(s) that enable the modification of run parameters and filtering parameters. TE detection components have separate run and post processing config files. These files are located in the `config directory `_ of mcclintock :code:`/path/to/mcclintock/config`. +*********************** Example run config file -""""""""""""""""""""""" +*********************** .. code:: python PARAMS = { @@ -19,10 +20,10 @@ Example run config file Each config file contains a python :code:`dict` named :code:`PARAMS` which contains :code:`key : value` pairs with the :code:`key` being the flag or name of the parameter, and the :code:`value` being the value associated with the parameter. Most of the :code:`values` are set to the default parameters for each tool. The :code:`values` can be modified to better fit the data being used. - -Coverage config -""""""""""""""" -This config files contains the parameters that can be modified for the :code:`coverage` component method. +******** +Coverage +******** +:code:`/path/to/mcclintock/config/coverage/coverage.py` : `GitHub link `_ .. code:: python @@ -32,7 +33,7 @@ This config files contains the parameters that can be modified for the :code:`co "omit_edges_length" : 300 } -:code:`/path/to/mcclintock/config/coverage/coverage.py` : `GitHub Link `_ +This config file contains the parameters that can be modified for the :code:`coverage` component method. omit_edges * Omits the edges of the TE sequence coverage when calculating average depth across the element. @@ -41,4 +42,38 @@ omit_edges_read_length * If :code:`omit_edges: True` and :code:`omit_edges_read_length True`, then the read length will be used as the length of the edges to omit omit_edges_length - * If :code:`omit_edges: True` and :code:`omit_edges_read_length False`, the value of :code:`omit_edges_length` will be used as the length of the edges to omit \ No newline at end of file + * If :code:`omit_edges: True` and :code:`omit_edges_read_length False`, the value of :code:`omit_edges_length` will be used as the length of the edges to omit + +************* +ngs_te_mapper +************* + +run config +========== +:code:`/path/to/mcclintock/config/ngs_te_mapper/ngs_te_mapper_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "tsd=" : 20 + } + +This config file contains the parameters that can be modified to influence the running of the :code:`ngs_te_mapper` component method. + +tsd= + * The max length of junction read overlap to consider a target site duplication + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/ngs_te_mapper/ngs_te_mapper_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "min_read_support" : 0 + } + +This config file contains the parameters that influence the post processing of the :code:`ngs_te_mapper` predictions. + +min_read_support + * the minimum number of reads supporting an insertion required to be considered a valid prediction \ No newline at end of file From deab09bf375b19910abf7752a110f3ca9a0ec267 Mon Sep 17 00:00:00 2001 From: pbasting Date: Fri, 21 May 2021 00:14:24 -0400 Subject: [PATCH 05/13] getting tepid and jitterbug working with new mcclintock updates --- config/tepid/tepid_run.py | 0 install/envs/mcc_tepid.yml | 3 +- install/scripts/ngs_te_mapper2.py | 2 +- internal/install.py | 4 +- internal/sysconfig.py | 14 +- scripts/jitterbug/jitterbug_post.py | 67 ++++--- scripts/jitterbug/jitterbug_run.py | 62 +++--- scripts/mccutils.py | 283 ---------------------------- scripts/output.py | 26 +-- scripts/tepid/tepid_post.py | 71 ++++--- scripts/tepid/tepid_run.py | 158 ++++++++-------- snakefiles/jitterbug.snakefile | 11 +- snakefiles/tepid.snakefile | 11 +- 13 files changed, 256 insertions(+), 456 deletions(-) create mode 100644 config/tepid/tepid_run.py diff --git a/config/tepid/tepid_run.py b/config/tepid/tepid_run.py new file mode 100644 index 0000000..e69de29 diff --git a/install/envs/mcc_tepid.yml b/install/envs/mcc_tepid.yml index eda6502..d36ac9a 100644 --- a/install/envs/mcc_tepid.yml +++ b/install/envs/mcc_tepid.yml @@ -4,4 +4,5 @@ channels: - conda-forge - defaults dependencies: - - tepid=0.10 \ No newline at end of file + - tepid=0.10 + - tbb=2020.2 \ No newline at end of file diff --git a/install/scripts/ngs_te_mapper2.py b/install/scripts/ngs_te_mapper2.py index 6c9f9ce..d463f2e 100644 --- a/install/scripts/ngs_te_mapper2.py +++ b/install/scripts/ngs_te_mapper2.py @@ -6,7 +6,7 @@ def main(): install_path = snakemake.config['paths']['install']+"/tools/" - raw_name="ngs_te_mapper2-1.01" + raw_name="ngs_te_mapper2-d6ed0941cb847771ba5fa17e82f2ef5b54351bd8" method_name = "ngs_te_mapper2" mccutils.remove(snakemake.params.zipfile) diff --git a/internal/install.py b/internal/install.py index 8e3edad..45d3f07 100644 --- a/internal/install.py +++ b/internal/install.py @@ -5,7 +5,7 @@ "temp2": "https://github.com/weng-lab/TEMP2/archive/4d25d050dc7832b82374502573e882a242457a0c.zip", "relocate" : "https://github.com/srobb1/RelocaTE/archive/ce3a2066e15f5c14e2887fdf8dce0485e1750e5b.zip", "ngs_te_mapper" : "https://github.com/bergmanlab/ngs_te_mapper/archive/f9f48996ac346ac86d57edbd00534aa1227b753e.zip", - "ngs_te_mapper2": "https://github.com/bergmanlab/ngs_te_mapper2/archive/v1.01.zip", + "ngs_te_mapper2": "https://github.com/bergmanlab/ngs_te_mapper2/archive/d6ed0941cb847771ba5fa17e82f2ef5b54351bd8.zip", "popoolationte" : "http://downloads.sourceforge.net/project/popoolationte/popoolationte_1.02.zip", "popoolationte2": "http://downloads.sourceforge.net/project/popoolation-te2/popte2-v1.10.03.jar", "relocate2" : "https://github.com/stajichlab/RelocaTE2/archive/v2.0.1.zip", @@ -22,7 +22,7 @@ "temp2" : "55b6449113e85540f8a39bb02cfec9b0", "relocate" : "64cc60f65154368518529fb57becc3d6", "ngs_te_mapper" : "a00e9962ee3daca9fac915c23c6af203", - "ngs_te_mapper2": "84232b8231bebe3888ab58954ef329e2", + "ngs_te_mapper2": "5527299633ea712001fd02785314a273", "popoolationte" : "315d266bd6da7487c919576dc46b92b4", "popoolationte2": "0c06186f0b1df215949f57a5cbb2d039", "relocate2" : "63eb7dbc09ffeb4063aef4857d198b06", diff --git a/internal/sysconfig.py b/internal/sysconfig.py index 8a9a91e..fbece79 100644 --- a/internal/sysconfig.py +++ b/internal/sysconfig.py @@ -1,7 +1,7 @@ -ALL_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "relocate2", "temp", "temp2", "retroseq", "popoolationte", "popoolationte2", "te-locate", "teflon", "coverage", "trimgalore","map_reads", "tebreak"] -SINGLE_END_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "coverage", "trimgalore", "map_reads", "tebreak"] -MULTI_THREAD_METHODS = ["coverage", "temp", "temp2", "relocate2", "ngs_te_mapper", "ngs_te_mapper2", "popoolationte", "teflon", "trimgalore", "tebreak"] -NO_INSTALL_METHODS = ["trimgalore", "map_reads", "coverage", "relocate2"] # no source code to install for these methods, just envs +ALL_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "relocate2", "temp", "temp2", "retroseq", "popoolationte", "popoolationte2", "te-locate", "teflon", "coverage", "trimgalore","map_reads", "tebreak", "jitterbug", "tepid"] +SINGLE_END_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "coverage", "trimgalore", "map_reads", "tebreak", "tepid"] +MULTI_THREAD_METHODS = ["coverage", "temp", "temp2", "relocate2", "ngs_te_mapper", "ngs_te_mapper2", "popoolationte", "teflon", "trimgalore", "tebreak", "jitterbug", "tepid"] +NO_INSTALL_METHODS = ["trimgalore", "map_reads", "coverage", "relocate2", "tepid"] # no source code to install for these methods, just envs INPUT_DIR = "{{indir}}" REF_DIR = "{{refdir}}" @@ -24,6 +24,8 @@ "te-locate": ["telocate/telocate_run.py", "telocate/telocate_post.py"], "temp": ["temp/temp_run.py", "temp/temp_post.py"], "temp2": ["temp2/temp2_run.py", "temp2/temp2_post.py"], + "jitterbug": ["jitterbug/jitterbug_run.py", "jitterbug/jitterbug_post.py"], + "tepid": ["tepid/tepid_run.py", "tepid/tepid_post.py"], "tebreak": ["tebreak/tebreak_run.py", "tebreak/tebreak_post.py"] } @@ -42,6 +44,8 @@ "te-locate": ["telocate_run", "telocate_post"], "temp": ["run_temp", "process_temp"], "temp2": ["run_temp2", "process_temp2"], + "jitterbug": ["jitterbug_run", "jitterbug_post"], + "tepid": ["tepid_run", "tepid_post"], "tebreak": ["tebreak_run", "tebreak_post"] } @@ -59,6 +63,8 @@ "te-locate": LOG_DIR+"status/telocate.status", "temp": LOG_DIR+"status/temp.status", "temp2": LOG_DIR+"status/temp2.status", + "jitterbug": LOG_DIR+"status/jitterbug.status", + "tepid": LOG_DIR+"status/tepid.status", "tebreak": LOG_DIR+"status/tebreak.status" } diff --git a/scripts/jitterbug/jitterbug_post.py b/scripts/jitterbug/jitterbug_post.py index 5be8eda..5f84893 100644 --- a/scripts/jitterbug/jitterbug_post.py +++ b/scripts/jitterbug/jitterbug_post.py @@ -1,37 +1,52 @@ import os import sys import subprocess +import importlib.util as il +spec = il.spec_from_file_location("config", snakemake.params.config) +config = il.module_from_spec(spec) +sys.modules[spec.name] = config +spec.loader.exec_module(config) sys.path.append(snakemake.config['args']['mcc_path']) import scripts.mccutils as mccutils -import config.jitterbug.jitterbug_post as config +import scripts.output as output def main(): mccutils.log("jitterbug","jitterbug postprocessing") jitterbug_out = snakemake.input.jitterbug_out te_taxonomy = snakemake.input.taxonomy + reference_fasta = snakemake.input.reference_fasta out_dir = snakemake.params.out_dir log = snakemake.params.log sample_name = snakemake.params.sample_name chromosomes = snakemake.params.chromosomes.split(",") + status_log = snakemake.params.status_log out = snakemake.output.out - insertions = read_insertions( - jitterbug_out, - te_taxonomy, - chromosomes, - sample_name, - min_fwd_read_support=config.FILTER['MIN_FWD_READ_SUPPORT'], - min_rev_read_support=config.FILTER['MIN_REV_READ_SUPPORT'], - min_sr_support=config.FILTER['MIN_SPLIT_READ_SUPPORT'], - min_zygosity=config.FILTER['MIN_ZYGOSITY'] - ) - - if len(insertions) >= 1: - insertions = mccutils.make_redundant_bed(insertions, sample_name, out_dir, method="jitterbug") - mccutils.make_nonredundant_bed(insertions, sample_name, out_dir, method="jitterbug") + prev_steps_succeeded = mccutils.check_status_file(status_log) + + if prev_steps_succeeded: + insertions = read_insertions( + jitterbug_out, + te_taxonomy, + chromosomes, + sample_name, + min_fwd_read_support=config.FILTER['MIN_FWD_READ_SUPPORT'], + min_rev_read_support=config.FILTER['MIN_REV_READ_SUPPORT'], + min_sr_support=config.FILTER['MIN_SPLIT_READ_SUPPORT'], + min_zygosity=config.FILTER['MIN_ZYGOSITY'] + ) + + if len(insertions) >= 1: + insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="jitterbug") + insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="jitterbug") + output.write_vcf(insertions, reference_fasta, sample_name, "jitterbug", out_dir) + else: + mccutils.run_command(["touch", out_dir+"/"+sample_name+"_jitterbug_redundant.bed"]) + mccutils.run_command(["touch", out_dir+"/"+sample_name+"_jitterbug_nonredundant.bed"]) + else: mccutils.run_command(["touch", out_dir+"/"+sample_name+"_jitterbug_redundant.bed"]) mccutils.run_command(["touch", out_dir+"/"+sample_name+"_jitterbug_nonredundant.bed"]) @@ -54,7 +69,8 @@ def read_insertions(jitterbug_gff, taxonomy, chroms, sample_name, min_fwd_read_s line = line.replace("\n","") split_line = line.split("\t") if len(split_line) == 9: - insert = mccutils.Insertion() + # insert = mccutils.Insertion() + insert = output.Insertion(output.Jitterbug()) insert.chromosome = split_line[0] if insert.chromosome in chroms: @@ -85,30 +101,31 @@ def read_insertions(jitterbug_gff, taxonomy, chroms, sample_name, min_fwd_read_s if "predicted_superfam" in feat: te = feat.split("=")[1] family = te_family[te] + insert.family = family if "supporting_fwd_reads" in feat: - insert.jitterbug.supporting_fwd_reads = int(feat.split("=")[1]) + insert.support_info.support['supporting_fwd_reads'].value = int(feat.split("=")[1]) if "supporting_rev_reads" in feat: - insert.jitterbug.supporting_rev_reads = int(feat.split("=")[1]) + insert.support_info.support['supporting_rev_reads'].value = int(feat.split("=")[1]) if "softclipped_support" in feat: - insert.jitterbug.split_read_support = int(feat.split("=")[1]) + insert.support_info.support['softclipped_support'].value = int(feat.split("=")[1]) if "zygosity" in feat: - insert.jitterbug.zygosity = float(feat.split("=")[1]) + insert.support_info.support['zygosity'].value = float(feat.split("=")[1]) - insert.name = family+"|non-reference|"+sample_name+"|jitterbug|" + insert.name = family+"|non-reference|"+str(insert.support_info.support['zygosity'].value)+"|"+sample_name+"|jitterbug|" if sr: insert.name += "sr|" else: insert.name = "rp|" if ( - (insert.jitterbug.supporting_fwd_reads >= min_fwd_read_support) and - (insert.jitterbug.supporting_rev_reads >= min_rev_read_support) and - (insert.jitterbug.split_read_support >= min_sr_support) and - (insert.jitterbug.zygosity >= min_zygosity) + (insert.support_info.support['supporting_fwd_reads'].value >= min_fwd_read_support) and + (insert.support_info.support['supporting_rev_reads'].value >= min_rev_read_support) and + (insert.support_info.support['softclipped_support'].value >= min_sr_support) and + (insert.support_info.support['zygosity'].value >= min_zygosity) ): insertions.append(insert) diff --git a/scripts/jitterbug/jitterbug_run.py b/scripts/jitterbug/jitterbug_run.py index 6ca2c26..dadf9c1 100644 --- a/scripts/jitterbug/jitterbug_run.py +++ b/scripts/jitterbug/jitterbug_run.py @@ -1,9 +1,14 @@ import os import sys import subprocess +import traceback +import importlib.util as il +spec = il.spec_from_file_location("config", snakemake.params.config) +config = il.module_from_spec(spec) +sys.modules[spec.name] = config +spec.loader.exec_module(config) sys.path.append(snakemake.config['args']['mcc_path']) import scripts.mccutils as mccutils -import config.jitterbug.jitterbug_run as config def main(): reference_te_gff = snakemake.input.reference_tes @@ -13,33 +18,46 @@ def main(): script_dir = snakemake.params.script_dir sample_name = snakemake.params.sample_name log = snakemake.params.log + status_log = snakemake.params.status_log threads = snakemake.threads out = snakemake.output.out mccutils.log("jitterbug","Running jitterbug", log=log) - out_gff, config_file = run_jitterbug( - script_dir, - bam, - reference_te_gff, - sample_name, out_dir, - minmapq=config.RUN['MINMAPQ'], - min_cluster_size=config.RUN['MIN_CLUSTER_SIZE'], - threads=threads, - log=log - ) - - config_file = make_config( - config_file, - out_dir, - cluster_size=config.FILTER["CLUSTER_SIZE"], - span=config.FILTER['SPAN'], - int_size=config.FILTER['INT_SIZE'], - softclipped=config.FILTER['SOFTCLIPPED'], - pick_consistent=config.FILTER['PICK_CONSISTENT'] - ) - filter_jitterbug(script_dir, out_gff, config_file, sample_name, out, log=log) + try: + out_gff, config_file = run_jitterbug( + script_dir, + bam, + reference_te_gff, + sample_name, out_dir, + minmapq=config.RUN['MINMAPQ'], + min_cluster_size=config.RUN['MIN_CLUSTER_SIZE'], + threads=threads, + log=log + ) + + config_file = make_config( + config_file, + out_dir, + cluster_size=config.FILTER["CLUSTER_SIZE"], + span=config.FILTER['SPAN'], + int_size=config.FILTER['INT_SIZE'], + softclipped=config.FILTER['SOFTCLIPPED'], + pick_consistent=config.FILTER['PICK_CONSISTENT'] + ) + filter_jitterbug(script_dir, out_gff, config_file, sample_name, out, log=log) + + except Exception as e: + track = traceback.format_exc() + print(track, file=sys.stderr) + with open(log,"a") as l: + print(track, file=l) + mccutils.log("Jitterbug","Jitterbug run failed") + with open(status_log,"w") as l: + l.write("FAILED\n") + + mccutils.run_command(["touch", out]) def run_jitterbug(script_dir, bam, ref_te_gff, sample_name, out_dir, minmapq=15, min_cluster_size=2, threads=1, log=None): diff --git a/scripts/mccutils.py b/scripts/mccutils.py index 6b8381d..d4265e3 100644 --- a/scripts/mccutils.py +++ b/scripts/mccutils.py @@ -18,100 +18,6 @@ def __init__(self, message): pass -class Ngs_te_mapper: - def __init__(self): - self.support = -1 - -class Temp: - def __init__(self): - self.classification = "None" - self.support = -1 - self.frequency = -1 - self.junction1 = -1 - self.junction1Support = -1 - self.junction2 = -1 - self.junction2Support = -1 - self.fivePrimeSupport = -1 - self.threePrimeSupport = -1 - -class Telocate: - def __init__(self): - self.read_pair_support = -1 - -class Retroseq: - def __init__(self): - self.read_support = -1 - self.breakpoint_confidence = -1 - -class Relocate2: - def __init__(self): - self.left_support = -1 - self.right_support = -1 - self.left_junction = -1 - self.right_junction = -1 - -class Relocate: - def __init__(self): - self.left_support = -1 - self.right_support = -1 - -class Popoolationte2: - def __init__(self): - self.support_type = "" - self.frequency = -1 - self.added = False - -class Popoolationte: - def __init__(self): - self.support_type = "" - self.f_read_support = 0 - self.r_read_support = 0 - self.f_read_support_percent = 0 - self.r_read_support_percent = 0 - -class Tepid: - def __init__(self): - self.id = -1 - self.support = 0 - -class Teflon: - def __init__(self): - left_sc_support = False - right_sc_support = False - presence_reads = 0 - absence_reads = 0 - ambiguous_reads = 0 - allele_frequency = 0.0 - -class Jitterbug: - def __init__(self): - supporting_fwd_reads = 0 - supporting_rev_reads = 0 - split_read_support = 0 - zygosity = 0.0 - -class Insertion: - def __init__(self): - self.chromosome = "None" - self.start = -1 - self.end = -1 - self.name = "None" - self.type = "None" - self.strand = "." - self.family = "" - self.popoolationte = Popoolationte() - self.popoolationte2 = Popoolationte2() - self.relocate = Relocate() - self.relocate2 = Relocate2() - self.retroseq = Retroseq() - self.telocate = Telocate() - self.temp = Temp() - self.ngs_te_mapper = Ngs_te_mapper() - self.tepid = Tepid() - self.teflon = Teflon() - self.jitterbug = Jitterbug() - - def mkdir(indir, log=None): if os.path.isdir(indir) == False: os.mkdir(indir) @@ -369,192 +275,3 @@ def log(step, msg, log=None): lines += " &> " + log print(lines) - - -def make_redundant_bed(insertions, sample_name, out_dir, method="popoolationte"): - tmp_bed = out_dir+"/tmp.bed" - - insertion_dict = {} - out_inserts = [] - malformed_inserts = [] - properly_formed_inserts = [] - for insert in insertions: - if insert.start <= insert.end: - insertion_dict[ "_".join([insert.chromosome, str(insert.start-1), str(insert.end), insert.name, "0", insert.strand])] = insert - properly_formed_inserts.append(insert) - else: - malformed_inserts.append(insert) - - insertions = properly_formed_inserts - # write malformed predictions to separate bed file - if len(malformed_inserts) > 0: - malformed_bed = out_dir+"/"+sample_name+"_"+method+"_malformed.bed" - with open(malformed_bed,"w") as out: - for insert in malformed_inserts: - out_line = "\t".join([insert.chromosome, str(insert.start-1), str(insert.end), insert.name, "0", insert.strand]) - out.write(out_line+"\n") - - with open(tmp_bed, "w") as out: - for insert in insertions: - out_line = "\t".join([insert.chromosome, str(insert.start-1), str(insert.end), insert.name, "0", insert.strand]) - out.write(out_line+"\n") - - sorted_bed = out_dir+"/sorted.bed" - command = ["bedtools", "sort", "-i", tmp_bed] - run_command_stdout(command, sorted_bed) - - redundant_bed = out_dir+"/"+sample_name+"_"+method+"_redundant.bed" - with open(redundant_bed, "w") as outbed: - header = 'track name="'+sample_name+'_'+method+'" description="'+sample_name+'_'+method+'"\n' - outbed.write(header) - with open(sorted_bed, "r") as inbed: - for x, line in enumerate(inbed): - - # outputs inserts in sorted order with unique number added to name - key = line.replace("\t","_") - key = key.replace("\n","") - insert = insertion_dict[key] - insert.name += str(x+1) - out_inserts.append(insert) - - # write to bed with unique number added to name - split_line = line.split("\t") - split_line[3] += str(x+1) - line = "\t".join(split_line) - outbed.write(line) - - remove(tmp_bed) - remove(sorted_bed) - - return out_inserts - -def make_nonredundant_bed(insertions, sample_name, out_dir, method="popoolationte"): - uniq_inserts = {} - - for insert in insertions: - key = "_".join([insert.chromosome, str(insert.start), str(insert.end), insert.type]) - if key not in uniq_inserts.keys(): - uniq_inserts[key] = insert - else: - ## method specific way to determine which duplicate to keep - if method == "popoolationte": - if (uniq_inserts[key].popoolationte.f_read_support + uniq_inserts[key].popoolationte.r_read_support) > (insert.popoolationte.f_read_support + insert.popoolationte.r_read_support): - uniq_inserts[key] = insert - - elif method == "popoolationte2": - if (uniq_inserts[key].popoolationte2.frequency) > (insert.popoolationte2.frequency): - uniq_inserts[key] = insert - - elif method == "relocate": - if (uniq_inserts[key].relocate.left_support + uniq_inserts[key].relocate.right_support) < (insert.relocate.left_support + insert.relocate.right_support): - uniq_inserts[key] = insert - - elif method == "relocate2": - if (uniq_inserts[key].relocate2.left_support + uniq_inserts[key].relocate2.right_support) < (insert.relocate2.left_support + insert.relocate2.right_support): - uniq_inserts[key] = insert - - elif method == "retroseq": - if uniq_inserts[key].retroseq.read_support > insert.retroseq.read_support: - uniq_inserts[key] = insert - - elif method == "te-locate": - if uniq_inserts[key].telocate.read_pair_support > insert.telocate.read_pair_support: - uniq_inserts[key] = insert - - elif method == "temp": - if insert.temp.support > uniq_inserts[key].temp.support: - uniq_inserts[key] = insert - - elif method == "ngs_te_mapper": - if insert.ngs_te_mapper.support > uniq_inserts[key].ngs_te_mapper.support: - uniq_inserts[key] = insert - - elif method == "tepid": - if insert.tepid.support > uniq_inserts[key].tepid.support: - uniq_inserts[key] = insert - - elif method == "teflon": - if insert.teflon.presence_reads > uniq_inserts[key].teflon.presence_reads: - uniq_inserts[key] = insert - - elif method == "jitterbug": - if insert.jitterbug.split_read_support > uniq_inserts[key].jitterbug.split_read_support: - uniq_inserts[key] = insert - - tmp_bed = out_dir+"/tmp.bed" - with open(tmp_bed, "w") as outbed: - for key in uniq_inserts.keys(): - insert = uniq_inserts[key] - out_line = "\t".join([insert.chromosome, str(insert.start-1), str(insert.end), insert.name, "0", insert.strand]) - outbed.write(out_line+"\n") - - sorted_bed = out_dir+"/sorted.bed" - command = ["bedtools", "sort", "-i", tmp_bed] - run_command_stdout(command, sorted_bed) - - nonredundant_bed = out_dir+"/"+sample_name+"_"+method+"_nonredundant.bed" - with open(sorted_bed, "r") as inbed: - with open(nonredundant_bed, "w") as outbed: - header = 'track name="'+sample_name+'_'+method+'" description="'+sample_name+'_'+method+'"\n' - outbed.write(header) - for line in inbed: - outbed.write(line) - - out_inserts = [] - with open(nonredundant_bed, "r") as outbed: - for line in outbed: - if "track name=" not in line and "description=" not in line: - split_line = line.split("\t") - ref_type = split_line[3].split("|")[1] - key = "_".join([split_line[0], str(int(split_line[1])+1), split_line[2], ref_type]) - out_inserts.append(uniq_inserts[key]) - remove(tmp_bed) - remove(sorted_bed) - - return out_inserts - -def write_vcf(inserts, genome_fasta, sample_name, method, out_dir): - contig_lengths = {} - with open(genome_fasta+".fai","r") as fai: - for line in fai: - split_line = line.split("\t") - contig_lengths[split_line[0]] = split_line[1] - - - contigs_with_inserts = [] - for insert in inserts: - if insert.chromosome not in contigs_with_inserts: - contigs_with_inserts.append(insert.chromosome) - - - out_vcf = out_dir+"/"+sample_name+"_"+method+"_nonredundant_non-reference.vcf" - with open(out_vcf, "w") as vcf: - today = date.today() - today_date = today.strftime("%Y-%m-%d") - meta = [ - "##fileformat=VCFv4.2", - "##fileDate="+today_date, - "##source=McClintock", - "##reference="+genome_fasta - ] - for contig in contigs_with_inserts: - meta.append("##contig=") - - meta.append('##INFO=') - meta.append('##INFO=') - meta.append('##INFO=') - meta.append('##INFO=') - - for line in meta: - vcf.write(line+"\n") - header = "\t".join(["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO"]) - vcf.write(header+"\n") - - ##TODO get ref sequence - ref = "N" - for insert in inserts: - vals = [insert.chromosome, str(insert.start), ".", ref, "", ".", "."] - info = ["END="+str(insert.end),"SVTYPE=INS", "STRAND="+insert.strand, "FAMILY="+insert.family] - - out_line = ("\t".join(vals)) + (";".join(info)) - vcf.write(out_line+"\n") diff --git a/scripts/output.py b/scripts/output.py index da807cb..bad049d 100644 --- a/scripts/output.py +++ b/scripts/output.py @@ -209,20 +209,24 @@ def __init__(self): self.redundancy_filter = RedundancyFilter(["split_reads_5prime", "split_reads_3prime"], "sum") -################################################# -## TODO convert to proper format for VCF creation -class Tepid: +class Jitterbug: def __init__(self): - self.id = -1 - self.support = 0 + self.support = { + "supporting_fwd_reads" : Info("SUPPORTING_FWD_READS", "Reads supporting the forward junction of the insertion", 0, "Integer"), + "supporting_rev_reads" : Info("SUPPORTING_REV_READS", "Reads supporting the reverse junction of the insertion", 0, "Integer"), + "softclipped_support" : Info("SOFTCLIPPED_SUPPORT", "Number of softclipped reads that support the breakpoint", 0, "Integer"), + "zygosity" : Info("ZYGOSITY", "The ratio of clipped and properly mapped reads at the insertion site", 0.0, "Float") + } -class Jitterbug: + self.redundancy_filter = RedundancyFilter(["supporting_fwd_reads", "supporting_rev_reads"], "sum") + +class Tepid: def __init__(self): - supporting_fwd_reads = 0 - supporting_rev_reads = 0 - split_read_support = 0 - zygosity = 0.0 -################################################ + self.id = -1 + self.support = { + "supporting_reads" : Info("SUPPORTING_READS", "Number of reads supporting the insertion", 0, "Integer") + } + self.redundancy_filter = RedundancyFilter(["supporting_reads"], "value") def make_redundant_bed(insertions, sample_name, out_dir, method="popoolationte"): diff --git a/scripts/tepid/tepid_post.py b/scripts/tepid/tepid_post.py index fae13a1..2477a56 100644 --- a/scripts/tepid/tepid_post.py +++ b/scripts/tepid/tepid_post.py @@ -2,10 +2,14 @@ import sys import subprocess import traceback +import importlib.util as il +spec = il.spec_from_file_location("config", snakemake.params.config) +config = il.module_from_spec(spec) +sys.modules[spec.name] = config +spec.loader.exec_module(config) sys.path.append(snakemake.config['args']['mcc_path']) import scripts.mccutils as mccutils -import config.tepid.tepid_post as config - +import scripts.output as output def main(): insertions_bed = snakemake.input.insertions_bed @@ -14,30 +18,45 @@ def main(): deletions_support = snakemake.input.deletions_support te_gff = snakemake.input.te_gff te_taxonomy = snakemake.input.te_taxonomy + reference_fasta = snakemake.input.reference_fasta + chromosomes = snakemake.params.chromosomes.split(",") + status_log = snakemake.params.status_log sample_name = snakemake.params.sample_name out_dir = snakemake.params.out_dir + + prev_steps_succeeded = mccutils.check_status_file(status_log) + mccutils.log("tepid","running TEPID post processing") - te_to_family = get_te_family_map(te_taxonomy) - te_pos_to_family = get_te_pos_family_map(te_gff, te_to_family) - insertions = read_insertions(insertions_bed, te_to_family, sample_name, te_pos_to_family, chromosomes, reference=False) - insertions = add_support(insertions, insertions_support, threshold=config.READ_SUPPORT_THRESHOLD) - - deletions = read_insertions(deletions_bed, te_to_family, sample_name, te_pos_to_family, chromosomes, reference=True) - deletions = add_support(deletions, deletions_support, threshold=config.READ_SUPPORT_THRESHOLD) - non_abs_ref_insertions = get_non_absent_ref_tes(deletions, te_gff, te_to_family, sample_name) - - insertions += non_abs_ref_insertions - if len(insertions) > 0: - mccutils.make_redundant_bed(insertions, sample_name, out_dir, method="tepid") - mccutils.make_nonredundant_bed(insertions, sample_name, out_dir, method="tepid") + + if prev_steps_succeeded: + te_to_family = get_te_family_map(te_taxonomy) + te_pos_to_family = get_te_pos_family_map(te_gff, te_to_family) + insertions = read_insertions(insertions_bed, te_to_family, sample_name, te_pos_to_family, chromosomes, reference=False) + insertions = add_support(insertions, insertions_support, threshold=config.READ_SUPPORT_THRESHOLD) + + deletions = read_insertions(deletions_bed, te_to_family, sample_name, te_pos_to_family, chromosomes, reference=True) + deletions = add_support(deletions, deletions_support, threshold=config.READ_SUPPORT_THRESHOLD) + non_abs_ref_insertions = get_non_absent_ref_tes(deletions, te_gff, te_to_family, sample_name) + + insertions += non_abs_ref_insertions + if len(insertions) > 0: + insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="tepid") + insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="tepid") + output.write_vcf(insertions, reference_fasta, sample_name, "tepid", out_dir) + else: + mccutils.run_command(["touch",out_dir+"/"+sample_name+"_tepid_redundant.bed"]) + mccutils.run_command(["touch",out_dir+"/"+sample_name+"_tepid_nonredundant.bed"]) + + else: - mccutils.run_command(["touch",out_dir+"/"+sample_name+"_tepid_redundant.bed"]) - mccutils.run_command(["touch",out_dir+"/"+sample_name+"_tepid_nonredundant.bed"]) - + mccutils.run_command(["touch",out_dir+"/"+sample_name+"_tepid_redundant.bed"]) + mccutils.run_command(["touch",out_dir+"/"+sample_name+"_tepid_nonredundant.bed"]) + mccutils.log("tepid","TEPID post processing complete") + def get_te_family_map(taxonomy): te_to_family = {} @@ -71,7 +90,7 @@ def read_insertions(bed, te_to_family, sample_name, te_pos_to_family, chromosome inserts = [] with open(bed,"r") as b: for line in b: - insert = mccutils.Insertion() + insert = output.Insertion(output.Tepid()) split_line = line.split("\t") insert.chromosome = split_line[0] if insert.chromosome in chromosomes: @@ -83,16 +102,16 @@ def read_insertions(bed, te_to_family, sample_name, te_pos_to_family, chromosome insert.family = te_to_family[te_name] insert.strand = split_line[3] insert.type = "reference" - insert.name = insert.family+"|reference|"+sample_name+"|tepid|nonab|" + insert.name = insert.family+"|reference|NA|"+sample_name+"|tepid|nonab|" else: te_chrom = split_line[3] te_start = split_line[4] te_end = split_line[5] insert.family = te_pos_to_family[te_chrom+"_"+te_start+"_"+te_end] insert.type = "non-reference" - insert.name = insert.family+"|non-reference|"+sample_name+"|tepid|" + insert.name = insert.family+"|non-reference|NA|"+sample_name+"|tepid|sr|" - insert.tepid.id = split_line[-1].replace("\n","") + insert.support_info.id = split_line[-1].replace("\n","") inserts.append(insert) return inserts @@ -110,8 +129,8 @@ def add_support(inserts, support_file, threshold=0): support[support_id] = support_val for insert in inserts: - insert.tepid.support = support[insert.tepid.id] - if insert.tepid.support >= threshold: + insert.support_info.support['supporting_reads'].value = support[insert.support_info.id] + if insert.support_info.support['supporting_reads'].value >= threshold: filtered_inserts.append(insert) @@ -122,7 +141,7 @@ def get_non_absent_ref_tes(deletions, te_gff, te_to_family, sample_name): ref_tes = [] with open(te_gff, "r") as gff: for line in gff: - ref_te = mccutils.Insertion() + ref_te = output.Insertion(output.Tepid()) split_line = line.split("\t") ref_te.chromosome = split_line[0] ref_te.start = int(split_line[3]) @@ -137,7 +156,7 @@ def get_non_absent_ref_tes(deletions, te_gff, te_to_family, sample_name): ref_te.family = te_to_family[te_id] ref_te.type = "reference" - ref_te.name = ref_te.family+"|reference|"+sample_name+"|tepid|nonab|" + ref_te.name = ref_te.family+"|reference|NA|"+sample_name+"|tepid|nonab|" ref_tes.append(ref_te) absent = [] diff --git a/scripts/tepid/tepid_run.py b/scripts/tepid/tepid_run.py index 50ff6f0..bf2fe80 100644 --- a/scripts/tepid/tepid_run.py +++ b/scripts/tepid/tepid_run.py @@ -2,6 +2,11 @@ import sys import subprocess import traceback +import importlib.util as il +spec = il.spec_from_file_location("config", snakemake.params.config) +config = il.module_from_spec(spec) +sys.modules[spec.name] = config +spec.loader.exec_module(config) sys.path.append(snakemake.config['args']['mcc_path']) import scripts.mccutils as mccutils @@ -19,35 +24,52 @@ def main(): ref_name = snakemake.params.ref_name out_dir = snakemake.params.out_dir log = snakemake.params.log + status_log = snakemake.params.status_log - # ensures intermediate files from previous runs are removed - for f in os.listdir(out_dir): - mccutils.remove(out_dir+"/"+f) - median_insert_size = "" - with open(median_insert_size_file,"r") as median_insert: - for line in median_insert: - line = line.split("=")[1].replace("\n","") - median_insert_size = line + try: + # ensures intermediate files from previous runs are removed + for f in os.listdir(out_dir): + mccutils.remove(out_dir+"/"+f) + + median_insert_size = "" + with open(median_insert_size_file,"r") as median_insert: + for line in median_insert: + line = line.split("=")[1].replace("\n","") + median_insert_size = line + + is_paired = True + if raw_fq2 == "False": + is_paired = False + + mccutils.log("tepid","indexing reference", log=log) + index_ref(ref_fasta, ref_name, out_dir, log=log) + mccutils.log("tepid","making TEPID reference TE bed") + te_bed = make_te_bed(te_gff, te_taxonomy, out_dir) + + mccutils.log("tepid","mapping reads", log=log) + if is_paired: + bam, split_bam = map_reads(fq1, fq2, ref_name, median_insert_size, out_dir, threads=threads, paired=True, log=log) + else: + bam, split_bam = map_reads(fq1, fq2, ref_name, median_insert_size, out_dir, threads=threads, paired=False, log=log) - is_paired = True - if raw_fq2 == "False": - is_paired = False + mccutils.log("tepid","discovering variants", log=log) + discover_variants(ref_name, bam, split_bam, te_bed, out_dir, threads=threads, log=log) + mccutils.log("tepid","TEPID run complete") - mccutils.log("tepid","indexing reference", log=log) - index_ref(ref_fasta, ref_name, out_dir, log=log) - mccutils.log("tepid","making TEPID reference TE bed") - te_bed = make_te_bed(te_gff, te_taxonomy, out_dir) - - mccutils.log("tepid","mapping reads", log=log) - if is_paired: - bam, split_bam = map_reads(fq1, fq2, ref_name, median_insert_size, out_dir, threads=threads, paired=True, log=log) - else: - bam, split_bam = map_reads(fq1, fq2, ref_name, median_insert_size, out_dir, threads=threads, paired=False, log=log) + except Exception as e: + track = traceback.format_exc() + print(track, file=sys.stderr) + with open(log,"a") as l: + print(track, file=l) + mccutils.log("tepid","TEPID run failed") + with open(status_log,"w") as l: + l.write("FAILED\n") - mccutils.log("tepid","discovering variants", log=log) - discover_variants(ref_name, bam, split_bam, te_bed, out_dir, threads=threads, log=log) - mccutils.log("tepid","TEPID run complete") + mccutils.run_command(["touch", snakemake.output[0]]) + mccutils.run_command(["touch", snakemake.output[1]]) + mccutils.run_command(["touch", snakemake.output.insertions_support]) + mccutils.run_command(["touch", snakemake.output.deletions_support]) def index_ref(fasta, ref_name, out, log=None): try: @@ -97,64 +119,50 @@ def make_te_bed(gff, taxonomy, out): return te_bed def map_reads(fq1, fq2, ref_name, median_insert_size, out, threads=1, paired=True, log=None): - try: - os.chdir(out) - if paired: - command = ["tepid-map", - "-x", out+"/"+ref_name, - "-y", out+"/"+ref_name+".X15_01_65525S", - "-p", str(threads), - "-s", median_insert_size, - "-n", ref_name, - "-1", fq1, - "-2", fq2 ] - else: - command = ["tepid-map-se", - "-x", out+"/"+ref_name, - "-y", out+"/"+ref_name+".X15_01_65525S", - "-p", str(threads), - "-n", ref_name, - "-q", fq1] + os.chdir(out) + if paired: + command = ["tepid-map", + "-x", out+"/"+ref_name, + "-y", out+"/"+ref_name+".X15_01_65525S", + "-p", str(threads), + "-s", median_insert_size, + "-n", ref_name, + "-1", fq1, + "-2", fq2 ] + else: + command = ["tepid-map-se", + "-x", out+"/"+ref_name, + "-y", out+"/"+ref_name+".X15_01_65525S", + "-p", str(threads), + "-n", ref_name, + "-q", fq1] - mccutils.run_command(command, log=log) + mccutils.run_command(command, log=log) - bam = out+"/"+ref_name+".bam" - split_bam = out+"/"+ref_name+".split.bam" - mccutils.check_file_exists(bam) - mccutils.check_file_exists(split_bam) - - except Exception as e: - track = traceback.format_exc() - print(track, file=sys.stderr) - print("ERROR...Failed to run TEPID mapping step...exiting...", file=sys.stderr) - sys.exit(1) + bam = out+"/"+ref_name+".bam" + split_bam = out+"/"+ref_name+".split.bam" + mccutils.check_file_exists(bam) + mccutils.check_file_exists(split_bam) return bam, split_bam def discover_variants(ref_name, bam, split_bam, te_bed, out, threads=1, log=None): - try: - os.chdir(out) - command = ["tepid-discover", - "-p", str(threads), - "-n", ref_name, - "-c", bam, - "-s", split_bam, - "-t", te_bed] - - mccutils.run_command(command, log=log) - - if not os.path.exists(snakemake.output[0]): - mccutils.run_command(["touch", snakemake.output[0]]) - if not os.path.exists(snakemake.output[2]): - mccutils.run_command(["touch", snakemake.output[2]]) - mccutils.check_file_exists(snakemake.output[1]) - - except Exception as e: - track = traceback.format_exc() - print(track, file=sys.stderr) - print("ERROR...Failed to run TEPID discover step...exiting...", file=sys.stderr) - sys.exit(1) + os.chdir(out) + command = ["tepid-discover", + "-p", str(threads), + "-n", ref_name, + "-c", bam, + "-s", split_bam, + "-t", te_bed] + + mccutils.run_command(command, log=log) + + if not os.path.exists(snakemake.output[0]): + mccutils.run_command(["touch", snakemake.output[0]]) + if not os.path.exists(snakemake.output[2]): + mccutils.run_command(["touch", snakemake.output[2]]) + mccutils.check_file_exists(snakemake.output[1]) if __name__ == "__main__": main() \ No newline at end of file diff --git a/snakefiles/jitterbug.snakefile b/snakefiles/jitterbug.snakefile index 4332d59..29db847 100644 --- a/snakefiles/jitterbug.snakefile +++ b/snakefiles/jitterbug.snakefile @@ -11,7 +11,9 @@ rule jitterbug_run: out_dir = config['args']['out']+"results/jitterbug/unfiltered/", script_dir = config['args']['mcc_path']+"/install/tools/jitterbug/", sample_name = config['args']['sample_name'], - log=config['args']['log_dir']+"jitterbug.log" + log=config['args']['log_dir']+"jitterbug.log", + config = config['config']['jitterbug']['files'][0], + status_log = config['status']['jitterbug'] output: out = config['args']['out']+"results/jitterbug/unfiltered/"+config['args']['sample_name']+".TE_insertions_paired_clusters.filtered.gff3" @@ -22,7 +24,8 @@ rule jitterbug_run: rule jitterbug_post: input: jitterbug_out = config['args']['out']+"results/jitterbug/unfiltered/"+config['args']['sample_name']+".TE_insertions_paired_clusters.filtered.gff3", - taxonomy = config['mcc']['taxonomy'] + taxonomy = config['mcc']['taxonomy'], + reference_fasta = config['mcc']['reference'] threads: 1 @@ -32,7 +35,9 @@ rule jitterbug_post: out_dir = config['args']['out']+"results/jitterbug/", log=config['args']['log_dir']+"jitterbug.log", sample_name = config['args']['sample_name'], - chromosomes = config['args']['chromosomes'] + chromosomes = config['args']['chromosomes'], + config = config['config']['jitterbug']['files'][1], + status_log = config['status']['jitterbug'] output: out = config['out']['jitterbug'] diff --git a/snakefiles/tepid.snakefile b/snakefiles/tepid.snakefile index a877429..54d2134 100644 --- a/snakefiles/tepid.snakefile +++ b/snakefiles/tepid.snakefile @@ -15,7 +15,9 @@ rule tepid_run: raw_fq2 = config['in']['fq2'], ref_name = config['args']['ref_name'], out_dir = config['args']['out']+"/results/tepid/unfiltered/", - log = config['args']['log_dir']+"tepid.log" + log = config['args']['log_dir']+"tepid.log", + config = config['config']['tepid']['files'][0], + status_log = config['status']['tepid'] output: config['args']['out']+"results/tepid/unfiltered/insertions_"+config['args']['ref_name']+".bed", @@ -33,7 +35,8 @@ rule tepid_post: insertions_support = config['args']['out']+"results/tepid/unfiltered/insertion_reads_"+config['args']['ref_name']+".txt", deletions_support = config['args']['out']+"results/tepid/unfiltered/deletion_reads_"+config['args']['ref_name']+".txt", te_gff = config['mcc']['locations'], - te_taxonomy = config['mcc']['taxonomy'] + te_taxonomy = config['mcc']['taxonomy'], + reference_fasta = config['mcc']['reference'] threads: 1 @@ -42,7 +45,9 @@ rule tepid_post: params: sample_name = config['args']['sample_name'], out_dir = config['args']['out']+"/results/tepid/", - chromosomes = config['args']['chromosomes'] + chromosomes = config['args']['chromosomes'], + config = config['config']['tepid']['files'][1], + status_log = config['status']['tepid'] output: config['out']['tepid'] From c5ca626873d7452f73d6ad84e79e9b058aa7cef2 Mon Sep 17 00:00:00 2001 From: pbasting Date: Tue, 25 May 2021 17:15:40 -0400 Subject: [PATCH 06/13] updating readthedocs config --- docs/running/config.rst | 634 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 633 insertions(+), 1 deletion(-) diff --git a/docs/running/config.rst b/docs/running/config.rst index 4e17401..36fb844 100644 --- a/docs/running/config.rst +++ b/docs/running/config.rst @@ -76,4 +76,636 @@ postprocessing config This config file contains the parameters that influence the post processing of the :code:`ngs_te_mapper` predictions. min_read_support - * the minimum number of reads supporting an insertion required to be considered a valid prediction \ No newline at end of file + * the minimum number of reads supporting an insertion required to be considered a valid prediction + + +************** +ngs_te_mapper2 +************** + +run config +========== +:code:`/path/to/mcclintock/config/ngs_te_mapper2/ngs_te_mapper2_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "--window" : 10, + "--min_mapq" : 20, + "--min_af" : 0.1, + "--tsd_max" : 25, + "--gap_max" : 5 + } + +This config file contains the parameters that can be modified to influence the running of the :code:`ngs_te_mapper2` component method. + +--window + * size of the window to merge for identifying TE clusters + +--min_mapq + * minimum mapping quality of alignment + +--min_af + * minimum allele frequency + +--tsd_max + * maximum target site duplication size + +--gap_max + * maximum gap size + +************* +popoolationte +************* + +run config +========== +:code:`/path/to/mcclintock/config/popoolationte/popoolationte_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "identify-te-insertsites.pl" : { + "--min-count" : 3, + "--min-map-qual" : 15 + }, + "crosslink-te-sites.pl" : { + "--single-site-shift": 100 + }, + "update-teinserts-with-knowntes.pl" : { + "--single-site-shift": 100 + }, + "estimate-polymorphism.pl" : { + "--min-map-qual": 15 + }, + "filter-teinserts.pl" : { + "--min-count": 5 + } + } + +identify-te-insertsites.pl + * Identifies TE insertion sites (forward or reverse insertion) from a sam file + * :code:`--min-count`: the minimum number of PE-fragments that confirm the insertion of a TE of a certain family + * :code:`--min-map-qual`: the minimum mapping quality; this will only apply to reads mapping to a reference contig. + +crosslink-te-sites.pl + * Crosslinks forward and reverse insertions and outputs transposable element insertions + * :code:`--single-site-shift`: the exact position of a TE insertion can only be approximated for TE insertions where only the forward or only the reverse insertion was found. For forward insertions the positon of the TE insertion is calculated as the end of the range. For reverse insertions the position of the TE insertion is calculated as the start of the range + +update-teinserts-with-knowntes.pl + * :code:`--single-site-shift`: the exact position of a TE insertion can only be approximated for TE insertions where only the forward or only the reverse insertion was found. For forward insertions the positon of the TE insertion is calculated as the end of the range. For reverse insertions the position of the TE insertion is calculated as the start of the range + +estimate-polymorphism.pl + * Estimate the insertion frequencies for a given set of TE insertions + * :code:`--min-map-qual`: the minimum mapping quality + +filter-teinserts.pl + * :code:`--min-count`: the minimum number of PE-fragments that confirm the insertion of a TE of a certain family + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/popoolationte/popoolationte_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "require_both_end_support" : True, + "percent_read_support_threshold" : 0.1 + } + +require_both_end_support + * requires that final results have support on both ends of prediction + +percent_read_support_threshold + * threshold for the minimum acceptable fraction of the reads supporting the prediction + + +************** +popoolationte2 +************** + +run config +========== +:code:`/path/to/mcclintock/config/popoolationte2/popoolationte2_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "ppileup" : { + "--map-qual": 15, + "--sr-mindist" : 10000, + "--id-up-quant": 0.01 + }, + + "subsampleppileup" : { + "run" : False, + "--target-coverage": 100, + "--with-replace": False + }, + + "identifySignatures" : { + "--min-count": 2.0, + "--signature-window": "median", + "--min-valley": "median", + "--chunk-distance": 5 + }, + + "updateStrand" : { + "--map-qual": 15, + "--max-disagreement": 0.1, + "--sr-mindist": 10000, + "--id-up-quant": 0.01 + }, + + "pairupSignatures" : { + "--min-distance": -100, + "--max-distance": 500, + "--max-freq-diff": 1.0 + } + + } + +ppileup + * create a physical pileup file from one or multiple bam files + * :code:`--map-qual`: minimum mapping quality + * :code:`--sr-mindist`: minimum inner distance for structural rearrangements + * :code:`--id-up-quant`: paired end fragments with an insert size in the upper quantile will be ignored + +subsampleppileup + * subsample a ppileup file to uniform coverage + * :code:`run`: The subsampleppileup step is optional. Set this option to :code:`True` if you wish to perform this step + * :code:`--target-coverage`: the target coverage of the output file + * :code:`--with-replace`: use sampling with replacement instead of without replacement + +identifySignatures + * identify signatures of TE insertions + * :code:`--min-count`: the minimum count of a TE insertion + * :code:`--signature-window`: the window size of the signatures of TE insertions. options: :code:`median`, :code:`fixNNNN`, :code:`minimumSampleMedian`, :code:`maximumSampleMedian` + * :code:`min-valley`: the minimum size of the valley between two consecutive signatures of the same family. options: :code:`median`, :code:`fixNNNN`, :code:`minimumSampleMedian`, :code:`maximumSampleMedian` + * :code:`--chunk-distance`: minimum distance between chromosomal chunks in multiples of :code:`--min-valley` + +updateStrand + * estimate the strand of TEs for signatures of TE insertions + * :code:`--map-qual`: minimum mapping quality + * :code:`--max-disagreement`: the maximum disagreement for the strand of the TE insertion in fraction of reads + * :code:`--sr-mindist`: minimum inner distance for structural rearrangements + * :code:`--id-up-quant`: paired end fragments with an insert size in the upper quantile will be ignored + +pairupSignatures + * pairs up signatures of TE insertions and yields TE insertions + * :code:`--min-distance`: the minimum distance between signatures + * :code:`--max-distance`: the maximum distance between signatures + * :code:`--max-freq-diff`: the maximum frequency difference between signatures + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/popoolationte2/popoolationte2_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "require_both_end_support" : True, + "frequency_threshold" : 0.1 + } + +require_both_end_support + * require that the TE prediction have support from both junctions + +frequency_threshold + * threshold for the minimum acceptable (average physical coverage supporting the insertion of the given TE) / (average physical coverage) + + +******** +relocate +******** + +run config +========== +:code:`/path/to/mcclintock/config/relocate/relocate_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + '-l' : 10, + '-m' : 0.0, + '-bm' : 10, + '-bt' : 7, + '-f' : 100 + } + +-l + * len cutoff for the TE trimmed reads to be aligned + +-m + * mismatch allowance for alignment to TE + +-bm + * blat minScore value, used by blat in the comparison of reads to TE sequence + +-bt + * blat tileSize value, used by blat in the comparison of reads to TE sequence + +-f + * length of the sequence flanking the found insertion to be returned. This sequence is taken from the reference genome + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/relocate/relocate_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "ref_left_threshold" : 0, + "ref_right_threshold" : 0, + "nonref_left_threshold" : 0, + "nonref_right_threshold" : 0 + } + +ref_left_threshold + * minimum number of left flanking reads for a reference prediction. + +ref_right_threshold + * minimum number of right flanking reads for a reference prediction. + +nonref_left_threshold + * minimum number of left flanking reads for a non-reference prediction. + +nonref_right_threshold + * minimum number of right flanking reads for a non-reference prediction. + + +********* +relocate2 +********* + +run config +========== +:code:`/path/to/mcclintock/config/relocate2/relocate2_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + '--aligner' : "blat", + '--len_cut_match' : 10, + '--len_cut_trim' : 10, + '--mismatch' : 2, + '--mismatch_junction' : 2 + } + +--aligner + * aligner used to map reads to repeat elements + +--len_cut_match + * length cutoff threshold for match between reads and repeat elements. Large value will lead to less sensitive but more accuracy + +--len_cut_trim + * length cutoff threshold for trimed reads after trimming repeat sequence from reads. Large value will lead to less sensitive but more accuracy + +--mismatch + * Number of mismatches allowed for matches between reads and repeat elements + +--mismatch_junction + * Number of mismatches allowed for matches between junction reads and repeat elements + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/relocate2/relocate2_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "ref_left_support_threshold" : 0, + "ref_right_support_threshold" : 0, + "ref_left_junction_threshold" : 0, + "ref_right_junction_threshold" : 0, + + "nonref_left_support_threshold" : 0, + "nonref_right_support_threshold" : 0, + "nonref_left_junction_threshold" : 0, + "nonref_right_junction_threshold" : 0 + } + +ref_left_support_threshold + * Minimum number of reads not covering the junction of TE insertion, but supporting TE insertion by paired-end reads on left side/downstream of a reference prediction + +ref_right_support_threshold + * Minimum number of reads not covering the junction of TE insertion, but supporting TE insertion by paired-end reads on right side/downstream of a reference prediction + +ref_left_junction_threshold + * Minimum number of reads covering the junction of TE insertion on left side/upstream of a reference prediction + +ref_right_junction_threshold + * Minimum number of reads covering the junction of TE insertion on right side/downstream of a reference prediction + +nonref_left_support_threshold + * Minimum number of reads not covering the junction of TE insertion, but supporting TE insertion by paired-end reads on left side/downstream of a non-reference prediction + +nonref_right_support_threshold + * Minimum number of reads not covering the junction of TE insertion, but supporting TE insertion by paired-end reads on right side/downstream of a non-reference prediction + +nonref_left_junction_threshold + * Minimum number of reads covering the junction of TE insertion on left side/upstream of a non-reference prediction + +nonref_right_junction_threshold + * Minimum number of reads covering the junction of TE insertion on right side/downstream of a non-reference prediction + + +******** +retroseq +******** + +run config +========== +:code:`/path/to/mcclintock/config/retroseq/retroseq_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "-depth" : 200, + "-reads" : 10, + "-q": 20 + } + +-depth + * Max average depth of a region to be considered for calling + +-reads + * Minimum number of reads required to make a call + +-q + * Minimum mapping quality for a read mate that anchors the insertion call + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/retroseq/retroseq_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "read_support_threshold" : 0, + "breakpoint_confidence_threshold" : 6 + } + +read_support_threshold + * Minimum number of correctly mapped read pairs spanning breakpoint for predictions + +breakpoint_confidence_threshold + * Minimum FL tag for predictions. The FL tag ranges from 1-8 and gives information on the breakpoint with 8 being the most confident calls and lower values indicating calls that don’t meet the breakpoint criteria for reasons such as lack of 5’ or 3’ reads + + +******* +tebreak +******* + +run config +========== +:code:`/path/to/mcclintock/config/tebreak/tebreak_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "--minMWP": "0.01", + "--min_minclip" : "3", + "--min_maxclip" : "10", + "--min_sr_per_break" : "1", + "--min_consensus_score" : "0.9", + "--min_chr_len" : "0", + "--max_ins_reads" : "1000", + "--min_split_reads" : "4", + "--min_prox_mapq" : "10", + "--max_N_consensus" : "4", + "--max_disc_fetch" : "50", + "--min_disc_reads" : "4", + "--sr_density" : "2.0", + "--min_ins_match" : "0.90", + "--min_ref_match" : "0.98", + "--min_cons_len" : "250", + "--keep_all_tmp_bams" : False, + "--skip_final_filter" : False, + "--debug": False + } + +--minMWP + * minimum Mann-Whitney P-value for split qualities + +--min_minclip + * min. shortest clipped bases per cluster + +--min_maxclip + * min. longest clipped bases per cluster + +--min_sr_per_break + * minimum split reads per breakend + +--min_consensus_score + * quality of consensus alignment + +--min_chr_len + * minimum chromosome length to consider in discordant site search + +--max_ins_reads + * maximum number of reads to use per insertion call + +--min_split_reads + * minimum total split reads per insertion call + +--min_prox_mapq + * minimum map quality for proximal subread + +--max_N_consensus + * exclude breakend seqs with > this number of N bases + +--max_disc_fetch + * maximum number of discordant reads to fetch per insertion site per BAM + +--min_disc_reads + * if using -d/--disco_target, minimum number of discordant reads to trigger a call + +--sr_density + * maximum split read density in chunk + +--min_ins_match + * (output) minumum match to insertion library + +--min_ref_match + * (output) minimum match to reference genome + +--min_cons_len + * (output) min total consensus length + +--keep_all_tmp_bams + * leave ALL temporary BAMs + +--skip_final_filter + * do not apply final filters or fix for orientation + +--debug + * run in debug mode + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/tebreak/tebreak_post.py` : `GitHub link `_ + +.. code:: python + + MIN_5P_ELT_MATCH = 0.0 + MIN_3P_ELT_MATCH = 0.0 + MIN_5P_GENOME_MATCH = 0.0 + MIN_3P_GENOME_MATCH = 0.0 + MIN_SPLIT_READS_5P = 0 + MIN_SPLIT_READS_3P = 0 + MIN_REMAPPED_DISCORDANT = 0 + MIN_REMAP_DISC_FRACTION = 0.0 + MIN_REMAPPED_SPLITREADS = 0 + MIN_REMAP_SPLIT_FRACTION = 0.0 + +MIN_5P_ELT_MATCH + * Minimum fraction of bases matched to reference for inserted sequence on insertion seqment of 5' supporting contig + +MIN_3P_ELT_MATCH + * Minimum fraction of bases matched to reference for inserted sequence on insertion seqment of 3' supporting contig + +MIN_5P_GENOME_MATCH + * Minimum fraction of bases matched to reference genome on genomic segment of 5' supporting contig + +MIN_3P_GENOME_MATCH + * Minimum fraction of bases matched to reference genome on genomic segment of 3' supporting contig + +MIN_SPLIT_READS_5P + * Minimum number of split reads supporting 5' end of the insertion + +MIN_SPLIT_READS_3P + * Minimum number of split reads supporting 3' end of the insertion + +MIN_REMAPPED_DISCORDANT + * Minimum number of discordant read ends re-mappable to insertion reference sequence + +MIN_REMAP_DISC_FRACTION + * Minimum proportion of remapped discordant reads mapping to the reference insertion sequence + +MIN_REMAPPED_SPLITREADS + * Minimum number of split reads re-mappable to insertion reference sequence + +MIN_REMAP_SPLIT_FRACTION + * Minimum proportion of remapped split reads mapping to the reference insertion sequence + +****** +teflon +****** + +run config +========== +:code:`/path/to/mcclintock/config/teflon/teflon_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "-q": 20, + "-sd" : None, + "-cov" : None, + "-n1" : 1, + "-n2" : 1, + "-lt" : 1, + "-ht" : None + } + +-q + * Map quality threshold. Mapped reads with map qualities lower than this number will be discarded + +-sd + * Insert size standard deviation. Used to manually override the insert size StdDev identified by samtools stat (check this number in the generated stats.txt file to ensure it seems more or less correct based on knowledge of sequencing library!) + * Set to :code:`None` if you want this value to be calculated by TEFLoN + +-cov + * Coverage override. Used to manually override the coverage estimate if you get the error: :code:`Warning: coverage could not be estimated` + +-n1 + * TEs must be supported by >= n reads in at least one sample + +-n2 + * TEs must be supported by >= n reads summed across all samples + +-lt + * sites genotyped as -9 if adjusted read counts lower than this threshold + +-ht + * sites genotyped as -9 if adjusted read counts higher than this threshold + * Set to :code:`None` if you want this value to be calculated by TEFLoN + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/teflon/teflon_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "min_presence_reads" : 3, + "max_absence_reads" : None, + "min_presence_fraction" : 0.1, + "require_tsd" : True, + "require_both_breakpoints" : True + + } + +min_presence_reads + * Minimum number of reads supporting the presence of the insertion + +max_absence_reads + * Maximum number of reads that support the absence of the insertion + +min_presence_fraction + * The minimum fraction of reads supporting the presence of the insertion. presence/(absence+presence) + +require_tsd + * If :code:`True`, non-ref predictions must have a target site duplication + +require_both_breakpoints + * If :code:`True`, non-ref predictions must have both breakpoints predicted + + +********* +te-locate +********* + +run config +========== +:code:`/path/to/mcclintock/config/telocate/telocate_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "max_mem" : 4, + "min_distance" : 5, + "min_support_reads" : 3, + "min_support_individuals" : 1 + } + +max_mem + * Max memory to use (GB) + +min_distance + * resolution for the loci, if a supporting read-pair is found in a distance up to this value it is counted for the same event. Multiplied by median insert size. + +min_support_reads + * only events supported by this number of reads in all accessions are kept + +min_support_individuals + * only events supported in this number of individuals are kept. + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/telocate/telocate_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "read_pair_support_threshold" : 0 + } + +read_pair_support_threshold + * Minimum number of read pairs supporting a prediction \ No newline at end of file From e1a9737255987dcffb2667080c79854a9c77082e Mon Sep 17 00:00:00 2001 From: Preston Basting Date: Wed, 26 May 2021 14:15:30 -0400 Subject: [PATCH 07/13] using ART as read simulator, replacing all q scores with 5s --- auxiliary/simulation/mcc_sim.yml | 3 +- auxiliary/simulation/mcclintock_simulation.py | 42 +++++++++++++++++-- .../mcclintock_simulation_analysis.py | 11 ++++- auxiliary/simulation/yeast/yeast_config.json | 2 +- scripts/tebreak/tebreak_post.py | 2 +- 5 files changed, 51 insertions(+), 9 deletions(-) diff --git a/auxiliary/simulation/mcc_sim.yml b/auxiliary/simulation/mcc_sim.yml index 4694462..dd5abed 100644 --- a/auxiliary/simulation/mcc_sim.yml +++ b/auxiliary/simulation/mcc_sim.yml @@ -10,4 +10,5 @@ dependencies: - wgsim - seqtk - samtools - - bedtools \ No newline at end of file + - bedtools + - art diff --git a/auxiliary/simulation/mcclintock_simulation.py b/auxiliary/simulation/mcclintock_simulation.py index e380995..912337a 100644 --- a/auxiliary/simulation/mcclintock_simulation.py +++ b/auxiliary/simulation/mcclintock_simulation.py @@ -42,8 +42,8 @@ def main(): fastq1 = modified_reference.replace(".fasta", "_1.fastq") fastq2 = modified_reference.replace(".fasta", "_2.fastq") if not os.path.exists(fastq1) or not os.path.exists(fastq2): - num_pairs = calculate_num_pairs(modified_reference, args.coverage, args.length, single=args.single) - fastq1, fastq2 = create_synthetic_reads(modified_reference, num_pairs, args.length, args.insert, args.error, x, args.out, run_id=args.runid, seed=args.seed) + # num_pairs = calculate_num_pairs(modified_reference, args.coverage, args.length, single=args.single) + fastq1, fastq2 = create_synthetic_reads(modified_reference, args.coverage, args.length, args.insert, args.error, x, args.out, run_id=args.runid, seed=args.seed) if not os.path.exists(summary_report): run_mcclintock(fastq1, fastq2, args.reference, args.consensus, args.locations, args.taxonomy, x, args.proc, args.out, args.config, args.keep_intermediate, run_id=args.runid, single=args.single, reverse=reverse) @@ -374,7 +374,7 @@ def calculate_num_pairs(fasta, coverage, length, single=False): return num_pairs -def create_synthetic_reads(reference, num_pairs, length, insert, error, rep, out, run_id="", seed=None): +def create_synthetic_reads(reference, coverage, length, insert, error, rep, out, run_id="", seed=None): if seed is not None: random.seed(seed+"create_synthetic_reads"+str(rep)) else: @@ -386,9 +386,43 @@ def create_synthetic_reads(reference, num_pairs, length, insert, error, rep, out fastq2 = reference.replace(".fasta", "_2.fastq") report = reference.replace(".fasta", "_wgsim_report.txt") - command = ["wgsim", "-1", str(length), "-2", str(length), "-d", str(insert), "-N", str(num_pairs), "-S", str(seed_for_wgsim), "-e", str(error), "-h", reference, fastq1, fastq2] + # command = ["wgsim", "-1", str(length), "-2", str(length), "-d", str(insert), "-N", str(num_pairs), "-S", str(seed_for_wgsim), "-e", str(error), "-h", reference, fastq1, fastq2] + command = ["art_illumina", "-ss", "HS25", "-sam", "-i", reference, "-p", "-l", str(length), "-f", str(coverage), "-m", str(insert), "-s", "10", "-o", reference.replace(".fasta", "")] run_command_stdout(command, report) + tmp_fastq1 = reference.replace(".fasta", "") + "1.fq" + tmp_fastq2 = reference.replace(".fasta", "") + "2.fq" + + with open(tmp_fastq1,"r") as infq, open(fastq1,"w") as outfq: + ln = 1 + for line in infq: + line = line.replace("\n","") + if ln == 4: + line = "5"*len(line) + ln = 1 + else: + ln += 1 + + outfq.write(line+"\n") + + with open(tmp_fastq2,"r") as infq, open(fastq2,"w") as outfq: + ln = 1 + for line in infq: + line = line.replace("\n","") + if ln == 4: + line = "5"*len(line) + ln = 1 + else: + ln += 1 + + outfq.write(line+"\n") + + + + + # run_command(["mv", tmp_fastq1, fastq1]) + # run_command(["mv", tmp_fastq2, fastq2]) + return fastq1, fastq2 def run_mcclintock(fastq1, fastq2, reference, consensus, locations, taxonomy, rep, threads, out, config, keep_intermediate, run_id="", reverse=False, single=False): diff --git a/auxiliary/simulation/mcclintock_simulation_analysis.py b/auxiliary/simulation/mcclintock_simulation_analysis.py index 42b74e0..fa295fd 100644 --- a/auxiliary/simulation/mcclintock_simulation_analysis.py +++ b/auxiliary/simulation/mcclintock_simulation_analysis.py @@ -317,8 +317,15 @@ def write_combined_metrics(forward_metrics, reverse_metrics, outfile): with open(outfile, "w") as out: out.write(",".join(forward_metrics[0]) + "\n") for method in metrics.keys(): - precision = metrics[method][0] / (metrics[method][0] + metrics[method][1]) - recall = metrics[method][0] / (metrics[method][0] + metrics[method][2]) + if metrics[method][0] + metrics[method][1] > 0: + precision = metrics[method][0] / (metrics[method][0] + metrics[method][1]) + else: + precision = 0 + + if metrics[method][0] + metrics[method][2] > 0: + recall = metrics[method][0] / (metrics[method][0] + metrics[method][2]) + else: + recall = 0 outline = [method, str(metrics[method][0]), str(metrics[method][1]), str(metrics[method][2]), str(precision), str(recall)] out.write(",".join(outline) + "\n") diff --git a/auxiliary/simulation/yeast/yeast_config.json b/auxiliary/simulation/yeast/yeast_config.json index 60908e8..348e223 100644 --- a/auxiliary/simulation/yeast/yeast_config.json +++ b/auxiliary/simulation/yeast/yeast_config.json @@ -20,6 +20,6 @@ "mcclintock": { "path" : "/home/pjb68507/git//mcclintock/", - "methods" : "ngs_te_mapper,ngs_te_mapper2,relocate,relocate2,temp,temp2,retroseq,popoolationte,popoolationte2,te-locate,teflon" + "methods" : "ngs_te_mapper,ngs_te_mapper2,relocate,relocate2,temp,temp2,retroseq,popoolationte,popoolationte2,te-locate,teflon,jitterbug,tepid,tebreak" } } diff --git a/scripts/tebreak/tebreak_post.py b/scripts/tebreak/tebreak_post.py index 72cbec1..c43ee33 100644 --- a/scripts/tebreak/tebreak_post.py +++ b/scripts/tebreak/tebreak_post.py @@ -53,7 +53,7 @@ def read_insertions(tebreak_out, sample_name, chromosomes, config): else: insert = output.Insertion(output.Tebreak()) insert.chromosome = split_line[header['Chromosome']] - insert.start = int(split_line[header['3_Prime_End']]) + insert.start = int(split_line[header['3_Prime_End']])+1 insert.end = int(split_line[header['5_Prime_End']]) insert.family = split_line[header['Superfamily']] insert.type = "non-reference" From e03c80391a2fac58727b65aae7ca35beee94afab Mon Sep 17 00:00:00 2001 From: Preston Basting Date: Wed, 26 May 2021 15:10:48 -0400 Subject: [PATCH 08/13] using wgsim with qscores as I's --- auxiliary/simulation/mcclintock_simulation.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/auxiliary/simulation/mcclintock_simulation.py b/auxiliary/simulation/mcclintock_simulation.py index 912337a..b887bef 100644 --- a/auxiliary/simulation/mcclintock_simulation.py +++ b/auxiliary/simulation/mcclintock_simulation.py @@ -42,8 +42,8 @@ def main(): fastq1 = modified_reference.replace(".fasta", "_1.fastq") fastq2 = modified_reference.replace(".fasta", "_2.fastq") if not os.path.exists(fastq1) or not os.path.exists(fastq2): - # num_pairs = calculate_num_pairs(modified_reference, args.coverage, args.length, single=args.single) - fastq1, fastq2 = create_synthetic_reads(modified_reference, args.coverage, args.length, args.insert, args.error, x, args.out, run_id=args.runid, seed=args.seed) + num_pairs = calculate_num_pairs(modified_reference, args.coverage, args.length, single=args.single) + fastq1, fastq2 = create_synthetic_reads(modified_reference, args.coverage, num_pairs, args.length, args.insert, args.error, x, args.out, run_id=args.runid, seed=args.seed) if not os.path.exists(summary_report): run_mcclintock(fastq1, fastq2, args.reference, args.consensus, args.locations, args.taxonomy, x, args.proc, args.out, args.config, args.keep_intermediate, run_id=args.runid, single=args.single, reverse=reverse) @@ -374,7 +374,7 @@ def calculate_num_pairs(fasta, coverage, length, single=False): return num_pairs -def create_synthetic_reads(reference, coverage, length, insert, error, rep, out, run_id="", seed=None): +def create_synthetic_reads(reference, coverage, num_pairs, length, insert, error, rep, out, run_id="", seed=None): if seed is not None: random.seed(seed+"create_synthetic_reads"+str(rep)) else: @@ -382,23 +382,25 @@ def create_synthetic_reads(reference, coverage, length, insert, error, rep, out, seed_for_wgsim = random.randint(0,1000) + tmp_fastq1 = reference.replace(".fasta", "") + "1.fq" + tmp_fastq2 = reference.replace(".fasta", "") + "2.fq" + fastq1 = reference.replace(".fasta", "_1.fastq") fastq2 = reference.replace(".fasta", "_2.fastq") report = reference.replace(".fasta", "_wgsim_report.txt") - # command = ["wgsim", "-1", str(length), "-2", str(length), "-d", str(insert), "-N", str(num_pairs), "-S", str(seed_for_wgsim), "-e", str(error), "-h", reference, fastq1, fastq2] - command = ["art_illumina", "-ss", "HS25", "-sam", "-i", reference, "-p", "-l", str(length), "-f", str(coverage), "-m", str(insert), "-s", "10", "-o", reference.replace(".fasta", "")] + command = ["wgsim", "-1", str(length), "-2", str(length), "-d", str(insert), "-N", str(num_pairs), "-S", str(seed_for_wgsim), "-e", str(error), "-h", reference, tmp_fastq1, tmp_fastq2] + # command = ["art_illumina", "-ss", "HS25", "-sam", "-i", reference, "-p", "-l", str(length), "-f", str(coverage), "-m", str(insert), "-s", "10", "-o", reference.replace(".fasta", "")] run_command_stdout(command, report) - tmp_fastq1 = reference.replace(".fasta", "") + "1.fq" - tmp_fastq2 = reference.replace(".fasta", "") + "2.fq" + with open(tmp_fastq1,"r") as infq, open(fastq1,"w") as outfq: ln = 1 for line in infq: line = line.replace("\n","") if ln == 4: - line = "5"*len(line) + line = "I"*len(line) ln = 1 else: ln += 1 @@ -410,7 +412,7 @@ def create_synthetic_reads(reference, coverage, length, insert, error, rep, out, for line in infq: line = line.replace("\n","") if ln == 4: - line = "5"*len(line) + line = "I"*len(line) ln = 1 else: ln += 1 From b4daf2e83ff46b7354d572b69f6543c40eca1e1c Mon Sep 17 00:00:00 2001 From: pbasting Date: Wed, 26 May 2021 16:17:25 -0400 Subject: [PATCH 09/13] config doc contains info for all methods --- docs/running/config.rst | 133 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 132 insertions(+), 1 deletion(-) diff --git a/docs/running/config.rst b/docs/running/config.rst index 36fb844..170ee62 100644 --- a/docs/running/config.rst +++ b/docs/running/config.rst @@ -708,4 +708,135 @@ postprocessing config } read_pair_support_threshold - * Minimum number of read pairs supporting a prediction \ No newline at end of file + * Minimum number of read pairs supporting a prediction + +**** +temp +**** + +run config +========== +:code:`/path/to/mcclintock/config/temp/temp_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + '-x' : 30, + '-m' : 1 + } + +-x + * The minimum score difference between the best hit and the second best hit for considering a read as uniquely mapped. For BWA mem. + +-m + * Number of mismatch allowed when mapping to TE concensus sequences. + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/temp/temp_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "acceptable_insertion_support_classes" : ["1p1"], + "frequency_threshold" : 0.1 + } + +acceptable_insertion_support_classes + * valid options: :code:`1p1`, :code:`2p`, :code:`singleton` + * The class of the insertions that are acceptable for final predictions. + * :code:`1p1` means that the detected insertion is supported by reads at both sides. + * :code:`2p` means the detected insertion is supported by more than 1 read at only 1 side. + * :code:`singleton` means the detected insertion is supported by only 1 read at 1 side. + +frequency_threshold + * minimum frequency of the inserted transposon + + +***** +temp2 +***** + +run config +========== +:code:`/path/to/mcclintock/config/temp2/temp2_run.py` : `GitHub link `_ + +.. code:: python + + PARAMS ={ + "insertion" : { + "-M" : 2, + "-m" : 5, + "-U" : 0.8, + "-N" : 300, + "-T" : False, + "-L" : False, + "-S" : False + }, + + "absence" : { + "-x" : 0 + } + } + +insertion + * :code:`-M` : Percentage of mismatch allowed when anchor to genome. + * :code:`-m` : Percentage of mismatch allowed when mapping to TEs. + * :code:`-U` : The ratio between the second best alignment and the best alignment to judge if a read is uniquely mapped. + * :code:`-N` : window size (+-n) for filtering insertions overlapping reference insertions. + * :code:`-T` : Set this parameter to :code:`True` to allow truncated de novo insertions. + * :code:`-L` : Set this parameter to :code:`True` to use a looser criteria to filter reference annotated copy overlapped insertions. + * :code:`-S` : Set this parameter to :code:`True` to skip insertion length checking; Default is to remove those insertions that are not full length of shorter than 500bp. + +absence + * :code:`-x` : The minimum score difference between the best hit and the second best hit for considering a read as uniquely mapped. For BWA MEM. + + +postprocessing config +===================== +:code:`/path/to/mcclintock/config/temp2/temp2_post.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "acceptable_insertion_support_classes" : ["1p1"], + "frequency_threshold" : 0.1 + } + +acceptable_insertion_support_classes + * valid options: :code:`1p1`, :code:`2p`, :code:`singleton` + * The class of the insertions that are acceptable for final predictions. + * :code:`1p1` means that the detected insertion is supported by reads at both sides. + * :code:`2p` means the detected insertion is supported by more than 1 read at only 1 side. + * :code:`singleton` means the detected insertion is supported by only 1 read at 1 side. + +frequency_threshold + * minimum frequency of the inserted transposon. It generally means what fraction of sequenced genome present this insertion. + + +********** +trimgalore +********** + +:code:`/path/to/mcclintock/config/trimgalore/trimgalore.py` : `GitHub link `_ + +.. code:: python + + PARAMS = { + "single_end": { + "--fastqc": True, + }, + + "paired_end": { + "--fastqc": True, + "--paired": True + } + } + +single_end + * :code:`--fastqc`: Set to true to run fastqc when using single end data + +paired_end + * :code:`--fastqc`: Set to true to run fastqc when using paired end data + * :code:`--paired`: Run trimgalore using length trimming of quality/adapter/RRBS trimmed reads for paired-end data \ No newline at end of file From 657da9478c674683233b193defb5f872e7ddfeb1 Mon Sep 17 00:00:00 2001 From: Preston Basting Date: Fri, 28 May 2021 11:23:51 -0400 Subject: [PATCH 10/13] adding ART as an option for read simulator --- auxiliary/simulation/mcclintock_simulation.py | 53 +++++++------------ 1 file changed, 18 insertions(+), 35 deletions(-) diff --git a/auxiliary/simulation/mcclintock_simulation.py b/auxiliary/simulation/mcclintock_simulation.py index b887bef..78579e7 100644 --- a/auxiliary/simulation/mcclintock_simulation.py +++ b/auxiliary/simulation/mcclintock_simulation.py @@ -43,7 +43,7 @@ def main(): fastq2 = modified_reference.replace(".fasta", "_2.fastq") if not os.path.exists(fastq1) or not os.path.exists(fastq2): num_pairs = calculate_num_pairs(modified_reference, args.coverage, args.length, single=args.single) - fastq1, fastq2 = create_synthetic_reads(modified_reference, args.coverage, num_pairs, args.length, args.insert, args.error, x, args.out, run_id=args.runid, seed=args.seed) + fastq1, fastq2 = create_synthetic_reads(args.sim, modified_reference, args.coverage, num_pairs, args.length, args.insert, args.error, x, args.out, run_id=args.runid, seed=args.seed) if not os.path.exists(summary_report): run_mcclintock(fastq1, fastq2, args.reference, args.consensus, args.locations, args.taxonomy, x, args.proc, args.out, args.config, args.keep_intermediate, run_id=args.runid, single=args.single, reverse=reverse) @@ -71,6 +71,7 @@ def parse_args(): parser.add_argument("--end", type=int, help="The number of replicates to run. [default = 300]", required=False) parser.add_argument("--seed", type=str, help="a seed to the random number generator so runs can be replicated", required=False) parser.add_argument("--runid", type=str, help="a string to prepend to output files so that multiple runs can be run at the same time without file name clashes", required=False) + parser.add_argument("--sim", type=str, help="Short read simulator to use (options=wgsim,art) [default = wgsim]", required=False) parser.add_argument("-s","--single", action="store_true", help="runs the simulation in single ended mode", required=False) args = parser.parse_args() @@ -142,6 +143,13 @@ def parse_args(): elif args.strand != "plus" and args.strand != "minus": sys.exit("ERROR: --strand must be plus or minus \n") + + if args.sim is None: + args.sim = "wgsim" + elif args.sim not in ["wgsim", "art"]: + sys.exit("ERROR: --sim must be wgsim or art \n") + + return args def run_command(cmd_list, log=None): @@ -374,7 +382,7 @@ def calculate_num_pairs(fasta, coverage, length, single=False): return num_pairs -def create_synthetic_reads(reference, coverage, num_pairs, length, insert, error, rep, out, run_id="", seed=None): +def create_synthetic_reads(simulator, reference, coverage, num_pairs, length, insert, error, rep, out, run_id="", seed=None): if seed is not None: random.seed(seed+"create_synthetic_reads"+str(rep)) else: @@ -389,41 +397,16 @@ def create_synthetic_reads(reference, coverage, num_pairs, length, insert, error fastq2 = reference.replace(".fasta", "_2.fastq") report = reference.replace(".fasta", "_wgsim_report.txt") - command = ["wgsim", "-1", str(length), "-2", str(length), "-d", str(insert), "-N", str(num_pairs), "-S", str(seed_for_wgsim), "-e", str(error), "-h", reference, tmp_fastq1, tmp_fastq2] - # command = ["art_illumina", "-ss", "HS25", "-sam", "-i", reference, "-p", "-l", str(length), "-f", str(coverage), "-m", str(insert), "-s", "10", "-o", reference.replace(".fasta", "")] - run_command_stdout(command, report) + if simulator == "wgsim": + command = ["wgsim", "-1", str(length), "-2", str(length), "-d", str(insert), "-N", str(num_pairs), "-S", str(seed_for_wgsim), "-e", str(error), "-h", reference, tmp_fastq1, tmp_fastq2] + else: + command = ["art_illumina", "-ss", "HS25", "--rndSeed", str(seed_for_wgsim), "-sam", "-i", reference, "-p", "-l", str(length), "-f", str(coverage), "-m", str(insert), "-s", "10", "-o", reference.replace(".fasta", "")] - - with open(tmp_fastq1,"r") as infq, open(fastq1,"w") as outfq: - ln = 1 - for line in infq: - line = line.replace("\n","") - if ln == 4: - line = "I"*len(line) - ln = 1 - else: - ln += 1 - - outfq.write(line+"\n") - - with open(tmp_fastq2,"r") as infq, open(fastq2,"w") as outfq: - ln = 1 - for line in infq: - line = line.replace("\n","") - if ln == 4: - line = "I"*len(line) - ln = 1 - else: - ln += 1 - - outfq.write(line+"\n") - - - - - # run_command(["mv", tmp_fastq1, fastq1]) - # run_command(["mv", tmp_fastq2, fastq2]) + + run_command_stdout(command, report) + run_command(["mv", tmp_fastq1, fastq1]) + run_command(["mv", tmp_fastq2, fastq2]) return fastq1, fastq2 From d175797b8739bed3f49b6ee383c00ce75ba13c35 Mon Sep 17 00:00:00 2001 From: pbasting Date: Mon, 7 Jun 2021 11:53:59 -0400 Subject: [PATCH 11/13] updating ngs_te_mapper2 version --- install/envs/mcc_ngs_te_mapper2.yml | 9 ++++++--- install/scripts/ngs_te_mapper2.py | 2 +- internal/install.py | 4 ++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/install/envs/mcc_ngs_te_mapper2.yml b/install/envs/mcc_ngs_te_mapper2.yml index 4f72cde..80bd3aa 100644 --- a/install/envs/mcc_ngs_te_mapper2.yml +++ b/install/envs/mcc_ngs_te_mapper2.yml @@ -8,11 +8,12 @@ dependencies: - _openmp_mutex=4.5=1_gnu - bcftools=1.9=h68d8f2e_9 - bedtools=2.29.2=hc088bd4_0 + - biopython=1.78=py36h8f6f2f9_2 - bwa=0.7.17=hed695b0_7 - bzip2=1.0.8=h7f98852_4 - c-ares=1.11.0=h470a237_1 - - ca-certificates=2020.12.5=ha878542_0 - - certifi=2020.12.5=py36h5fab9bb_0 + - ca-certificates=2021.5.30=ha878542_0 + - certifi=2021.5.30=py36h5fab9bb_0 - curl=7.71.1=he644dc0_8 - entrez-direct=13.9=pl526h375a9b1_0 - expat=2.2.9=he1b5a44_2 @@ -35,6 +36,7 @@ dependencies: - libgfortran-ng=9.3.0=he4bcb1c_17 - libgfortran5=9.3.0=he4bcb1c_17 - libgomp=9.3.0=h5dbcf3e_17 + - liblapack=3.9.0=6_openblas - libnghttp2=1.41.0=hab1572f_1 - libopenblas=0.3.12=pthreads_h4812303_1 - libssh2=1.9.0=hab1572f_5 @@ -42,7 +44,8 @@ dependencies: - minimap2=2.17=hed695b0_3 - ncurses=6.1=hf484d3e_1002 - nettle=3.3=0 - - openssl=1.1.1i=h7f98852_0 + - numpy=1.19.5=py36h2aa4a07_1 + - openssl=1.1.1k=h7f98852_0 - pcre=8.44=he1b5a44_0 - perl=5.26.2=h36c2ea0_1008 - perl-app-cpanminus=1.7044=pl526_1 diff --git a/install/scripts/ngs_te_mapper2.py b/install/scripts/ngs_te_mapper2.py index d463f2e..0888892 100644 --- a/install/scripts/ngs_te_mapper2.py +++ b/install/scripts/ngs_te_mapper2.py @@ -6,7 +6,7 @@ def main(): install_path = snakemake.config['paths']['install']+"/tools/" - raw_name="ngs_te_mapper2-d6ed0941cb847771ba5fa17e82f2ef5b54351bd8" + raw_name="ngs_te_mapper2-220619a3dd00b8c05c3b96328eb2424c6aad623e" method_name = "ngs_te_mapper2" mccutils.remove(snakemake.params.zipfile) diff --git a/internal/install.py b/internal/install.py index 45d3f07..1e563d1 100644 --- a/internal/install.py +++ b/internal/install.py @@ -5,7 +5,7 @@ "temp2": "https://github.com/weng-lab/TEMP2/archive/4d25d050dc7832b82374502573e882a242457a0c.zip", "relocate" : "https://github.com/srobb1/RelocaTE/archive/ce3a2066e15f5c14e2887fdf8dce0485e1750e5b.zip", "ngs_te_mapper" : "https://github.com/bergmanlab/ngs_te_mapper/archive/f9f48996ac346ac86d57edbd00534aa1227b753e.zip", - "ngs_te_mapper2": "https://github.com/bergmanlab/ngs_te_mapper2/archive/d6ed0941cb847771ba5fa17e82f2ef5b54351bd8.zip", + "ngs_te_mapper2": "https://github.com/bergmanlab/ngs_te_mapper2/archive/220619a3dd00b8c05c3b96328eb2424c6aad623e.zip", "popoolationte" : "http://downloads.sourceforge.net/project/popoolationte/popoolationte_1.02.zip", "popoolationte2": "http://downloads.sourceforge.net/project/popoolation-te2/popte2-v1.10.03.jar", "relocate2" : "https://github.com/stajichlab/RelocaTE2/archive/v2.0.1.zip", @@ -22,7 +22,7 @@ "temp2" : "55b6449113e85540f8a39bb02cfec9b0", "relocate" : "64cc60f65154368518529fb57becc3d6", "ngs_te_mapper" : "a00e9962ee3daca9fac915c23c6af203", - "ngs_te_mapper2": "5527299633ea712001fd02785314a273", + "ngs_te_mapper2": "95198ab68fcb4f9dc267460a7f906fae", "popoolationte" : "315d266bd6da7487c919576dc46b92b4", "popoolationte2": "0c06186f0b1df215949f57a5cbb2d039", "relocate2" : "63eb7dbc09ffeb4063aef4857d198b06", From 111bfcc55978a5d4282136385d6929076e48989b Mon Sep 17 00:00:00 2001 From: pbasting Date: Mon, 7 Jun 2021 12:12:35 -0400 Subject: [PATCH 12/13] disabling jitterbug and tepid --- internal/sysconfig.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/sysconfig.py b/internal/sysconfig.py index fbece79..dc8eca4 100644 --- a/internal/sysconfig.py +++ b/internal/sysconfig.py @@ -1,7 +1,7 @@ -ALL_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "relocate2", "temp", "temp2", "retroseq", "popoolationte", "popoolationte2", "te-locate", "teflon", "coverage", "trimgalore","map_reads", "tebreak", "jitterbug", "tepid"] -SINGLE_END_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "coverage", "trimgalore", "map_reads", "tebreak", "tepid"] -MULTI_THREAD_METHODS = ["coverage", "temp", "temp2", "relocate2", "ngs_te_mapper", "ngs_te_mapper2", "popoolationte", "teflon", "trimgalore", "tebreak", "jitterbug", "tepid"] -NO_INSTALL_METHODS = ["trimgalore", "map_reads", "coverage", "relocate2", "tepid"] # no source code to install for these methods, just envs +ALL_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "relocate2", "temp", "temp2", "retroseq", "popoolationte", "popoolationte2", "te-locate", "teflon", "coverage", "trimgalore","map_reads", "tebreak"] +SINGLE_END_METHODS = ["ngs_te_mapper", "ngs_te_mapper2", "relocate", "coverage", "trimgalore", "map_reads", "tebreak"] +MULTI_THREAD_METHODS = ["coverage", "temp", "temp2", "relocate2", "ngs_te_mapper", "ngs_te_mapper2", "popoolationte", "teflon", "trimgalore", "tebreak"] +NO_INSTALL_METHODS = ["trimgalore", "map_reads", "coverage", "relocate2"] # no source code to install for these methods, just envs INPUT_DIR = "{{indir}}" REF_DIR = "{{refdir}}" From a6e02732ab0d88f61fecc3d79bac6e0fc363fa50 Mon Sep 17 00:00:00 2001 From: pbasting Date: Tue, 8 Jun 2021 14:05:06 -0400 Subject: [PATCH 13/13] removing tepid and jitterbug from yeast sim config --- auxiliary/simulation/yeast/yeast_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auxiliary/simulation/yeast/yeast_config.json b/auxiliary/simulation/yeast/yeast_config.json index 348e223..e2a47e8 100644 --- a/auxiliary/simulation/yeast/yeast_config.json +++ b/auxiliary/simulation/yeast/yeast_config.json @@ -20,6 +20,6 @@ "mcclintock": { "path" : "/home/pjb68507/git//mcclintock/", - "methods" : "ngs_te_mapper,ngs_te_mapper2,relocate,relocate2,temp,temp2,retroseq,popoolationte,popoolationte2,te-locate,teflon,jitterbug,tepid,tebreak" + "methods" : "ngs_te_mapper,ngs_te_mapper2,relocate,relocate2,temp,temp2,retroseq,popoolationte,popoolationte2,te-locate,teflon,tebreak" } }