Merge pull request #87 from bergmanlab/tebreak
adding tebreak as an additional method. Reformatting config files to be consistent across methods. Updating documentation.
pbasting authored Jun 8, 2021
2 parents a393b73 + a6e0273 commit 93369ef
Showing 92 changed files with 2,064 additions and 908 deletions.
5 changes: 4 additions & 1 deletion Snakefile
@@ -11,6 +11,7 @@ include: config['args']['mcc_path']+"/snakefiles/retroseq.snakefile"
include: config['args']['mcc_path']+"/snakefiles/tepid.snakefile"
include: config['args']['mcc_path']+"/snakefiles/teflon.snakefile"
include: config['args']['mcc_path']+"/snakefiles/jitterbug.snakefile"
include: config['args']['mcc_path']+"/snakefiles/tebreak.snakefile"

rule setup_reads:
input:
@@ -223,7 +224,9 @@ rule sam_to_bam:
output:
bam = config['mcc']['bam'],
flagstat = config['mcc']['flagstat'],
tmp_bam = temp(config['mcc']['mcc_files']+config['args']['run_id']+".tmp.bam")
metrics = config['mcc']['mcc_files']+config['args']['run_id']+".metrics",
tmp_bam = temp(config['mcc']['mcc_files']+config['args']['run_id']+".tmp.bam"),
tmp2_bam = temp(config['mcc']['mcc_files']+config['args']['run_id']+".tmp2.bam")

script:
config['args']['mcc_path']+"/scripts/preprocessing/sam_to_bam.py"
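For context, each detection method hooks into the pipeline through its own snakefile that is pulled in at parse time, which is all the tebreak line above does. A minimal sketch of that include pattern follows; the rule and its targets are illustrative placeholders, not part of this commit.

# Minimal sketch of the per-method include pattern (placeholder rule, not from the diff)
include: config['args']['mcc_path'] + "/snakefiles/tebreak.snakefile"

rule run_all_methods:
    input:
        config['mcc']['bam'],       # produced by the sam_to_bam rule above
        config['mcc']['flagstat']
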
3 changes: 2 additions & 1 deletion auxiliary/simulation/mcc_sim.yml
@@ -10,4 +10,5 @@ dependencies:
- wgsim
- seqtk
- samtools
- bedtools
- bedtools
- art
25 changes: 22 additions & 3 deletions auxiliary/simulation/mcclintock_simulation.py
@@ -43,7 +43,7 @@ def main():
fastq2 = modified_reference.replace(".fasta", "_2.fastq")
if not os.path.exists(fastq1) or not os.path.exists(fastq2):
num_pairs = calculate_num_pairs(modified_reference, args.coverage, args.length, single=args.single)
fastq1, fastq2 = create_synthetic_reads(modified_reference, num_pairs, args.length, args.insert, args.error, x, args.out, run_id=args.runid, seed=args.seed)
fastq1, fastq2 = create_synthetic_reads(args.sim, modified_reference, args.coverage, num_pairs, args.length, args.insert, args.error, x, args.out, run_id=args.runid, seed=args.seed)

if not os.path.exists(summary_report):
run_mcclintock(fastq1, fastq2, args.reference, args.consensus, args.locations, args.taxonomy, x, args.proc, args.out, args.config, args.keep_intermediate, run_id=args.runid, single=args.single, reverse=reverse)
@@ -71,6 +71,7 @@ def parse_args():
parser.add_argument("--end", type=int, help="The number of replicates to run. [default = 300]", required=False)
parser.add_argument("--seed", type=str, help="a seed to the random number generator so runs can be replicated", required=False)
parser.add_argument("--runid", type=str, help="a string to prepend to output files so that multiple runs can be run at the same time without file name clashes", required=False)
parser.add_argument("--sim", type=str, help="Short read simulator to use (options=wgsim,art) [default = wgsim]", required=False)
parser.add_argument("-s","--single", action="store_true", help="runs the simulation in single ended mode", required=False)

args = parser.parse_args()
@@ -142,6 +143,13 @@ def parse_args():
elif args.strand != "plus" and args.strand != "minus":
sys.exit("ERROR: --strand must be plus or minus \n")


if args.sim is None:
args.sim = "wgsim"
elif args.sim not in ["wgsim", "art"]:
sys.exit("ERROR: --sim must be wgsim or art \n")


return args

def run_command(cmd_list, log=None):
@@ -374,20 +382,31 @@ def calculate_num_pairs(fasta, coverage, length, single=False):

return num_pairs

def create_synthetic_reads(reference, num_pairs, length, insert, error, rep, out, run_id="", seed=None):
def create_synthetic_reads(simulator, reference, coverage, num_pairs, length, insert, error, rep, out, run_id="", seed=None):
if seed is not None:
random.seed(seed+"create_synthetic_reads"+str(rep))
else:
random.seed(str(datetime.now())+"create_synthetic_reads"+str(rep))

seed_for_wgsim = random.randint(0,1000)

tmp_fastq1 = reference.replace(".fasta", "") + "1.fq"
tmp_fastq2 = reference.replace(".fasta", "") + "2.fq"

fastq1 = reference.replace(".fasta", "_1.fastq")
fastq2 = reference.replace(".fasta", "_2.fastq")
report = reference.replace(".fasta", "_wgsim_report.txt")

command = ["wgsim", "-1", str(length), "-2", str(length), "-d", str(insert), "-N", str(num_pairs), "-S", str(seed_for_wgsim), "-e", str(error), "-h", reference, fastq1, fastq2]
if simulator == "wgsim":
command = ["wgsim", "-1", str(length), "-2", str(length), "-d", str(insert), "-N", str(num_pairs), "-S", str(seed_for_wgsim), "-e", str(error), "-h", reference, tmp_fastq1, tmp_fastq2]

else:
command = ["art_illumina", "-ss", "HS25", "--rndSeed", str(seed_for_wgsim), "-sam", "-i", reference, "-p", "-l", str(length), "-f", str(coverage), "-m", str(insert), "-s", "10", "-o", reference.replace(".fasta", "")]


run_command_stdout(command, report)
run_command(["mv", tmp_fastq1, fastq1])
run_command(["mv", tmp_fastq2, fastq2])

return fastq1, fastq2

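To summarize the simulator dispatch added above, here is a condensed sketch assembled from the diff; the helper function itself is hypothetical and the variable names are placeholders. ART is driven by fold coverage (-f) rather than an explicit read-pair count, and with -p and -o <prefix> it writes <prefix>1.fq / <prefix>2.fq, which is why the temporary fastq names end in "1.fq"/"2.fq" before being renamed to the expected "_1.fastq"/"_2.fastq".

# Hypothetical condensation of the wgsim/ART branch shown in create_synthetic_reads
def build_sim_command(simulator, reference, length, insert, coverage, num_pairs, error, seed):
    prefix = reference.replace(".fasta", "")
    if simulator == "wgsim":
        # wgsim takes an explicit number of read pairs (-N) and writes both fastqs directly
        return ["wgsim", "-1", str(length), "-2", str(length), "-d", str(insert),
                "-N", str(num_pairs), "-S", str(seed), "-e", str(error), "-h",
                reference, prefix + "1.fq", prefix + "2.fq"]
    # art_illumina is given fold coverage (-f) and an output prefix (-o)
    return ["art_illumina", "-ss", "HS25", "--rndSeed", str(seed), "-sam", "-i", reference,
            "-p", "-l", str(length), "-f", str(coverage), "-m", str(insert), "-s", "10",
            "-o", prefix]
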
11 changes: 9 additions & 2 deletions auxiliary/simulation/mcclintock_simulation_analysis.py
@@ -317,8 +317,15 @@ def write_combined_metrics(forward_metrics, reverse_metrics, outfile):
with open(outfile, "w") as out:
out.write(",".join(forward_metrics[0]) + "\n")
for method in metrics.keys():
precision = metrics[method][0] / (metrics[method][0] + metrics[method][1])
recall = metrics[method][0] / (metrics[method][0] + metrics[method][2])
if metrics[method][0] + metrics[method][1] > 0:
precision = metrics[method][0] / (metrics[method][0] + metrics[method][1])
else:
precision = 0

if metrics[method][0] + metrics[method][2] > 0:
recall = metrics[method][0] / (metrics[method][0] + metrics[method][2])
else:
recall = 0
outline = [method, str(metrics[method][0]), str(metrics[method][1]), str(metrics[method][2]), str(precision), str(recall)]
out.write(",".join(outline) + "\n")

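The guard added above avoids a ZeroDivisionError when a method reports no predictions. A standalone restatement, assuming metrics[method] holds [TP, FP, FN] counts as the formulas imply:

# Standalone version of the guarded precision/recall computation
def precision_recall(tp, fp, fn):
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    return precision, recall

# e.g. 8 true positives, 2 false positives, 4 false negatives:
# precision_recall(8, 2, 4) -> (0.8, 0.666...); precision_recall(0, 0, 0) -> (0, 0)
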
2 changes: 1 addition & 1 deletion auxiliary/simulation/yeast/yeast_config.json
@@ -20,6 +20,6 @@

"mcclintock": {
"path" : "/home/pjb68507/git//mcclintock/",
"methods" : "ngs_te_mapper,ngs_te_mapper2,relocate,relocate2,temp,temp2,retroseq,popoolationte,popoolationte2,te-locate,teflon"
"methods" : "ngs_te_mapper,ngs_te_mapper2,relocate,relocate2,temp,temp2,retroseq,popoolationte,popoolationte2,te-locate,teflon,tebreak"
}
}
19 changes: 9 additions & 10 deletions config/coverage/coverage.py
@@ -1,10 +1,9 @@

# omits the edges of the TE sequence coverage when calculating average depth across the element
OMIT_EDGES = True

# If OMIT_EDGES = True, use the read length as the length of the edges to omit
OMIT_EDGES_READ_LENGTH = True

# IF OMIT_EDGES = True and OMIT_EDGES_READ_LENGTH = False
# use this value as the length of the edges to omit
OMIT_EDGES_LENGTH = 300
PARAMS = {
# omits the edges of the TE sequence coverage when calculating average depth across the element
"omit_edges": True,
# If OMIT_EDGES = True, use the read length as the length of the edges to omit
"omit_edges_read_length" : True,
# IF OMIT_EDGES = True and OMIT_EDGES_READ_LENGTH = False
# use this value as the length of the edges to omit
"omit_edges_length" : 300
}
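The config files are being moved from module-level constants to a single PARAMS dict per file, which is what makes them "consistent across methods". One plausible way a downstream script could consume this layout is sketched below; the loader and the path are illustrative only, the actual loading code is not part of this diff.

# Hypothetical loader for a PARAMS-style config module
import importlib.util

def load_params(path):
    spec = importlib.util.spec_from_file_location("cfg", path)
    cfg = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(cfg)
    return cfg.PARAMS

params = load_params("config/coverage/coverage.py")
if params["omit_edges"] and not params["omit_edges_read_length"]:
    edge_length = params["omit_edges_length"]  # otherwise the read length is used
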
5 changes: 3 additions & 2 deletions config/ngs_te_mapper/ngs_te_mapper_post.py
@@ -1,2 +1,3 @@

MIN_READ_SUPPORT = 0
PARAMS = {
"min_read_support" : 0
}
7 changes: 4 additions & 3 deletions config/ngs_te_mapper/ngs_te_mapper_run.py
@@ -1,3 +1,4 @@

# max length of junction read overlap to consider a target site duplication
MAX_TSD = 20
PARAMS = {
# max length of junction read overlap to consider a target site duplication
"tsd=" : 20
}
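Note the key here is the literal argument prefix "tsd=" rather than a bare name; presumably the value is appended directly when the ngs_te_mapper call is built. A guess at the intended usage:

# Assumed usage: concatenate key and value to get a "tsd=20" style argument
tsd_arg = "tsd=" + str(PARAMS["tsd="])  # -> "tsd=20"
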
28 changes: 13 additions & 15 deletions config/ngs_te_mapper2/ngs_te_mapper2_run.py
@@ -1,17 +1,15 @@

# merge window for identifying TE clusters
WINDOW = 10

# minimum mapping quality of alignment
MIN_MAPQ = 20

# minimum allele frequency
MIN_ALLELE_FREQUENCY = 0.1

# max length of junction read overlap to consider a target site duplication
MAX_TSD = 25

# maximum gap size
MAX_GAP = 5
PARAMS = {
"--window" : 10,
"--min_mapq" : 20,
"--min_af" : 0.1,
"--tsd_max" : 25,
"--gap_max" : 5
}

# --window WINDOW merge window for identifying TE clusters (default = 10)
# --min_mapq MIN_MAPQ minimum mapping quality of alignment (default = 20)
# --min_af MIN_AF minimum allele frequency (default = 0.1)
# --tsd_max TSD_MAX maximum TSD size (default = 25)
# --gap_max GAP_MAX maximum gap size (default = 5)
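Because the keys in this PARAMS dict are the tool's own long options (documented in the trailing comments), a run command can in principle be assembled directly from it. A sketch, with a placeholder entry point standing in for however the pipeline actually invokes ngs_te_mapper2:

# Sketch only: "ngs_te_mapper2" is a placeholder entry point
cmd = ["ngs_te_mapper2"]
for flag, value in PARAMS.items():
    cmd += [flag, str(value)]
# -> ngs_te_mapper2 --window 10 --min_mapq 20 --min_af 0.1 --tsd_max 25 --gap_max 5
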


12 changes: 6 additions & 6 deletions config/popoolationte/popoolationte_post.py
@@ -1,6 +1,6 @@

# requires that final results have support on both ends of prediction
REQUIRE_BOTH_END_SUPPORT=True

# threshold for the minimum acceptable fraction of the reads supporting the prediction
PERCENT_READ_SUPPORT_THRESHOLD=0.1
PARAMS = {
# requires that final results have support on both ends of prediction
"require_both_end_support" : True,
# threshold for the minimum acceptable fraction of the reads supporting the prediction
"percent_read_support_threshold" : 0.1
}
51 changes: 21 additions & 30 deletions config/popoolationte/popoolationte_run.py
@@ -1,3 +1,23 @@

PARAMS = {
"identify-te-insertsites.pl" : {
"--min-count" : 3,
"--min-map-qual" : 15
},
"crosslink-te-sites.pl" : {
"--single-site-shift": 100
},
"update-teinserts-with-knowntes.pl" : {
"--single-site-shift": 100
},
"estimate-polymorphism.pl" : {
"--min-map-qual": 15
},
"filter-teinserts.pl" : {
"--min-count": 5
}
}

'''
NAME
perl identify-te-insertsites.pl - Identifies TE insertion sites (forward
@@ -11,27 +11,7 @@
--min-map-qual
the minimum mapping quality; this will only apply to reads mapping
to a reference contig.
'''
IDENTIFY_TE_INSERTSITES = {
"min-count" : 3,
"min-map-qual" : 15
}




CROSSLINK_TE_SITES = {
"single-site-shift": 100
}


UPDATE_TEINSERTS_WITH_KNOWNTES = {
"single-site-shift": 100
}


'''
NAME
perl estimate-polymorphism.pl - Estimte the insertion frequencies for a
given set of TE insertions
@@ -43,13 +43,4 @@
--min-map-qual
the minimum mapping quality
'''

ESTIMATE_POLYMORPHISM = {
"min-map-qual": 15
}


FILTER = {
"min-count": 5
}
'''
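Here PARAMS is nested by the PoPoolationTE perl script each option belongs to. A sketch of how one step's options could be expanded into a command line; the script path is a placeholder, not taken from the repository.

# Sketch: expand the options for one PoPoolationTE step (path is illustrative only)
step = "identify-te-insertsites.pl"
cmd = ["perl", "/path/to/popoolationte/" + step]
for flag, value in PARAMS[step].items():
    cmd += [flag, str(value)]
# -> perl .../identify-te-insertsites.pl --min-count 3 --min-map-qual 15
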
11 changes: 6 additions & 5 deletions config/popoolationte2/popoolationte2_post.py
@@ -1,6 +1,7 @@
PARAMS = {
# require that the TE prediction have support from both junctions
"require_both_end_support" : True,

# require that the TE prediction have support from both junctions
REQUIRE_BOTH_END_SUPPORT = True

# threshold for the minimum acceptable (average physical coverage supporting the insertion of the given TE) / (average physical coverage)
FREQUENCY_THRESHOLD=0.1
# threshold for the minimum acceptable (average physical coverage supporting the insertion of the given TE) / (average physical coverage)
"frequency_threshold" : 0.1
}
64 changes: 37 additions & 27 deletions config/popoolationte2/popoolationte2_run.py
@@ -1,4 +1,39 @@

PARAMS = {
"ppileup" : {
"--map-qual": 15,
"--sr-mindist" : 10000,
"--id-up-quant": 0.01
},

"subsampleppileup" : {
"run" : False,
"--target-coverage": 100,
"--with-replace": False
},

"identifySignatures" : {
"--min-count": 2.0,
"--signature-window": "median",
"--min-valley": "median",
"--chunk-distance": 5
},

"updateStrand" : {
"--map-qual": 15,
"--max-disagreement": 0.1,
"--sr-mindist": 10000,
"--id-up-quant": 0.01
},

"pairupSignatures" : {
"--min-distance": -100,
"--max-distance": 500,
"--max-freq-diff": 1.0
}

}

#ppileup
# create a physical pileup file from one or multiple bam files

@@ -11,11 +46,6 @@
# --id-up-quant paired end fragments with an insert size in the upper
# quantile will be ignored [fraction]; default=0.01

ppileup = {
"map-qual": 15,
"sr-mindist" : 10000,
"id-up-quant": 0.01
}


# subsampleppileup
@@ -28,11 +58,7 @@
# == Parameters for fine tuning ==
# --with-replace use sampling with replacement instead of without replacement

subsampleppileup = {
"run" : False,
"target-coverage": 100,
"with-replace": False
}



# identifySignatures
@@ -51,12 +77,7 @@
# --chunk-distance minimum distance between chromosomal chunks in multiples
# of --min-valley [int]; default=5

identifySignatures = {
"min-count": 2.0,
"signature-window": "median",
"min-valley": "median",
"chunk-distance": 5
}



# updateStrand
@@ -73,12 +94,6 @@
# --id-up-quant paired end fragments with an insert size in the upper
# quantile will be ignored [fraction]; default=0.01

updateStrand = {
"map-qual": 15,
"max-disagreement": 0.1,
"sr-mindist": 10000,
"id-up-quant": 0.01
}


# pairupSignatures
@@ -92,8 +107,3 @@
# --max-freq-diff the maximum frequency difference between signatures;
# default=1.0

pairupSignatures = {
"min-distance": -100,
"max-distance": 500,
"max-freq-diff": 1.0
}
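The PoPoolationTE2 config follows the same nested pattern, with two wrinkles visible above: the subsampleppileup entry carries a "run" switch that is not a command-line option, and "--with-replace" is a boolean flag that takes no value. A sketch of how a consumer could handle both, assuming the tool is invoked as a jar (the jar path is a placeholder):

# Sketch: build the subsampleppileup step only if its "run" switch is on
sub = PARAMS["subsampleppileup"]
if sub["run"]:
    cmd = ["java", "-jar", "/path/to/popte2.jar", "subsampleppileup"]
    for flag, value in sub.items():
        if flag == "run":
            continue                      # internal switch, not passed to the tool
        if isinstance(value, bool):
            if value:
                cmd.append(flag)          # boolean flags like --with-replace take no value
        else:
            cmd += [flag, str(value)]
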
11 changes: 6 additions & 5 deletions config/relocate/relocate_post.py
@@ -1,5 +1,6 @@
REF_LEFT_THRESHOLD = 0
REF_RIGHT_THRESHOLD = 0

NONREF_LEFT_THRESHOLD = 0
NONREF_RIGHT_THRESHOLD = 0
PARAMS = {
"ref_left_threshold" : 0,
"ref_right_threshold" : 0,
"nonref_left_threshold" : 0,
"nonref_right_threshold" : 0
}
