From 868189fa27e7884b4e4941075311e7a7a1c849a2 Mon Sep 17 00:00:00 2001 From: skchronicles Date: Wed, 3 Apr 2024 19:49:52 -0400 Subject: [PATCH] Updating pipeline to use additional strains fasta file and batch id information --- workflow/Snakefile | 33 ++++++++++++++++++++++++++++++--- workflow/rules/map.smk | 19 +++++++++++++------ workflow/rules/tree.smk | 9 +++++---- 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index d32fb3b..2e61a78 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -16,6 +16,26 @@ configfile: 'config.json' # Generated from user input and c workpath = config['project']['workpath'] # Pipeline's output directory tmpdir = config['options']['tmp_dir'] # Temporary directory samples2barcodes = config['barcodes'] # Samples to demultiplex, `cat` together +# Creates a unique sub directory +# within using the identifer with +# project level folder. This is to +# ensure files are not over-written +# between runs of the pipeline, +# needs to be added to paths in the +# project folder. +batch_id = config['options']['batch_id'] # Batch Identifer, default: ''. +strains_fasta = config['options']['additional_strains'] # Additional strains fasta, default: "None" +# Build phylogentic tree with +# additional mpox strains provided +# via --additional-strains option, +# If option is not provided, it will +# resolve to "None" +add_strains = False if strains_fasta == "None" else True +strains_fasta = strains_fasta if add_strains else '' + +decompress_strains_fasta = False +if strains_fasta.endswith('.gz') or strains_fasta.endswith('.gzip'): + decompress_strains_fasta = True # Find list of sample which # have mulitple barcodes, this @@ -56,6 +76,13 @@ if use_conda or conda_env_name: # Rule DAG built from listed here rule all: input: + # Uncompress additional strains fasta file, + # conditionally run if --additional-strains + # option is provided and file is gzipped + provided( + [join(workpath, "project", "additional_strains.fa")], + add_strains and decompress_strains_fasta + ), # Merge samples with multiple barcodes, # @imported from `rule setup` in rules/trim.smk expand( @@ -83,13 +110,13 @@ rule all: # Create input file for MSA, concatenates the ref and # each samples consequence sequence. # @imported from `rule concat` in rules/map.smk - join(workpath, "project", "consensus.fa"), + join(workpath, "project", batch_id, "consensus.fa"), # Mutiple sequence alignment (MSA), # @imported from `rule mafft` in rules/map.smk - join(workpath, "project", "msa.fa"), + join(workpath, "project", batch_id, "msa.fa"), # Build a phylogentic tree from MSA, # @imported from `rule tree` in rules/tree.smk - join(workpath, "project", "mpox_phylogeny.raxml.bestTree"), + join(workpath, "project", batch_id, "mpox_phylogeny.raxml.bestTree"), # Import rules diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk index b501feb..d4b485d 100644 --- a/workflow/rules/map.smk +++ b/workflow/rules/map.smk @@ -81,18 +81,25 @@ rule concat: """ input: fas = expand(join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"), name=samples), + strain = lambda _: join(workpath, "project", "additional_strains.fa") \ + if decompress_strains_fasta else [], output: - fa = join(workpath, "project", "consensus.fa"), + fa = join(workpath, "project", batch_id, "consensus.fa"), params: rname = 'premafft', - ref_fa = config['references']['mpox_pcr_sequence'], - conda: depending(conda_yaml_or_named_env, use_conda) + ref_fa = config['references']['mpox_pcr_sequence'], + # Use decompressed strains fasta file if + # a gzipped input file was provided, else + # use strains_fa (either file or empty string) + strain_fa = lambda _: join(workpath, "project", "additional_strains.fa") \ + if decompress_strains_fasta else strains_fasta, + conda: depending(conda_yaml_or_named_env, use_conda), container: depending(config['images']['mpox-seek'], use_singularity) shell: """ # Create FASTA file with the reference genome # and the consensus sequence of each sample - cat {params.ref_fa} \\ + cat {params.ref_fa} {params.strain_fa} \\ {input.fas} \\ > {output.fa} """ @@ -108,9 +115,9 @@ rule mafft: Multiple sequence alignment (MSA) FASTA file from mafft. """ input: - fa = join(workpath, "project", "consensus.fa"), + fa = join(workpath, "project", batch_id, "consensus.fa"), output: - msa = join(workpath, "project", "msa.fa"), + msa = join(workpath, "project", batch_id, "msa.fa"), params: rname = 'msa', conda: depending(conda_yaml_or_named_env, use_conda) diff --git a/workflow/rules/tree.smk b/workflow/rules/tree.smk index 7759036..79c39d0 100644 --- a/workflow/rules/tree.smk +++ b/workflow/rules/tree.smk @@ -12,12 +12,12 @@ rule tree: Phylogenetic tree (Newick format). """ input: - msa = join(workpath, "project", "msa.fa"), + msa = join(workpath, "project", batch_id, "msa.fa"), output: - nw = join(workpath, "project", "mpox_phylogeny.raxml.bestTree"), + nw = join(workpath, "project", batch_id, "mpox_phylogeny.raxml.bestTree"), params: rname = 'tree', - prefix = join(workpath, "project", "mpox_phylogeny"), + prefix = join(workpath, "project", batch_id, "mpox_phylogeny"), conda: depending(conda_yaml_or_named_env, use_conda) container: depending(config['images']['mpox-seek'], use_singularity) shell: @@ -30,5 +30,6 @@ rule tree: --msa {input.msa} \\ --model GTR+G \\ --msa-format FASTA \\ - --prefix {params.prefix} + --prefix {params.prefix} \\ + --seed 42 """ \ No newline at end of file