From 868189fa27e7884b4e4941075311e7a7a1c849a2 Mon Sep 17 00:00:00 2001
From: skchronicles <kuhnsa3@gmail.com>
Date: Wed, 3 Apr 2024 19:49:52 -0400
Subject: [PATCH] Updating pipeline to use additional strains fasta file and
 batch id information

---
 workflow/Snakefile      | 33 ++++++++++++++++++++++++++++++---
 workflow/rules/map.smk  | 19 +++++++++++++------
 workflow/rules/tree.smk |  9 +++++----
 3 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index d32fb3b..2e61a78 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -16,6 +16,26 @@ configfile: 'config.json'                      # Generated from user input and c
 workpath = config['project']['workpath']       # Pipeline's output directory
 tmpdir   = config['options']['tmp_dir']        # Temporary directory
 samples2barcodes = config['barcodes']          # Samples to demultiplex, `cat` together
+# Creates a unique sub directory
+# within using the identifer with
+# project level folder. This is to
+# ensure files are not over-written
+# between runs of the pipeline,
+# needs to be added to paths in the
+# project folder.
+batch_id = config['options']['batch_id']                  # Batch Identifer, default: ''.
+strains_fasta = config['options']['additional_strains']   # Additional strains fasta, default: "None"  
+# Build phylogentic tree with
+# additional mpox strains provided
+# via --additional-strains option,
+# If option is not provided, it will
+# resolve to "None"
+add_strains = False if strains_fasta == "None" else True
+strains_fasta = strains_fasta if add_strains else ''
+
+decompress_strains_fasta = False
+if strains_fasta.endswith('.gz') or strains_fasta.endswith('.gzip'):
+    decompress_strains_fasta = True 
 
 # Find list of sample which 
 # have mulitple barcodes, this 
@@ -56,6 +76,13 @@ if use_conda or conda_env_name:
 # Rule DAG built from listed here 
 rule all:
     input:
+        # Uncompress additional strains fasta file,
+        # conditionally run if --additional-strains
+        # option is provided and file is gzipped
+        provided(
+            [join(workpath, "project", "additional_strains.fa")],
+            add_strains and decompress_strains_fasta
+        ),
         # Merge samples with multiple barcodes,
         # @imported from `rule setup` in rules/trim.smk 
         expand(
@@ -83,13 +110,13 @@ rule all:
         # Create input file for MSA, concatenates the ref and 
         # each samples consequence sequence.
         # @imported from `rule concat` in rules/map.smk
-        join(workpath, "project", "consensus.fa"),
+        join(workpath, "project", batch_id, "consensus.fa"),
         # Mutiple sequence alignment (MSA),
         # @imported from `rule mafft` in rules/map.smk
-        join(workpath, "project", "msa.fa"),
+        join(workpath, "project", batch_id, "msa.fa"),
         # Build a phylogentic tree from MSA,
         # @imported from `rule tree` in rules/tree.smk
-        join(workpath, "project", "mpox_phylogeny.raxml.bestTree"),
+        join(workpath, "project", batch_id, "mpox_phylogeny.raxml.bestTree"),
 
 
 # Import rules 
diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk
index b501feb..d4b485d 100644
--- a/workflow/rules/map.smk
+++ b/workflow/rules/map.smk
@@ -81,18 +81,25 @@ rule concat:
     """
     input:
         fas = expand(join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"), name=samples),
+        strain = lambda _: join(workpath, "project", "additional_strains.fa") \
+        if decompress_strains_fasta else [],
     output:
-        fa  = join(workpath, "project", "consensus.fa"),
+        fa  = join(workpath, "project", batch_id, "consensus.fa"),
     params:
         rname  = 'premafft',
-        ref_fa  = config['references']['mpox_pcr_sequence'],
-    conda: depending(conda_yaml_or_named_env, use_conda)
+        ref_fa = config['references']['mpox_pcr_sequence'],
+        # Use decompressed strains fasta file if
+        # a gzipped input file was provided, else
+        # use strains_fa (either file or empty string)
+        strain_fa = lambda _: join(workpath, "project", "additional_strains.fa") \
+        if decompress_strains_fasta else strains_fasta,
+    conda: depending(conda_yaml_or_named_env, use_conda),
     container: depending(config['images']['mpox-seek'], use_singularity)
     shell: 
         """
         # Create FASTA file with the reference genome
         # and the consensus sequence of each sample
-        cat {params.ref_fa} \\
+        cat {params.ref_fa} {params.strain_fa} \\
             {input.fas} \\
             > {output.fa}
         """
@@ -108,9 +115,9 @@ rule mafft:
         Multiple sequence alignment (MSA) FASTA file from mafft.
     """
     input:
-        fa  = join(workpath, "project", "consensus.fa"),
+        fa  = join(workpath, "project", batch_id, "consensus.fa"),
     output:
-        msa = join(workpath, "project", "msa.fa"),
+        msa = join(workpath, "project", batch_id, "msa.fa"),
     params:
         rname  = 'msa',
     conda: depending(conda_yaml_or_named_env, use_conda)
diff --git a/workflow/rules/tree.smk b/workflow/rules/tree.smk
index 7759036..79c39d0 100644
--- a/workflow/rules/tree.smk
+++ b/workflow/rules/tree.smk
@@ -12,12 +12,12 @@ rule tree:
         Phylogenetic tree (Newick format).
     """
     input:
-        msa  = join(workpath, "project", "msa.fa"),
+        msa = join(workpath, "project", batch_id, "msa.fa"),
     output:
-        nw  = join(workpath, "project", "mpox_phylogeny.raxml.bestTree"),
+        nw  = join(workpath, "project", batch_id, "mpox_phylogeny.raxml.bestTree"),
     params:
         rname  = 'tree',
-        prefix = join(workpath, "project", "mpox_phylogeny"),
+        prefix = join(workpath, "project", batch_id, "mpox_phylogeny"),
     conda: depending(conda_yaml_or_named_env, use_conda)
     container: depending(config['images']['mpox-seek'], use_singularity)
     shell: 
@@ -30,5 +30,6 @@ rule tree:
             --msa {input.msa} \\
             --model GTR+G \\
             --msa-format FASTA \\
-            --prefix {params.prefix}
+            --prefix {params.prefix} \\
+            --seed 42
         """
\ No newline at end of file