Updating pipeline to use additional strains fasta file and batch id i…

…nformation
OpenOmics · Apr 3, 2024 · 868189f · 868189f
1 parent 1138f7e
commit 868189f
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 13 deletions.
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -16,6 +16,26 @@ configfile: 'config.json'                      # Generated from user input and c
 workpath = config['project']['workpath']       # Pipeline's output directory
 tmpdir   = config['options']['tmp_dir']        # Temporary directory
 samples2barcodes = config['barcodes']          # Samples to demultiplex, `cat` together
+# Creates a unique sub directory
+# within using the identifer with
+# project level folder. This is to
+# ensure files are not over-written
+# between runs of the pipeline,
+# needs to be added to paths in the
+# project folder.
+batch_id = config['options']['batch_id']                  # Batch Identifer, default: ''.
+strains_fasta = config['options']['additional_strains']   # Additional strains fasta, default: "None"  
+# Build phylogentic tree with
+# additional mpox strains provided
+# via --additional-strains option,
+# If option is not provided, it will
+# resolve to "None"
+add_strains = False if strains_fasta == "None" else True
+strains_fasta = strains_fasta if add_strains else ''
+
+decompress_strains_fasta = False
+if strains_fasta.endswith('.gz') or strains_fasta.endswith('.gzip'):
+    decompress_strains_fasta = True 
 
 # Find list of sample which 
 # have mulitple barcodes, this 
@@ -56,6 +76,13 @@ if use_conda or conda_env_name:
 # Rule DAG built from listed here 
 rule all:
     input:
+        # Uncompress additional strains fasta file,
+        # conditionally run if --additional-strains
+        # option is provided and file is gzipped
+        provided(
+            [join(workpath, "project", "additional_strains.fa")],
+            add_strains and decompress_strains_fasta
+        ),
         # Merge samples with multiple barcodes,
         # @imported from `rule setup` in rules/trim.smk 
         expand(
@@ -83,13 +110,13 @@ rule all:
         # Create input file for MSA, concatenates the ref and 
         # each samples consequence sequence.
         # @imported from `rule concat` in rules/map.smk
-        join(workpath, "project", "consensus.fa"),
+        join(workpath, "project", batch_id, "consensus.fa"),
         # Mutiple sequence alignment (MSA),
         # @imported from `rule mafft` in rules/map.smk
-        join(workpath, "project", "msa.fa"),
+        join(workpath, "project", batch_id, "msa.fa"),
         # Build a phylogentic tree from MSA,
         # @imported from `rule tree` in rules/tree.smk
-        join(workpath, "project", "mpox_phylogeny.raxml.bestTree"),
+        join(workpath, "project", batch_id, "mpox_phylogeny.raxml.bestTree"),
 
 
 # Import rules 

diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk
@@ -81,18 +81,25 @@ rule concat:
     """
     input:
         fas = expand(join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"), name=samples),
+        strain = lambda _: join(workpath, "project", "additional_strains.fa") \
+        if decompress_strains_fasta else [],
     output:
-        fa  = join(workpath, "project", "consensus.fa"),
+        fa  = join(workpath, "project", batch_id, "consensus.fa"),
     params:
         rname  = 'premafft',
-        ref_fa  = config['references']['mpox_pcr_sequence'],
-    conda: depending(conda_yaml_or_named_env, use_conda)
+        ref_fa = config['references']['mpox_pcr_sequence'],
+        # Use decompressed strains fasta file if
+        # a gzipped input file was provided, else
+        # use strains_fa (either file or empty string)
+        strain_fa = lambda _: join(workpath, "project", "additional_strains.fa") \
+        if decompress_strains_fasta else strains_fasta,
+    conda: depending(conda_yaml_or_named_env, use_conda),
     container: depending(config['images']['mpox-seek'], use_singularity)
     shell: 
         """
         # Create FASTA file with the reference genome
         # and the consensus sequence of each sample
-        cat {params.ref_fa} \\
+        cat {params.ref_fa} {params.strain_fa} \\
             {input.fas} \\
             > {output.fa}
         """
@@ -108,9 +115,9 @@ rule mafft:
         Multiple sequence alignment (MSA) FASTA file from mafft.
     """
     input:
-        fa  = join(workpath, "project", "consensus.fa"),
+        fa  = join(workpath, "project", batch_id, "consensus.fa"),
     output:
-        msa = join(workpath, "project", "msa.fa"),
+        msa = join(workpath, "project", batch_id, "msa.fa"),
     params:
         rname  = 'msa',
     conda: depending(conda_yaml_or_named_env, use_conda)

diff --git a/workflow/rules/tree.smk b/workflow/rules/tree.smk
@@ -12,12 +12,12 @@ rule tree:
         Phylogenetic tree (Newick format).
     """
     input:
-        msa  = join(workpath, "project", "msa.fa"),
+        msa = join(workpath, "project", batch_id, "msa.fa"),
     output:
-        nw  = join(workpath, "project", "mpox_phylogeny.raxml.bestTree"),
+        nw  = join(workpath, "project", batch_id, "mpox_phylogeny.raxml.bestTree"),
     params:
         rname  = 'tree',
-        prefix = join(workpath, "project", "mpox_phylogeny"),
+        prefix = join(workpath, "project", batch_id, "mpox_phylogeny"),
     conda: depending(conda_yaml_or_named_env, use_conda)
     container: depending(config['images']['mpox-seek'], use_singularity)
     shell: 
@@ -30,5 +30,6 @@ rule tree:
             --msa {input.msa} \\
             --model GTR+G \\
             --msa-format FASTA \\
-            --prefix {params.prefix}
+            --prefix {params.prefix} \\
+            --seed 42
         """