Skip to content

Commit

Permalink
Updating pipeline to use additional strains fasta file and batch id i…
Browse files Browse the repository at this point in the history
…nformation
  • Loading branch information
skchronicles committed Apr 3, 2024
1 parent 1138f7e commit 868189f
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 13 deletions.
33 changes: 30 additions & 3 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,26 @@ configfile: 'config.json' # Generated from user input and c
workpath = config['project']['workpath'] # Pipeline's output directory
tmpdir = config['options']['tmp_dir'] # Temporary directory
samples2barcodes = config['barcodes'] # Samples to demultiplex, `cat` together
# Creates a unique sub directory
# within using the identifer with
# project level folder. This is to
# ensure files are not over-written
# between runs of the pipeline,
# needs to be added to paths in the
# project folder.
batch_id = config['options']['batch_id'] # Batch Identifer, default: ''.
strains_fasta = config['options']['additional_strains'] # Additional strains fasta, default: "None"
# Build phylogentic tree with
# additional mpox strains provided
# via --additional-strains option,
# If option is not provided, it will
# resolve to "None"
add_strains = False if strains_fasta == "None" else True
strains_fasta = strains_fasta if add_strains else ''

decompress_strains_fasta = False
if strains_fasta.endswith('.gz') or strains_fasta.endswith('.gzip'):
decompress_strains_fasta = True

# Find list of sample which
# have mulitple barcodes, this
Expand Down Expand Up @@ -56,6 +76,13 @@ if use_conda or conda_env_name:
# Rule DAG built from listed here
rule all:
input:
# Uncompress additional strains fasta file,
# conditionally run if --additional-strains
# option is provided and file is gzipped
provided(
[join(workpath, "project", "additional_strains.fa")],
add_strains and decompress_strains_fasta
),
# Merge samples with multiple barcodes,
# @imported from `rule setup` in rules/trim.smk
expand(
Expand Down Expand Up @@ -83,13 +110,13 @@ rule all:
# Create input file for MSA, concatenates the ref and
# each samples consequence sequence.
# @imported from `rule concat` in rules/map.smk
join(workpath, "project", "consensus.fa"),
join(workpath, "project", batch_id, "consensus.fa"),
# Mutiple sequence alignment (MSA),
# @imported from `rule mafft` in rules/map.smk
join(workpath, "project", "msa.fa"),
join(workpath, "project", batch_id, "msa.fa"),
# Build a phylogentic tree from MSA,
# @imported from `rule tree` in rules/tree.smk
join(workpath, "project", "mpox_phylogeny.raxml.bestTree"),
join(workpath, "project", batch_id, "mpox_phylogeny.raxml.bestTree"),


# Import rules
Expand Down
19 changes: 13 additions & 6 deletions workflow/rules/map.smk
Original file line number Diff line number Diff line change
Expand Up @@ -81,18 +81,25 @@ rule concat:
"""
input:
fas = expand(join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"), name=samples),
strain = lambda _: join(workpath, "project", "additional_strains.fa") \
if decompress_strains_fasta else [],
output:
fa = join(workpath, "project", "consensus.fa"),
fa = join(workpath, "project", batch_id, "consensus.fa"),
params:
rname = 'premafft',
ref_fa = config['references']['mpox_pcr_sequence'],
conda: depending(conda_yaml_or_named_env, use_conda)
ref_fa = config['references']['mpox_pcr_sequence'],
# Use decompressed strains fasta file if
# a gzipped input file was provided, else
# use strains_fa (either file or empty string)
strain_fa = lambda _: join(workpath, "project", "additional_strains.fa") \
if decompress_strains_fasta else strains_fasta,
conda: depending(conda_yaml_or_named_env, use_conda),
container: depending(config['images']['mpox-seek'], use_singularity)
shell:
"""
# Create FASTA file with the reference genome
# and the consensus sequence of each sample
cat {params.ref_fa} \\
cat {params.ref_fa} {params.strain_fa} \\
{input.fas} \\
> {output.fa}
"""
Expand All @@ -108,9 +115,9 @@ rule mafft:
Multiple sequence alignment (MSA) FASTA file from mafft.
"""
input:
fa = join(workpath, "project", "consensus.fa"),
fa = join(workpath, "project", batch_id, "consensus.fa"),
output:
msa = join(workpath, "project", "msa.fa"),
msa = join(workpath, "project", batch_id, "msa.fa"),
params:
rname = 'msa',
conda: depending(conda_yaml_or_named_env, use_conda)
Expand Down
9 changes: 5 additions & 4 deletions workflow/rules/tree.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ rule tree:
Phylogenetic tree (Newick format).
"""
input:
msa = join(workpath, "project", "msa.fa"),
msa = join(workpath, "project", batch_id, "msa.fa"),
output:
nw = join(workpath, "project", "mpox_phylogeny.raxml.bestTree"),
nw = join(workpath, "project", batch_id, "mpox_phylogeny.raxml.bestTree"),
params:
rname = 'tree',
prefix = join(workpath, "project", "mpox_phylogeny"),
prefix = join(workpath, "project", batch_id, "mpox_phylogeny"),
conda: depending(conda_yaml_or_named_env, use_conda)
container: depending(config['images']['mpox-seek'], use_singularity)
shell:
Expand All @@ -30,5 +30,6 @@ rule tree:
--msa {input.msa} \\
--model GTR+G \\
--msa-format FASTA \\
--prefix {params.prefix}
--prefix {params.prefix} \\
--seed 42
"""

0 comments on commit 868189f

Please sign in to comment.