Adding first step to plot coverage

OpenOmics · Jun 4, 2024 · db1b0b2 · db1b0b2
1 parent 28f5b4a
commit db1b0b2
Show file tree

Hide file tree

Showing 6 changed files with 99 additions and 8 deletions.
diff --git a/config/genome.json b/config/genome.json
@@ -2,4 +2,4 @@
     "references": {
         "mpox_pcr_sequence": "resources/mpox_NC_003310_1_pcr_sequence.fa"
     }
-}
+}
diff --git a/mpox-seek b/mpox-seek
@@ -400,6 +400,7 @@ def parsed_arguments(name, description):
                 [--additional-strains ADDITIONAL_STRAINS] \\
                 [--batch-id BATCH_ID] \\
                 [--bootstrap-trees] \\
+                [--plot-coverage] \\
                 --input INPUT [INPUT ...] \\
                 --output OUTPUT
 
@@ -469,6 +470,15 @@ def parsed_arguments(name, description):
                                 calculated by bootstrapping, will be added to the best 
                                 scoring tree. 
                                   Example: --bootstrap-trees
+        
+          --plot-coverage
+                                Plots coverage of along the reference genome. If this flag            
+                                is provided, per-sample plots of raw and library normalized 
+                                coverage will be created. This plot can be useful for ident-
+                                ifying samples or regions of the reference genome with low 
+                                coverage. The library normalized coverage track is useful 
+                                for comparing coverage across samples.
+                                  Example: --plot-coverage
 
         {3}{4}Orchestration options:{5}
           --mode {{local,slurm}}  
@@ -685,6 +695,15 @@ def parsed_arguments(name, description):
         help = argparse.SUPPRESS
     )
 
+    # Plot coverage of samples
+    subparser_run.add_argument(
+        '--plot-coverage',
+        action = 'store_true',
+        required = False,
+        default = False,
+        help = argparse.SUPPRESS
+    )
+
     # Optional Arguments
     # Add custom help message
     subparser_run.add_argument(

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -41,6 +41,9 @@ if strains_fasta.endswith('.gz') or strains_fasta.endswith('.gzip'):
 # to the best tree via bootstrapping data 
 bootstrap_trees = str_bool(config['options']['bootstrap_trees'])
 
+# Plot coverage of reads across the genome
+plot_coverage = str_bool(config['options']['plot_coverage'])
+
 # Find list of sample which 
 # have mulitple barcodes, this 
 # means they need to be merged  
@@ -102,7 +105,7 @@ rule all:
         # Align reads against monkeypox reference,
         # @imported from `rule minimap2` in rules/map.smk
         expand(
-            join(workpath, "{name}", "bams", "{name}.sam"),
+            join(workpath, "{name}", "bams", "{name}.bam"),
             name=samples
         ),
         # Create a consensus sequence from alignments
@@ -121,10 +124,23 @@ rule all:
         # Build a phylogentic tree from MSA,
         # @imported from `rule tree` in rules/tree.smk
         join(workpath, "project", batch_id, "mpox_phylogeny.raxml.bestTree"),
+        # Visualize coverage of reads across the genome,
+        # imported from `rule bigwig` in rules/visualize.smk,
+        # These rules are conditionally run if --plot-coverage
+        # option is provided.
+        expand(
+            join(workpath, "{name}", "bams", "{name}.cpm_normalized.bw"),
+            name=provided(samples, plot_coverage)
+        ),
+        expand(
+            join(workpath, "{name}", "bams", "{name}.raw_coverage.bw"),
+            name=provided(samples, plot_coverage)
+        ),
 
 
 # Import rules 
 include: join("rules", "common.smk")
 include: join("rules", "trim.smk")
 include: join("rules", "map.smk")
-include: join("rules", "tree.smk")
+include: join("rules", "tree.smk")
+include: join("rules", "visualize.smk")
diff --git a/workflow/envs/mpox.yaml b/workflow/envs/mpox.yaml
@@ -13,3 +13,5 @@ dependencies:
  - viral_consensus=0.0.4
  - raxml-ng=1.2.1
  - gawk
+ - deeptools
+ - samtools
diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk
@@ -11,7 +11,8 @@ rule minimap2:
     input:
         fq   = join(workpath, "{name}", "fastqs", "{name}.trimmed.fastq.gz"),
     output:
-        sam  = join(workpath, "{name}", "bams", "{name}.sam"),
+        bam  = join(workpath, "{name}", "bams", "{name}.bam"),
+        bai  = join(workpath, "{name}", "bams", "{name}.bam.bai"),
     params:
         rname  = 'minimap2',
         ref_fa  = config['references']['mpox_pcr_sequence'],
@@ -27,7 +28,11 @@ rule minimap2:
             --sam-hit-only \\
             {params.ref_fa} \\
             {input.fq} \\
-        > {output.sam}
+        | samtools sort \\
+            -O bam \\
+            --write-index \\
+            -o {output.bam}##idx##{output.bai} \\
+            -
         """
 
 
@@ -42,7 +47,7 @@ rule consensus:
         Consensus FASTA file with samples names for sequence identifers
     """
     input:
-        sam = join(workpath, "{name}", "bams", "{name}.sam"),
+        bam = join(workpath, "{name}", "bams", "{name}.bam"),
     output:
         fa    = join(workpath, "{name}", "consensus", "{name}_consensus.fa"),
         fixed = join(workpath, "{name}", "consensus", "{name}_consensus_seqid.fa"),
@@ -55,15 +60,15 @@ rule consensus:
         """
         # Create a consensus sequence of aligned reads
         viral_consensus \\
-            -i {input.sam} \\
+            -i {input.bam} \\
             -r {params.ref_fa} \\
             -o {output.fa}
         
         # Rename the sequence identifers in the FASTA 
         # file to contain only the sample name, by
         # default viral_consensus contains the info
         # related to the command that was run.
-        awk '{{split($0,a," "); n=split(a[4],b,"/"); gsub(/\\.sam$/,"",b[n]); if(a[2]) print ">"b[n]; else print; }}' \\
+        awk '{{split($0,a," "); n=split(a[4],b,"/"); gsub(/\\.bam$/,"",b[n]); if(a[2]) print ">"b[n]; else print; }}' \\
             {output.fa} \\
             > {output.fixed}
         """

diff --git a/workflow/rules/visualize.smk b/workflow/rules/visualize.smk
@@ -0,0 +1,49 @@
+# Data processing rules to convert SAM to BigWig
+# for visualization of coverage along the genome
+rule bigwig:
+    """
+    Data-processing step to convert the align reads against NCBI MonkeyPox reference 
+    sequence ("NC_003310.1"): https://www.ncbi.nlm.nih.gov/nuccore/NC_003310.1 into 
+    a raw/normalized bigwig files for visualization. CPM normalization is peformed
+    to take into account the total number of reads in the experiment. The bin size 
+    will be set to 1bp to allow for high resolution visualization. The raw and cpm
+    normalized bigwig files will be plotted for each sample. The normalized bigwig
+    allows for comparison of coverage across samples, while the raw bigwig file can
+    be used to visualize the raw read coverage of each sample to view the observed 
+    sequencing depth along the genome.
+    @Input:
+        SAM file (scatter)
+    @Output:
+        CPM normlized bigwig file
+    """
+    input:
+        bam = join(workpath, "{name}", "bams", "{name}.bam"),
+    output:
+        cpm_bw = join(workpath, "{name}", "bams", "{name}.cpm_normalized.bw"),
+        raw_bw = join(workpath, "{name}", "bams", "{name}.raw_coverage.bw"),
+    params:
+        rname  = 'bamcoverage',
+    conda: depending(conda_yaml_or_named_env, use_conda)
+    container: depending(config['images']['mpox-seek'], use_singularity)
+    shell: 
+        """
+        # Convert SAM to normalized bigwig file
+        # using DeepTools bamCoverage:
+        # https://deeptools.readthedocs.io/en/develop/content/tools/bamCoverage.html
+        bamCoverage \\
+            -b {input.bam} \\
+            -o {output.cpm_bw} \\
+            -of bigwig \\
+            -bs 1 \\
+            -p 1 \\
+            --normalizeUsing CPM
+        
+        # Convert SAM to un-normalized bigwig file
+        # using DeepTools bamCoverage
+        bamCoverage \\
+            -b {input.bam} \\
+            -o {output.raw_bw} \\
+            -of bigwig \\
+            -bs 1 \\
+            -p 1
+        """