snakemake-workflows · bouchetj · Aug 6, 2024 · Aug 19, 2024 · Aug 19, 2024 · Aug 19, 2024
diff --git a/.gitignore b/.gitignore
@@ -7,4 +7,6 @@ benchmarks/**
 logs/**
 resources/**
 results/**
-.snakemake/**
+.snakemake/**
+.DS_Store
+settings.json
diff --git a/.test/sra_test/config.yaml b/.test/sra_test/config.yaml
@@ -0,0 +1,70 @@
+# path or URL to sample sheet (TSV format, columns: sample, condition, ...)
+samples: .test/sra_test/samples.tsv
+# path or URL to sequencing unit sheet (TSV format, columns: sample, unit, fq1, fq2)
+# Units are technical replicates (e.g. lanes, or resequencing of the same biological
+# sample).
+units: .test/sra_test/units.tsv
+
+
+ref:
+  # Ensembl species name
+  species: homo_sapiens
+  # Ensembl release (make sure to take one where snpeff data is available, check 'snpEff databases' output)
+  release: 100
+  # Genome build
+  build: GRCh38
+
+trimming:
+  # If you activate trimming by setting this to `True`, you will have to
+  # specify the respective cutadapt adapter trimming flag for each unit
+  # in the `units.tsv` file's `adapters` column
+  activate: False
+
+pca:
+  activate: True
+  # Per default, a separate PCA plot is generated for each of the
+  # `variables_of_interest` and the `batch_effects`, coloring according to
+  # that variables groups.
+  # If you want PCA plots for further columns in the samples.tsv sheet, you
+  # can request them under labels as a list, for example:
+  # - relatively_uninteresting_variable_X
+  # - possible_batch_effect_Y
+  labels: ""
+
+diffexp:
+  # variables for whome you are interested in whether they have an effect on
+  # expression levels
+  variables_of_interest:
+    Treated:
+      # any fold change will be relative to this factor level
+      base_level: no treatment
+    Time:
+      # any fold change will be relative to this factor level
+      base_level: 6 h
+  # variables whose effect you want to model to separate them from your
+  # variables_of_interest
+  batch_effects:
+    - Cell
+  # contrasts for the deseq2 results method to determine fold changes
+  contrasts:
+    Treated:
+      # must be one of the variables_of_interest, for details see:
+      # https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#contrasts
+      variable_of_interest: Treated
+      # must be a level present in the variable_of_interest that is not the
+      # base_level specified above
+      level_of_interest: R848 (5mM) for 6 h
+  # The default model includes all interactions among variables_of_interest
+  # and batch_effects added on. For the example above this implicitly is:
+  # model: ~jointly_handled + treatment_1 * treatment_2
+  # For the default model to be used, simply specify an empty `model: ""` below.
+  # If you want to introduce different assumptions into your model, you can
+  # specify a different model to use, for example skipping the interaction:
+  # model: ~jointly_handled + treatment_1 + treatment_2
+  model: ""
+
+
+params:
+  cutadapt-pe: ""
+  cutadapt-se: ""
+  star: ""
diff --git a/.test/sra_test/samples.tsv b/.test/sra_test/samples.tsv
@@ -0,0 +1,25 @@
+sample_name	Cell	Treated	Time
+SAMN15946684	human neutrophil	no treatment	6 h
+SAMN15946683	human neutrophil	no treatment	6 h
+SAMN15946682	human neutrophil	no treatment	6 h
+SAMN15946681	human neutrophil	R848 (5mM)	6 h
+SAMN15946680	human neutrophil	R848 (5mM)	6 h
+SAMN15946679	human neutrophil	R848 (5mM)	6 h
+SAMN15946678	human neutrophil	no treatment	20 h
+SAMN15946677	human neutrophil	no treatment	20 h
+SAMN15946676	human neutrophil	no treatment	20 h
+SAMN15946675	human neutrophil	R848 (5mM)	20 h
+SAMN15946674	human neutrophil	R848 (5mM)	20 h
+SAMN15946673	human neutrophil	R848 (5mM)	20 h
+SAMN15946672	human CD14+ monocytes	no treatment	6 h
+SAMN15946671	human CD14+ monocytes	no treatment	6 h
+SAMN15946670	human CD14+ monocytes	no treatment	6 h
+SAMN15946704	human CD14+ monocytes	R848 (5mM)	6 h
+SAMN15946703	human CD14+ monocytes	R848 (5mM)	6 h
+SAMN15946702	human CD14+ monocytes	R848 (5mM)	6 h
+SAMN15946701	human CD14+ monocytes	no treatment	20 h
+SAMN15946700	human CD14+ monocytes	no treatment	20 h
+SAMN15946699	human CD14+ monocytes	no treatment	20 h
+SAMN15946698	human CD14+ monocytes	R848 (5mM)	20 h
+SAMN15946697	human CD14+ monocytes	R848 (5mM)	20 h
+SAMN15946696	human CD14+ monocytes	R848 (5mM)	20 h
diff --git a/.test/sra_test/units.tsv b/.test/sra_test/units.tsv
@@ -0,0 +1,25 @@
+sample_name	unit_name	fq1	fq2	sra	adapters	strandedness
+SAMN15946684	GSM4756845			SRR12550894		
+SAMN15946683	GSM4756846			SRR12550895		
+SAMN15946682	GSM4756847			SRR12550896		
+SAMN15946681	GSM4756848			SRR12550897		
+SAMN15946680	GSM4756849			SRR12550898		
+SAMN15946679	GSM4756850			SRR12550899		
+SAMN15946678	GSM4756851			SRR12550900		
+SAMN15946677	GSM4756852			SRR12550901		
+SAMN15946676	GSM4756853			SRR12550902		
+SAMN15946675	GSM4756854			SRR12550903		
+SAMN15946674	GSM4756855			SRR12550904		
+SAMN15946673	GSM4756856			SRR12550905		
+SAMN15946672	GSM4756857			SRR12550906		
+SAMN15946671	GSM4756858			SRR12550907		
+SAMN15946670	GSM4756859			SRR12550908		
+SAMN15946704	GSM4756860			SRR12550909		
+SAMN15946703	GSM4756861			SRR12550910		
+SAMN15946702	GSM4756862			SRR12550911		
+SAMN15946701	GSM4756863			SRR12550912		
+SAMN15946700	GSM4756864			SRR12550913		
+SAMN15946699	GSM4756865			SRR12550914		
+SAMN15946698	GSM4756866			SRR12550915		
+SAMN15946697	GSM4756867			SRR12550916		
+SAMN15946696	GSM4756868			SRR12550917		
diff --git a/config/config.yaml b/config/config.yaml
@@ -63,6 +63,9 @@ diffexp:
   # model: ~jointly_handled + treatment_1 + treatment_2
   model: ""
 
+pathway:
+# Generates the Hallmark pathway analysis and heatmap
+  activate: True
 
 params:
   cutadapt-pe: ""

diff --git a/workflow/envs/pathway.yaml b/workflow/envs/pathway.yaml
@@ -0,0 +1,8 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - r-gsva
+  - r-msigdbr
+  - r-pheatmap
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -1,4 +1,5 @@
 import glob
+import subprocess
 
 import pandas as pd
 from snakemake.utils import validate
@@ -31,6 +32,9 @@ def get_final_output():
         final_output.extend(
             expand("results/pca.{variable}.svg", variable=pca_variables)
         )
+    if config["pathway"]["activate"]:
+        final_output.append("results/pathway/hallmark_GSVA.csv")
+        final_output.append("results/pathway/hallmark_GSVA.svg")
     return final_output
 
 
@@ -53,9 +57,17 @@ def get_cutadapt_input(wildcards):
     unit = units.loc[wildcards.sample].loc[wildcards.unit]
 
     if pd.isna(unit["fq1"]):
-        # SRA sample (always paired-end for now)
+        # SRA sample
         accession = unit["sra"]
-        return expand("sra/{accession}_{read}.fastq", accession=accession, read=[1, 2])
+        # Check if pe or se
+        if is_sra_paired_end(wildcards.sample):
+            # paired-end sample
+            return expand(
+                "sra/{accession}_{read}.fastq", accession=accession, read=[1, 2]
+            )
+        else:
+            # single-end sample
+            return "sra/{}.fastq".format(accession)
 
     if unit["fq1"].endswith("gz"):
         ending = ".gz"
@@ -100,6 +112,44 @@ def is_paired_end(sample):
     return all_paired
 
 
+def is_sra_paired_end(sample):
+    sample_units = units.loc[sample]
+
+    sra_accession = sample_units["sra"]
+
+    if sra_accession.isnull().all():
+        raise ValueError(f"No SRA accession found for sample {sample}")
+
+    # Dictionary to cache SRA accession and their LibraryLayout
+    sra_layout_cache = {}
+
+    def get_library_layout(sra):
+        if sra in sra_layout_cache:
+            return sra_layout_cache[sra]
+
+        try:
+            cmd = f'esearch -db sra -query "{sra}" | efetch -format runinfo | tail -n +2 | cut -d "," -f 16'
+            library_layout = subprocess.check_output(cmd, shell=True).decode("utf-8").strip()
+
+            if library_layout not in ["SINGLE", "PAIRED"]:
+                raise ValueError(f"Unexpected LibraryLayout: {library_layout} for SRA accession {sra}")
+
+            sra_layout_cache[sra] = library_layout
+            return library_layout
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(f"Failed to fetch LibraryLayout for {sra}: {e}")
+
+    # Fetch the LibraryLayout for the given sample
+    library_layout = get_library_layout(sra_accession.iloc[0])
+
+    # Check for consistency among all SRA accessions for this sample
+    for sra in sra_accession.dropna():
+        if get_library_layout(sra) != library_layout:
+            raise ValueError(f"Inconsistent LibraryLayout found among samples with SRA accessions.")
+
+    return library_layout == "PAIRED"
+
+
 def get_fq(wildcards):
     if config["trimming"]["activate"]:
         # activated trimming, use trimmed data
@@ -120,25 +170,27 @@ def get_fq(wildcards):
             "fq1": "results/trimmed/{sample}_{unit}_single.fastq.gz".format(**wildcards)
         }
     else:
-        # no trimming, use raw reads
         u = units.loc[(wildcards.sample, wildcards.unit)]
         if pd.isna(u["fq1"]):
-            # SRA sample (always paired-end for now)
             accession = u["sra"]
-            return dict(
-                zip(
-                    ["fq1", "fq2"],
-                    expand(
-                        "sra/{accession}_{group}.fastq",
-                        accession=accession,
-                        group=["R1", "R2"],
-                    ),
+            if is_sra_paired_end(wildcards.sample):
+                return dict(
+                    zip(
+                        ["fq1", "fq2"],
+                        expand(
+                            "sra/{accession}_{group}.fastq",
+                            accession=accession,
+                            group=["1", "2"],
+                        ),
+                    )
                 )
-            )
-        if not is_paired_end(wildcards.sample):
-            return {"fq1": f"{u.fq1}"}
+            else:
+                return {"fq1": f"sra/{accession}.fastq"}
         else:
-            return {"fq1": f"{u.fq1}", "fq2": f"{u.fq2}"}
+            if not is_paired_end(wildcards.sample):
+                return {"fq1": f"{u.fq1}"}
+            else:
+                return {"fq1": f"{u.fq1}", "fq2": f"{u.fq2}"}
 
 
 def get_strandedness(units):

diff --git a/workflow/rules/pathway.smk b/workflow/rules/pathway.smk
@@ -0,0 +1,12 @@
+rule GSVA:
+    input:
+        "results/deseq2/normcounts.tsv"
+    output:
+        "results/pathway/hallmark_GSVA.csv",
+        "results/pathway/hallmark_GSVA.svg",
+    log:
+        "logs/pathway/gsva.log",
+    conda:
+        "../envs/pathway.yaml"
+    script:
+        "../scripts/pathway.R"
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
@@ -146,6 +146,19 @@ rule rseqc_readgc:
         "read_GC.py -i {input} -o {params.prefix} > {log} 2>&1"
 
 
+rule rseqc_readqual:
+    input:
+        bam="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam"
+    output:
+        "results/qc/rseqc/{sample}_{unit}.readqual.Quality_plot.pdf"
+    log:
+        "logs/rseqc/rseqc_readqual/{sample}_{unit}.log"
+    conda:
+        "../envs/rseqc.yaml"
+    shell:
+        "read_quality.py -i {input.bam} -o {output} > {log} 2>&1"
+
+
 rule multiqc:
     input:
         expand(
@@ -184,6 +197,10 @@ rule multiqc:
             "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.readgc.GC_plot.pdf",
             unit=units.itertuples(),
         ),
+        expand(
+            "results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.readqual.Quality_plot.pdf",
+            unit=units.itertuples(),
+        ),
         expand(
             "logs/rseqc/rseqc_junction_annotation/{unit.sample_name}_{unit.unit_name}.log",
             unit=units.itertuples(),

diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk
@@ -1,4 +1,4 @@
-rule get_sra:
+rule get_sra_pe:
     output:
         "sra/{accession}_1.fastq",
         "sra/{accession}_2.fastq",
@@ -8,6 +8,15 @@ rule get_sra:
         "v3.5.3/bio/sra-tools/fasterq-dump"
 
 
+rule get_sra_se:
+    output:
+        "sra/{accession}.fastq",
+    log:
+        "logs/get-sra/{accession}.log",
+    wrapper:
+        "v3.5.3/bio/sra-tools/fasterq-dump"
+
+
 rule cutadapt_pipe:
     input:
         get_cutadapt_pipe_input,

diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
@@ -55,6 +55,14 @@ properties:
         type: string
     required:
       - contrasts
+
+  pathway:
+    type: object
+    properties:
+      activate:
+        type: boolean
+    required:
+      - activate
 
   params:
     type: object

diff --git a/workflow/scripts/deseq2-init.R b/workflow/scripts/deseq2-init.R
@@ -23,6 +23,7 @@ counts_data <- counts_data[, order(names(counts_data))]
 
 col_data <- read.table(
   snakemake@config[["samples"]],
+  sep = "\t",
   header = TRUE,
   row.names = "sample_name",
   check.names = FALSE