snakemake-workflows · dlaehnemann · Feb 14, 2020 · Feb 14, 2020
diff --git a/.test/config/config.yaml b/.test/config/config.yaml
@@ -19,6 +19,7 @@ pca:
   labels:
     # columns of sample sheet to use for PCA
     - condition
+    - batch_effect
 
 diffexp:
   # samples to exclude (e.g. outliers due to technical problems)

diff --git a/.test/config/units.tsv b/.test/config/units.tsv
@@ -1,6 +1,7 @@
 sample	unit	fragment_len_mean	fragment_len_sd	fq1	fq2
 A	1			ngs-test-data/reads/a.chr21.1.fq	ngs-test-data/reads/a.chr21.2.fq
 B	1			ngs-test-data/reads/b.chr21.1.fq	ngs-test-data/reads/b.chr21.2.fq
-B	2	300	14	ngs-test-data/reads/b.chr21.1.fq	
-C	1			ngs-test-data/reads/a.chr21.1.fq	ngs-test-data/reads/a.chr21.2.fq
+B	2			ngs-test-data/reads/b.chr21.1.fq	ngs-test-data/reads/b.chr21.2.fq
+C	1	300	14	ngs-test-data/reads/a.chr21.1.fq
+C	2	300	14	ngs-test-data/reads/b.chr21.1.fq
 D	1			ngs-test-data/reads/b.chr21.1.fq	ngs-test-data/reads/b.chr21.2.fq
diff --git a/.test/report.html b/.test/report.html
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -91,8 +91,8 @@ def all_input(wildcards):
 
     # fragment length distribution plots
     wanted_input.extend(
-            expand("results/plots/fld/{unit.sample}-{unit.unit}.fragment-length-dist.pdf",
-                unit=units[["sample", "unit"]].itertuples()
+            expand("results/plots/fld/{sample}.fragment-length-dist.pdf",
+                sample=samples.index.tolist()
             )
         )
 

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -1,4 +1,5 @@
 from snakemake.utils import validate
+from itertools import product
 import pandas as pd
 
 
@@ -21,7 +22,6 @@ units.index.names = ["sample_id", "unit_id"]
 units.index = units.index.set_levels(
     [i.astype(str) for i in units.index.levels])  # enforce str in index
 validate(units, schema="../schemas/units.schema.yaml")
-
 report: "../report/workflow.rst"
 
 ##### wildcard constraints #####
@@ -46,6 +46,17 @@ def is_single_end(sample, unit):
         )
     return fq2_present
 
+### check for each sample that its units are either all single end or all paired end;
+### kallisto quantifies all units of a sample together and cannot mix the two layouts.
+for s, r in units.groupby("sample"):
+    # Pass the sample name as a whole: iterating over a str (e.g. via
+    # itertools.product) splits it into single characters.
+    single_end = [is_single_end(s, u) for u in r.unit.values]
+    if any(single_end) and not all(single_end):
+        raise ValueError("kallisto requires units within a sample to either\n"
+                            "all be paired end, or all be single end.\n"
+                            f"Sample {s} has a mix, please fix.")
+
 def get_fastqs(wildcards):
     """Get raw FASTQ files from unit sheet."""
     if is_single_end(wildcards.sample, wildcards.unit):
@@ -55,12 +66,19 @@ def get_fastqs(wildcards):
         return [ f"{u.fq1}", f"{u.fq2}" ]
 
 def get_trimmed(wildcards):
-    if not is_single_end(**wildcards):
-        # paired-end sample
-        return expand("results/trimmed/{sample}-{unit}.{group}.fastq.gz",
-                      group=[1, 2], **wildcards)
-    # single end sample
-    return expand("results/trimmed/{sample}-{unit}.fastq.gz", **wildcards)
+    """Get the trimmed FASTQ files of all units of the sample in wildcards."""
+    sample = wildcards.sample
+    trimmed = []
+    for unit in units.loc[sample, "unit"].tolist():
+        if is_single_end(sample, unit):
+            # single-end unit: one trimmed file
+            trimmed.append(f"results/trimmed/{sample}-{unit}.fastq.gz")
+        else:
+            # paired-end unit: one trimmed file per mate (group 1, then 2)
+            trimmed.extend(
+                f"results/trimmed/{sample}-{unit}.{group}.fastq.gz"
+                for group in [1, 2])
+    return trimmed
 
 def get_bioc_species_pkg(wildcards):
     """Get the package bioconductor package name for the the species in config.yaml"""

diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk
@@ -1,6 +1,6 @@
 
 kallisto_output = expand(
-    "results/kallisto/{unit.sample}-{unit.unit}", unit=units.itertuples())
+    "results/kallisto/{sample}", sample=samples.index.tolist())
 
 
 rule compose_sample_sheet:
@@ -11,11 +11,8 @@ rule compose_sample_sheet:
         "results/sleuth/samples.tsv"
     group: "sleuth-init"
     run:
-        samples_ = units[["sample", "unit"]].merge(samples, on="sample")
-        samples_["sample"] = samples_.apply(
-            lambda row: "{}-{}".format(row["sample"], row["unit"]), axis=1)
+        samples_ = samples.copy()  # copy, so the global sample sheet does not gain a "path" column
         samples_["path"] = kallisto_output
-        del samples_["unit"]
         samples_.to_csv(output[0], sep="\t")
 
 
@@ -147,11 +144,11 @@ rule plot_fragment_length_dist:
     input:
         "results/sleuth/all.rds"
     output:
-        report("results/plots/fld/{sample}-{unit}.fragment-length-dist.pdf", caption="../report/fld.rst", category="Fragment length distribution")
+        report("results/plots/fld/{sample}.fragment-length-dist.pdf", caption="../report/fld.rst", category="Fragment length distribution")
     conda:
         "../envs/sleuth.yaml"
     log:
-        "logs/plots/fld/{sample}-{unit}.fragment-length-dist.log"
+        "logs/plots/fld/{sample}.fragment-length-dist.log"
     script:
         "../scripts/plot-fld.R"
 

diff --git a/workflow/rules/quant.smk b/workflow/rules/quant.smk
@@ -23,15 +23,14 @@ def kallisto_params(wildcards, input):
         extra += " --fusion"
     return extra
 
-
 rule kallisto_quant:
     input:
         fq=get_trimmed,
         idx="results/kallisto/transcripts.idx"
     output:
-        directory("results/kallisto/{sample}-{unit}")
+        directory("results/kallisto/{sample}")
     log:
-        "results/logs/kallisto/quant/{sample}-{unit}.log"
+        "results/logs/kallisto/quant/{sample}.log"
     params:
         extra=kallisto_params
     conda:

diff --git a/workflow/scripts/plot-fld.R b/workflow/scripts/plot-fld.R
@@ -3,9 +3,10 @@ sink(log)
 sink(log, type="message")
 
 library("sleuth")
+library("ggplot2")
 
 so <- sleuth_load(snakemake@input[[1]])
 
-pdf(file = snakemake@output[[1]])
-plot_fld(so, paste0(snakemake@wildcards[["sample"]], "-", snakemake@wildcards[["unit"]]))
-dev.off()
+p <- plot_fld(so, snakemake@wildcards[["sample"]])
+
+ggsave(filename = snakemake@output[[1]], plot = p)
diff --git a/workflow/scripts/sleuth-init.R b/workflow/scripts/sleuth-init.R
@@ -26,7 +26,7 @@ t2g <- biomaRt::getBM(
 
 samples <- read_tsv(snakemake@input[["samples"]], na = "", col_names = TRUE) %>%
             # make everything except the index, sample name and path string a factor
-            mutate_at(  vars(-X1, -sample, -path),
+            mutate_at(  vars(-sample, -path),
                         list(~factor(.))
                         )