Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge samples in kallisto_quant step #16

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .test/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ pca:
labels:
# columns of sample sheet to use for PCA
- condition
- batch_effect

diffexp:
# samples to exclude (e.g. outliers due to technical problems)
Expand Down
5 changes: 3 additions & 2 deletions .test/config/units.tsv
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
sample unit fragment_len_mean fragment_len_sd fq1 fq2
A 1 ngs-test-data/reads/a.chr21.1.fq ngs-test-data/reads/a.chr21.2.fq
B 1 ngs-test-data/reads/b.chr21.1.fq ngs-test-data/reads/b.chr21.2.fq
B 2 300 14 ngs-test-data/reads/b.chr21.1.fq
C 1 ngs-test-data/reads/a.chr21.1.fq ngs-test-data/reads/a.chr21.2.fq
B 2 ngs-test-data/reads/b.chr21.1.fq ngs-test-data/reads/b.chr21.2.fq
C 1 300 14 ngs-test-data/reads/a.chr21.1.fq
C 2 300 14 ngs-test-data/reads/b.chr21.1.fq
D 1 ngs-test-data/reads/b.chr21.1.fq ngs-test-data/reads/b.chr21.2.fq
1,126 changes: 0 additions & 1,126 deletions .test/report.html

This file was deleted.

4 changes: 2 additions & 2 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@ def all_input(wildcards):

# fragment length distribution plots
wanted_input.extend(
expand("results/plots/fld/{unit.sample}-{unit.unit}.fragment-length-dist.pdf",
unit=units[["sample", "unit"]].itertuples()
expand("results/plots/fld/{sample}.fragment-length-dist.pdf",
sample=samples.index.tolist()
)
)

Expand Down
32 changes: 25 additions & 7 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from snakemake.utils import validate
from itertools import product
import pandas as pd


Expand All @@ -21,7 +22,6 @@ units.index.names = ["sample_id", "unit_id"]
units.index = units.index.set_levels(
[i.astype(str) for i in units.index.levels]) # enforce str in index
validate(units, schema="../schemas/units.schema.yaml")

report: "../report/workflow.rst"

##### wildcard constraints #####
Expand All @@ -46,6 +46,17 @@ def is_single_end(sample, unit):
)
return fq2_present

### check for each sample, that...
for s, r in units.groupby("sample"):
if not ( # all units are single end
all(map(lambda x: is_single_end(x[0], x[1]), product(s, r.unit.values))) or
# all units are paired end
all(map(lambda x: not is_single_end(x[0], x[1]), product(s, r.unit.values)))
):
raise ValueError("kallisto requires units within a sample to either\n"
"all be paired end, or all be single end.\n"
f"Sample {s} has a mix, please fix.")

def get_fastqs(wildcards):
"""Get raw FASTQ files from unit sheet."""
if is_single_end(wildcards.sample, wildcards.unit):
Expand All @@ -55,12 +66,19 @@ def get_fastqs(wildcards):
return [ f"{u.fq1}", f"{u.fq2}" ]

def get_trimmed(wildcards):
if not is_single_end(**wildcards):
# paired-end sample
return expand("results/trimmed/{sample}-{unit}.{group}.fastq.gz",
group=[1, 2], **wildcards)
# single end sample
return expand("results/trimmed/{sample}-{unit}.fastq.gz", **wildcards)
files=[]
sample=wildcards.sample
us=units.loc[sample, "unit"].tolist()
for unit in us:
if not is_single_end(sample, unit):
# paired-end sample
files.extend(
expand( [ "results/trimmed/{sample}-{unit}.{group}.fastq.gz" ],
sample=sample, unit=unit, group=[1, 2])
)
else:
files.extend([ f"results/trimmed/{sample}-{unit}.fastq.gz" ])
return files

def get_bioc_species_pkg(wildcards):
"""Get the package bioconductor package name for the the species in config.yaml"""
Expand Down
11 changes: 4 additions & 7 deletions workflow/rules/diffexp.smk
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

kallisto_output = expand(
"results/kallisto/{unit.sample}-{unit.unit}", unit=units.itertuples())
"results/kallisto/{sample}", sample=samples.index.tolist())


rule compose_sample_sheet:
Expand All @@ -11,11 +11,8 @@ rule compose_sample_sheet:
"results/sleuth/samples.tsv"
group: "sleuth-init"
run:
samples_ = units[["sample", "unit"]].merge(samples, on="sample")
samples_["sample"] = samples_.apply(
lambda row: "{}-{}".format(row["sample"], row["unit"]), axis=1)
samples_ = samples
samples_["path"] = kallisto_output
del samples_["unit"]
samples_.to_csv(output[0], sep="\t")


Expand Down Expand Up @@ -147,11 +144,11 @@ rule plot_fragment_length_dist:
input:
"results/sleuth/all.rds"
output:
report("results/plots/fld/{sample}-{unit}.fragment-length-dist.pdf", caption="../report/fld.rst", category="Fragment length distribution")
report("results/plots/fld/{sample}.fragment-length-dist.pdf", caption="../report/fld.rst", category="Fragment length distribution")
conda:
"../envs/sleuth.yaml"
log:
"logs/plots/fld/{sample}-{unit}.fragment-length-dist.log"
"logs/plots/fld/{sample}.fragment-length-dist.log"
script:
"../scripts/plot-fld.R"

Expand Down
5 changes: 2 additions & 3 deletions workflow/rules/quant.smk
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,14 @@ def kallisto_params(wildcards, input):
extra += " --fusion"
return extra


rule kallisto_quant:
input:
fq=get_trimmed,
idx="results/kallisto/transcripts.idx"
output:
directory("results/kallisto/{sample}-{unit}")
directory("results/kallisto/{sample}")
log:
"results/logs/kallisto/quant/{sample}-{unit}.log"
"results/logs/kallisto/quant/{sample}.log"
params:
extra=kallisto_params
conda:
Expand Down
7 changes: 4 additions & 3 deletions workflow/scripts/plot-fld.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@ sink(log)
sink(log, type="message")

library("sleuth")
library("ggplot2")

so <- sleuth_load(snakemake@input[[1]])

pdf(file = snakemake@output[[1]])
plot_fld(so, paste0(snakemake@wildcards[["sample"]], "-", snakemake@wildcards[["unit"]]))
dev.off()
p <- plot_fld(so, snakemake@wildcards[["sample"]])

ggsave(filename = snakemake@output[[1]], plot = p)
2 changes: 1 addition & 1 deletion workflow/scripts/sleuth-init.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ t2g <- biomaRt::getBM(

samples <- read_tsv(snakemake@input[["samples"]], na = "", col_names = TRUE) %>%
# make everything except the index, sample name and path string a factor
mutate_at( vars(-X1, -sample, -path),
mutate_at( vars(-sample, -path),
list(~factor(.))
)

Expand Down