Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pathway Enrichment Analysis #86

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ benchmarks/**
logs/**
resources/**
results/**
.snakemake/**
.snakemake/**
.DS_Store
settings.json
70 changes: 70 additions & 0 deletions .test/sra_test/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# path or URL to sample sheet (TSV format, columns: sample, condition, ...)
samples: .test/sra_test/samples.tsv
# path or URL to sequencing unit sheet (TSV format, columns: sample, unit, fq1, fq2)
# Units are technical replicates (e.g. lanes, or resequencing of the same biological
# sample).
units: .test/sra_test/units.tsv


ref:
# Ensembl species name
species: homo_sapiens
# Ensembl release (make sure to take one where snpeff data is available, check 'snpEff databases' output)
release: 100
# Genome build
build: GRCh38

trimming:
# If you activate trimming by setting this to `True`, you will have to
# specify the respective cutadapt adapter trimming flag for each unit
# in the `units.tsv` file's `adapters` column
activate: False

pca:
activate: True
# Per default, a separate PCA plot is generated for each of the
# `variables_of_interest` and the `batch_effects`, coloring according to
# that variables groups.
# If you want PCA plots for further columns in the samples.tsv sheet, you
# can request them under labels as a list, for example:
# - relatively_uninteresting_variable_X
# - possible_batch_effect_Y
labels: ""

diffexp:
# variables for whome you are interested in whether they have an effect on
# expression levels
variables_of_interest:
Treated:
# any fold change will be relative to this factor level
base_level: no treatment
Time:
# any fold change will be relative to this factor level
base_level: 6 h
# variables whose effect you want to model to separate them from your
# variables_of_interest
batch_effects:
- Cell
# contrasts for the deseq2 results method to determine fold changes
contrasts:
Treated:
# must be one of the variables_of_interest, for details see:
# https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#contrasts
variable_of_interest: Treated
# must be a level present in the variable_of_interest that is not the
# base_level specified above
level_of_interest: R848 (5mM) for 6 h
# The default model includes all interactions among variables_of_interest
# and batch_effects added on. For the example above this implicitly is:
# model: ~jointly_handled + treatment_1 * treatment_2
# For the default model to be used, simply specify an empty `model: ""` below.
# If you want to introduce different assumptions into your model, you can
# specify a different model to use, for example skipping the interaction:
# model: ~jointly_handled + treatment_1 + treatment_2
model: ""


params:
cutadapt-pe: ""
cutadapt-se: ""
star: ""
25 changes: 25 additions & 0 deletions .test/sra_test/samples.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
sample_name Cell Treated Time
SAMN15946684 human neutrophil no treatment 6 h
SAMN15946683 human neutrophil no treatment 6 h
SAMN15946682 human neutrophil no treatment 6 h
SAMN15946681 human neutrophil R848 (5mM) 6 h
SAMN15946680 human neutrophil R848 (5mM) 6 h
SAMN15946679 human neutrophil R848 (5mM) 6 h
SAMN15946678 human neutrophil no treatment 20 h
SAMN15946677 human neutrophil no treatment 20 h
SAMN15946676 human neutrophil no treatment 20 h
SAMN15946675 human neutrophil R848 (5mM) 20 h
SAMN15946674 human neutrophil R848 (5mM) 20 h
SAMN15946673 human neutrophil R848 (5mM) 20 h
SAMN15946672 human CD14+ monocytes no treatment 6 h
SAMN15946671 human CD14+ monocytes no treatment 6 h
SAMN15946670 human CD14+ monocytes no treatment 6 h
SAMN15946704 human CD14+ monocytes R848 (5mM) 6 h
SAMN15946703 human CD14+ monocytes R848 (5mM) 6 h
SAMN15946702 human CD14+ monocytes R848 (5mM) 6 h
SAMN15946701 human CD14+ monocytes no treatment 20 h
SAMN15946700 human CD14+ monocytes no treatment 20 h
SAMN15946699 human CD14+ monocytes no treatment 20 h
SAMN15946698 human CD14+ monocytes R848 (5mM) 20 h
SAMN15946697 human CD14+ monocytes R848 (5mM) 20 h
SAMN15946696 human CD14+ monocytes R848 (5mM) 20 h
25 changes: 25 additions & 0 deletions .test/sra_test/units.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
sample_name unit_name fq1 fq2 sra adapters strandedness
SAMN15946684 GSM4756845 SRR12550894
SAMN15946683 GSM4756846 SRR12550895
SAMN15946682 GSM4756847 SRR12550896
SAMN15946681 GSM4756848 SRR12550897
SAMN15946680 GSM4756849 SRR12550898
SAMN15946679 GSM4756850 SRR12550899
SAMN15946678 GSM4756851 SRR12550900
SAMN15946677 GSM4756852 SRR12550901
SAMN15946676 GSM4756853 SRR12550902
SAMN15946675 GSM4756854 SRR12550903
SAMN15946674 GSM4756855 SRR12550904
SAMN15946673 GSM4756856 SRR12550905
SAMN15946672 GSM4756857 SRR12550906
SAMN15946671 GSM4756858 SRR12550907
SAMN15946670 GSM4756859 SRR12550908
SAMN15946704 GSM4756860 SRR12550909
SAMN15946703 GSM4756861 SRR12550910
SAMN15946702 GSM4756862 SRR12550911
SAMN15946701 GSM4756863 SRR12550912
SAMN15946700 GSM4756864 SRR12550913
SAMN15946699 GSM4756865 SRR12550914
SAMN15946698 GSM4756866 SRR12550915
SAMN15946697 GSM4756867 SRR12550916
SAMN15946696 GSM4756868 SRR12550917
3 changes: 3 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ diffexp:
# model: ~jointly_handled + treatment_1 + treatment_2
model: ""

pathway:
# Generates the Hallmark pathway analysis and heatmap
activate: True

params:
cutadapt-pe: ""
Expand Down
8 changes: 8 additions & 0 deletions workflow/envs/pathway.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- r-gsva
- r-msigdbr
- r-pheatmap
84 changes: 68 additions & 16 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import subprocess

import pandas as pd
from snakemake.utils import validate
Expand Down Expand Up @@ -31,6 +32,9 @@ def get_final_output():
final_output.extend(
expand("results/pca.{variable}.svg", variable=pca_variables)
)
if config["pathway"]["activate"]:
final_output.append("results/pathway/hallmark_GSVA.csv")
final_output.append("results/pathway/hallmark_GSVA.svg")
return final_output


Expand All @@ -53,9 +57,17 @@ def get_cutadapt_input(wildcards):
unit = units.loc[wildcards.sample].loc[wildcards.unit]

if pd.isna(unit["fq1"]):
# SRA sample (always paired-end for now)
# SRA sample
accession = unit["sra"]
return expand("sra/{accession}_{read}.fastq", accession=accession, read=[1, 2])
# Check if pe or se
if is_sra_paired_end(wildcards.sample):
# paired-end sample
return expand(
"sra/{accession}_{read}.fastq", accession=accession, read=[1, 2]
)
else:
# single-end sample
return "sra/{}.fastq".format(accession)

if unit["fq1"].endswith("gz"):
ending = ".gz"
Expand Down Expand Up @@ -100,6 +112,44 @@ def is_paired_end(sample):
return all_paired


def is_sra_paired_end(sample):
sample_units = units.loc[sample]

sra_accession = sample_units["sra"]

if sra_accession.isnull().all():
raise ValueError(f"No SRA accession found for sample {sample}")

# Dictionary to cache SRA accession and their LibraryLayout
sra_layout_cache = {}

def get_library_layout(sra):
if sra in sra_layout_cache:
return sra_layout_cache[sra]

try:
cmd = f'esearch -db sra -query "{sra}" | efetch -format runinfo | tail -n +2 | cut -d "," -f 16'
library_layout = subprocess.check_output(cmd, shell=True).decode("utf-8").strip()

if library_layout not in ["SINGLE", "PAIRED"]:
raise ValueError(f"Unexpected LibraryLayout: {library_layout} for SRA accession {sra}")

sra_layout_cache[sra] = library_layout
return library_layout
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Failed to fetch LibraryLayout for {sra}: {e}")

# Fetch the LibraryLayout for the given sample
library_layout = get_library_layout(sra_accession.iloc[0])

# Check for consistency among all SRA accessions for this sample
for sra in sra_accession.dropna():
if get_library_layout(sra) != library_layout:
raise ValueError(f"Inconsistent LibraryLayout found among samples with SRA accessions.")

return library_layout == "PAIRED"


def get_fq(wildcards):
if config["trimming"]["activate"]:
# activated trimming, use trimmed data
Expand All @@ -120,25 +170,27 @@ def get_fq(wildcards):
"fq1": "results/trimmed/{sample}_{unit}_single.fastq.gz".format(**wildcards)
}
else:
# no trimming, use raw reads
u = units.loc[(wildcards.sample, wildcards.unit)]
if pd.isna(u["fq1"]):
# SRA sample (always paired-end for now)
accession = u["sra"]
return dict(
zip(
["fq1", "fq2"],
expand(
"sra/{accession}_{group}.fastq",
accession=accession,
group=["R1", "R2"],
),
if is_sra_paired_end(wildcards.sample):
return dict(
zip(
["fq1", "fq2"],
expand(
"sra/{accession}_{group}.fastq",
accession=accession,
group=["1", "2"],
),
)
)
)
if not is_paired_end(wildcards.sample):
return {"fq1": f"{u.fq1}"}
else:
return {"fq1": f"sra/{accession}.fastq"}
else:
return {"fq1": f"{u.fq1}", "fq2": f"{u.fq2}"}
if not is_paired_end(wildcards.sample):
return {"fq1": f"{u.fq1}"}
else:
return {"fq1": f"{u.fq1}", "fq2": f"{u.fq2}"}


def get_strandedness(units):
Expand Down
12 changes: 12 additions & 0 deletions workflow/rules/pathway.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
rule GSVA:
input:
"results/deseq2/normcounts.tsv"
output:
"results/pathway/hallmark_GSVA.csv",
"results/pathway/hallmark_GSVA.svg",
log:
"logs/pathway/gsva.log",
conda:
"../envs/pathway.yaml"
script:
"../scripts/pathway.R"
17 changes: 17 additions & 0 deletions workflow/rules/qc.smk
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,19 @@ rule rseqc_readgc:
"read_GC.py -i {input} -o {params.prefix} > {log} 2>&1"


rule rseqc_readqual:
input:
bam="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam"
output:
"results/qc/rseqc/{sample}_{unit}.readqual.Quality_plot.pdf"
log:
"logs/rseqc/rseqc_readqual/{sample}_{unit}.log"
conda:
"../envs/rseqc.yaml"
shell:
"read_quality.py -i {input.bam} -o {output} > {log} 2>&1"


rule multiqc:
input:
expand(
Expand Down Expand Up @@ -184,6 +197,10 @@ rule multiqc:
"results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.readgc.GC_plot.pdf",
unit=units.itertuples(),
),
expand(
"results/qc/rseqc/{unit.sample_name}_{unit.unit_name}.readqual.Quality_plot.pdf",
unit=units.itertuples(),
),
expand(
"logs/rseqc/rseqc_junction_annotation/{unit.sample_name}_{unit.unit_name}.log",
unit=units.itertuples(),
Expand Down
11 changes: 10 additions & 1 deletion workflow/rules/trim.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
rule get_sra:
rule get_sra_pe:
output:
"sra/{accession}_1.fastq",
"sra/{accession}_2.fastq",
Expand All @@ -8,6 +8,15 @@ rule get_sra:
"v3.5.3/bio/sra-tools/fasterq-dump"


rule get_sra_se:
output:
"sra/{accession}.fastq",
log:
"logs/get-sra/{accession}.log",
wrapper:
"v3.5.3/bio/sra-tools/fasterq-dump"


rule cutadapt_pipe:
input:
get_cutadapt_pipe_input,
Expand Down
8 changes: 8 additions & 0 deletions workflow/schemas/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,14 @@ properties:
type: string
required:
- contrasts

pathway:
type: object
properties:
activate:
type: boolean
required:
- activate

params:
type: object
Expand Down
1 change: 1 addition & 0 deletions workflow/scripts/deseq2-init.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ counts_data <- counts_data[, order(names(counts_data))]

col_data <- read.table(
snakemake@config[["samples"]],
sep = "\t",
header = TRUE,
row.names = "sample_name",
check.names = FALSE
Expand Down
Loading
Loading