fix: canonical transcript mapped read extraction (#77)

The main aim of this PR is to get the extraction of reads mapped to the canonical transcripts in the `rule get_mapped_canonical_transcripts` down from about 44GB of memory usage regardless of the input BAM file size and hours of grepping, down to seconds / minutes of extraction time with almost no memory footprint. This happens here: https://github.com/snakemake-workflows/rna-seq-kallisto-sleuth/compare/fix-canonical-transcript-mapped-read-extraction?expand=1#diff-6562a38fb77f8839a8731b8a882bf0d0b683d6268cdffb433dcbe1f360ccedc4R103 This required using a BED file, and thus I switched to generating a valid BED file in the `rule get_canonical_ids` and switched to using that BED file for keeping track of transcript strand information instead of hacking this into the contig names of the reference fasta file. This lead to some cleanup in the workflow. Other things that happened along the way are: * removed an unnecessary `r-dplyr` dependency, as this is pulled in by `r-tidyverse` anyways * cleaned up file names to more clearly reflect what they contain * cleaned up variable / column names in python script to use only lowercase letters * removed some redundancy in wrapper calling by re-`use`ing the `samtools index` rule -- thus, we only have to update the wrapper version in one place and all instances should always stay in sync For now, this is not yet tested. So I'll mark this as a draft to start with. But I wanted it up here to be able to test it on different setups by checking out the branch. --------- Co-authored-by: Lähnemann <d189n@odcf-worker02.inet.dkfz-heidelberg.de> Co-authored-by: Addimator <adpri100@hhu.de>
snakemake-workflows · Jan 30, 2024 · 52b56b0 · 52b56b0
1 parent e7ddff6
commit 52b56b0
Show file tree

Hide file tree

Showing 37 changed files with 436 additions and 526 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -77,9 +77,9 @@ jobs:
       with:
         directory: .test
         snakefile: workflow/Snakefile
-        args: "--use-conda --show-failed-logs --cores 1 --conda-cleanup-pkgs cache --all-temp"
+        args: "--use-conda --show-failed-logs --cores all --conda-cleanup-pkgs cache --all-temp"
 
-  run-3prime-rna-workflow:
+  run-three-prime-rna-workflow:
     runs-on: ubuntu-latest
     needs:
       - linting
@@ -88,15 +88,13 @@ jobs:
 
     - name: Checkout repository
       uses: actions/checkout@v3
-      with:
-        submodules: recursive
 
     - name: Test 3-prime-workflow
       uses: snakemake/snakemake-github-action@v1
       with:
-        directory: .test/3-prime-config
-        snakefile: workflow/Snakefile
-        args: "--use-conda --show-failed-logs --cores 1 --conda-cleanup-pkgs cache --all-temp"
+        directory: .test/three_prime
+        snakefile: .test/three_prime/workflow/Snakefile
+        args: "--use-conda --show-failed-logs --cores all --conda-cleanup-pkgs cache --all-temp"
     # Disable report testing for now since we mark all output files as temporary above.
     # TODO: add some kind of test mode to report generation which does not really try to include
     # results.

diff --git a/.test/3-prime-config/config/samples.tsv b/.test/3-prime-config/config/samples.tsv
diff --git a/.test/3-prime-config/config/units.tsv b/.test/3-prime-config/config/units.tsv
diff --git a/.test/3-prime-config/workflow/Snakefile b/.test/3-prime-config/workflow/Snakefile
diff --git a/.test/three_prime/README.md b/.test/three_prime/README.md
@@ -0,0 +1,12 @@
+The testing data for this test case is downloaded from this Zenodo dataset:
+https://zenodo.org/doi/10.5281/zenodo.10572745
+
+It has been generated by this snakemake workflow:
+https://github.com/dlaehnemann/create-quant-seq-testing-dataset
+
+So it is based from data from this publication:
+
+Corley, S.M., Troy, N.M., Bosco, A. et al. QuantSeq. 3′ Sequencing combined with Salmon provides a fast, reliable approach for high throughput RNA expression analysis. Sci Rep 9, 18895 (2019). https://doi.org/10.1038/s41598-019-55434-x
+
+The MSigDB gene sets are used according to their [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/), which is given here:
+https://www.gsea-msigdb.org/gsea/msigdb_license_terms.jsp
diff --git a/.test/3-prime-config/config/config.yaml → .test/three_prime/config/config.yaml b/.test/3-prime-config/config/config.yaml → .test/three_prime/config/config.yaml
@@ -18,7 +18,7 @@ resources:
     # ensembl species name
     species: homo_sapiens
     # ensembl release version
-    release: "104"
+    release: "111"
     # genome build
     build: GRCh38
     # pfam release to use for annotation of domains in differential splicing analysis
@@ -45,12 +45,12 @@ diffexp:
   # model for sleuth differential expression analysis
   models:
     model_X:
-      full: ~condition + batch_effect
-      reduced: ~batch_effect
+      full: ~condition
+      reduced: ~1
       # Binary valued covariate that shall be used for fold change/effect size
       # based downstream analyses.
       primary_variable: condition
-      base_level: untreated
+      base_level: Control
   # significance level to use for volcano, ma- and qq-plots
   sig-level:
     volcano-plot: 0.05
@@ -82,7 +82,7 @@ enrichment:
     fdr_genes: 0.05
     fdr_go_terms: 0.05
   fgsea:
-    gene_sets_file: "../ngs-test-data/ref/dummy.gmt"
+    gene_sets_file: "config/gene_sets.gmt"
     # tool is only run if set to `true`
     activate: true
     # if activated, you need to provide a GMT file with gene sets of interest

diff --git a/.test/three_prime/config/gene_sets.gmt b/.test/three_prime/config/gene_sets.gmt
@@ -0,0 +1,2 @@
+UROSEVIC_RESPONSE_TO_IMIQUIMOD	https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/UROSEVIC_RESPONSE_TO_IMIQUIMOD	BRCA1	CCL8	CXCL11	IDO1	IFI35	IFI6	IFITM1	IL6	IRF7	ISG15	ISG20	MICB	MX1	OAS2	OASL	PLAAT4	SECTM1	STAT1	TRAFD1
+KEGG_PROTEASOME	https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/KEGG_PROTEASOME	IFNG	POMP	PSMA1	PSMA2	PSMA3	PSMA4	PSMA5	PSMA6	PSMA6P4	PSMA7	PSMA8	PSMB1	PSMB10	PSMB11	PSMB2	PSMB3	PSMB4	PSMB5	PSMB6	PSMB7	PSMB8	PSMB9	PSMC1	PSMC1P4	PSMC2	PSMC3	PSMC4	PSMC5	PSMC6	PSMD1	PSMD11	PSMD12	PSMD13	PSMD14	PSMD2	PSMD3	PSMD4	PSMD6	PSMD7	PSMD8	PSME1	PSME2	PSME3	PSME4	PSMF1	SEM1
diff --git a/.test/three_prime/config/samples.tsv b/.test/three_prime/config/samples.tsv
@@ -0,0 +1,7 @@
+sample	condition
+SRR8309096	Control
+SRR8309094	Control
+SRR8309095	Treated
+SRR8309097	Treated
+SRR8309098	Control
+SRR8309099	Treated
diff --git a/.test/three_prime/config/units.tsv b/.test/three_prime/config/units.tsv
@@ -0,0 +1,7 @@
+sample	unit	fragment_len_mean	fragment_len_sd	fq1	fq2
+SRR8309096	u1	430	43	quant_seq_test_data/SRR8309096.fastq.gz	
+SRR8309094	u1	430	43	quant_seq_test_data/SRR8309094.fastq.gz	
+SRR8309095	u1	430	43	quant_seq_test_data/SRR8309095.fastq.gz	
+SRR8309097	u1	430	43	quant_seq_test_data/SRR8309097.fastq.gz	
+SRR8309098	u1	430	43	quant_seq_test_data/SRR8309098.fastq.gz	
+SRR8309099	u1	430	43	quant_seq_test_data/SRR8309099.fastq.gz	
diff --git a/.test/three_prime/workflow/Snakefile b/.test/three_prime/workflow/Snakefile
@@ -0,0 +1,44 @@
+from snakemake.utils import min_version
+
+min_version("7.17.0")
+
+
+configfile: "config/config.yaml"
+
+
+# declare main workflow as a module
+module rna_seq_kallisto_sleuth:
+    snakefile:
+        "../../../workflow/Snakefile"
+    config:
+        config
+
+
+use rule * from rna_seq_kallisto_sleuth
+
+
+rule download_quant_seq_testing_data:
+    output:
+        "quant_seq_test_data.tar.gz",
+    log:
+        "logs/download_quant_seq_testing_data.log",
+    shell:
+        "wget https://zenodo.org/records/10572746/files/quant_seq_test_data.tar.gz 2>{log} "
+
+
+rule extract_quant_seq_testing_data:
+    input:
+        "quant_seq_test_data.tar.gz",
+    output:
+        "quant_seq_test_data/README.md",
+        "quant_seq_test_data/SRR8309099.fastq.gz",
+        "quant_seq_test_data/SRR8309095.fastq.gz",
+        "quant_seq_test_data/SRR8309096.fastq.gz",
+        "quant_seq_test_data/samples.tsv",
+        "quant_seq_test_data/SRR8309097.fastq.gz",
+        "quant_seq_test_data/SRR8309094.fastq.gz",
+        "quant_seq_test_data/SRR8309098.fastq.gz",
+    log:
+        "logs/extract_quant_seq_testing_data.log",
+    shell:
+        "tar xzfv {input} 2>{log}"
diff --git a/workflow/envs/QC.yaml b/workflow/envs/QC.yaml
@@ -5,9 +5,6 @@ channels:
 dependencies:
   - altair-transform =0.2.0
   - altair =4.2.0
-  - pysam =0.19.1
-  - numpy =1.22.0
-  - altair_saver =0.5.0
-  - scipy =1.7.3
   - matplotlib =3.5.2
-  - pandas =1
+  - pandas =1
+  - numpy =1.22.0
diff --git a/workflow/envs/biomart.yaml b/workflow/envs/biomart.yaml
@@ -2,6 +2,9 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - bioconductor-biomart =2.46
-  - r-tidyverse =1.3
-  - r-dplyr =1.0.9
+  - bioconductor-biomart =2.56
+  - r-tidyverse =2.0
+  # remove once we can update to bioconductor-biomart of the 3.18 release, which will
+  # include this proper fix for the underlying compatibility issue:
+  # https://github.com/Bioconductor/BiocFileCache/pull/50
+  - r-dbplyr=2.3.4
diff --git a/workflow/envs/biopython.yaml b/workflow/envs/biopython.yaml
@@ -3,4 +3,5 @@ channels:
   - bioconda
   - nodefaults
 dependencies:
-  - biopython =1.79
+  - biopython =1.79
+  - pandas >=1.3,<2
diff --git a/workflow/envs/get_canonical_ids.yaml b/workflow/envs/get_canonical_ids.yaml
diff --git a/workflow/envs/pysam.yaml b/workflow/envs/pysam.yaml
@@ -3,4 +3,5 @@ channels:
   - bioconda
   - nodefaults
 dependencies:
-  - pysam =0.19
+  - pysam =0.21
+  - pandas =2.0
diff --git a/workflow/envs/sleuth.yaml b/workflow/envs/sleuth.yaml
@@ -4,11 +4,9 @@ channels:
   - nodefaults
 dependencies:
   - r-sleuth =0.30
-  - bioconductor-rhdf5lib =1.12
-  - bioconductor-rhdf5 =2.34
   - r-gridextra =2.3
   - r-pheatmap =1.0.12
-  - r-tidyverse =1.3
-  - r-ggpubr =0.4
-  - r-base =4.0
-  - bioconductor-limma =3.46
+  - r-tidyverse =2.0
+  - r-ggpubr =0.6
+  - r-base =4
+  - bioconductor-limma =3.56
diff --git a/workflow/resources/datavzrd/diffexp-template.yaml b/workflow/resources/datavzrd/diffexp-template.yaml
@@ -95,7 +95,7 @@ views:
         chromosome_name:
           optional: true
           display-mode: hidden
-        transcript_mane_select:
+        main_transcript_per_gene:
           optional: true
           display-mode: hidden
         ensembl_transcript_id_version:
@@ -247,8 +247,6 @@ views:
           display-mode: normal
         gene_desc:
           display-mode: normal
-        canonical:
-          display-mode: normal
         num_aggregated_transcripts:
           display-mode: normal
         sum_mean_obs_counts:

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -169,7 +169,7 @@ enrichment_env = render_enrichment_env()
 
 def kallisto_quant_input(wildcards):
     if is_3prime_experiment:
-        return "results/canonical_reads/{sample}-{unit}.fastq"
+        return "results/main_transcript_3prime_reads/{sample}-{unit}.fastq"
     elif not is_single_end(wildcards.sample, wildcards.unit):
         return expand(
             "results/trimmed/{{sample}}-{{unit}}.{group}.fastq.gz", group=[1, 2]

diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk
@@ -67,7 +67,8 @@ rule spia_datavzrd:
     log:
         "logs/datavzrd-report/spia-{model}/spia-{model}.log",
     wrapper:
-        "v2.6.0/utils/datavzrd"
+        # "v2.6.0/utils/datavzrd"
+        "v3.3.5-1-gd73914d/utils/datavzrd"
 
 
 rule diffexp_datavzrd:
@@ -92,7 +93,8 @@ rule diffexp_datavzrd:
     log:
         "logs/datavzrd-report/diffexp.{model}/diffexp.{model}.log",
     wrapper:
-        "v2.6.0/utils/datavzrd"
+        # "v2.6.0/utils/datavzrd"
+        "v3.3.5-1-gd73914d/utils/datavzrd"
 
 
 rule go_enrichment_datavzrd:
@@ -119,4 +121,5 @@ rule go_enrichment_datavzrd:
     log:
         "logs/datavzrd-report/go_enrichment-{model}/go_enrichment-{model}_{gene_fdr}.go_term_fdr_{go_term_fdr}.log",
     wrapper:
-        "v2.6.0/utils/datavzrd"
+        # "v2.6.0/utils/datavzrd"
+        "v3.3.5-1-gd73914d/utils/datavzrd"
diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk
@@ -30,7 +30,7 @@ rule sleuth_init:
     input:
         kallisto=kallisto_output,
         samples="results/sleuth/{model}.samples.tsv",
-        transcript_info="resources/transcript-info.rds",
+        transcript_info="resources/transcripts_annotation.results.rds",
     output:
         sleuth_object="results/sleuth/{model,[^.]+}.rds",
         designmatrix="results/sleuth/{model}.designmatrix.rds",

diff --git a/workflow/rules/qc_3prime.smk b/workflow/rules/qc_3prime.smk
@@ -1,8 +1,20 @@
+rule get_aligned_pos:
+    input:
+        bam_file="results/kallisto_cdna/{sample}-{unit}",
+    output:
+        aligned_files=temp("results/QC/{sample}-{unit}.aligned.txt"),
+    log:
+        "logs/QC/{sample}-{unit}.aligned.log",
+    conda:
+        "../envs/samtools.yaml"
+    shell:
+        "samtools view {input.bam_file}/pseudoalignments.bam | cut -f1,3,4,10,11  > {output} 2> {log}"
+
+
 rule get_selected_transcripts_aligned_read_bins:
     input:
         aligned_file="results/QC/{sample}-{unit}.aligned.txt",
-        samtools_sort="results/kallisto-bam-sorted/{sample}-{unit}-pseudoalignments.sorted.bam",
-        samtools_index="results/kallisto-bam-sorted/{sample}-{unit}-pseudoalignments.sorted.bam.bai",
+        transcripts_annotation="resources/transcripts_annotation.main_transcript_strand_length.tsv",
         read_length="results/stats/max-read-length.json",
     output:
         fwrd_allsamp_hist_fil=temp(
@@ -15,7 +27,7 @@ rule get_selected_transcripts_aligned_read_bins:
         each_transcript="{ind_transcripts}",
         samples="{sample}-{unit}",
     log:
-        "results/logs/QC/{sample}-{unit}.{ind_transcripts}.aligned-read-bins.log",
+        "logs/QC/{sample}-{unit}.{ind_transcripts}.aligned-read-bins.log",
     conda:
         "../envs/QC.yaml"
     script:
@@ -67,7 +79,7 @@ if is_3prime_experiment and config["experiment"]["3-prime-rna-seq"]["plot-qc"] !
         params:
             each_transcript="{ind_transcripts}",
         log:
-            "results/logs/QC/3prime-QC-plot.{ind_transcripts}.log",
+            "logs/QC/3prime-QC-plot.{ind_transcripts}.log",
         conda:
             "../envs/QC.yaml"
         script:
@@ -108,7 +120,7 @@ else:
         params:
             each_transcript="{ind_transcripts}",
         log:
-            "results/logs/QC/3prime-QC-plot.{ind_transcripts}.log",
+            "logs/QC/3prime-QC-plot.{ind_transcripts}.log",
         conda:
             "../envs/QC.yaml"
         script:
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		UROSEVIC_RESPONSE_TO_IMIQUIMOD https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/UROSEVIC_RESPONSE_TO_IMIQUIMOD BRCA1 CCL8 CXCL11 IDO1 IFI35 IFI6 IFITM1 IL6 IRF7 ISG15 ISG20 MICB MX1 OAS2 OASL PLAAT4 SECTM1 STAT1 TRAFD1
		KEGG_PROTEASOME https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/KEGG_PROTEASOME IFNG POMP PSMA1 PSMA2 PSMA3 PSMA4 PSMA5 PSMA6 PSMA6P4 PSMA7 PSMA8 PSMB1 PSMB10 PSMB11 PSMB2 PSMB3 PSMB4 PSMB5 PSMB6 PSMB7 PSMB8 PSMB9 PSMC1 PSMC1P4 PSMC2 PSMC3 PSMC4 PSMC5 PSMC6 PSMD1 PSMD11 PSMD12 PSMD13 PSMD14 PSMD2 PSMD3 PSMD4 PSMD6 PSMD7 PSMD8 PSME1 PSME2 PSME3 PSME4 PSMF1 SEM1