Merge pull request #1067 from maxplanck-ie/pairtools_merge

Pairtools merge
maxplanck-ie · Oct 11, 2024 · 55e560b · 55e560b
2 parents ae1c940 + cf2d4fe
commit 55e560b
Show file tree

Hide file tree

Showing 16 changed files with 333 additions and 133 deletions.
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.11', '3.12']
+        python-version: ['3.11', '3.12' ,'3.13']
         optdeps: [".", ".[actions]", ".[docs]"]
     steps:
     - uses: actions/checkout@v4

diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml
@@ -46,6 +46,6 @@ jobs:
           pip install .
       - name: createEnvsOSX
         run: |
-          conda config --add subdirs osx-64
+          conda config --set subdir osx-64
           snakePipes createEnvs --only ${{matrix.envs}}
      
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "3.0.0" %}
+{% set version = "3.1.0" %}
 
 package:
   name: snakepipes
@@ -14,9 +14,10 @@ build:
 
 requirements:
   host:
-    - python >=3
+    - python >=3, < 3.13
     - pip
     - seaborn
+    - setuptools
   run:
     - python >=3.11
     - snakemake >=8

diff --git a/docs/conf.py b/docs/conf.py
@@ -15,7 +15,6 @@
 import sys
 import os
 from importlib.metadata import version as importlibversion
-import sphinx_rtd_theme
 
 # to allow readthedocs to compile without installing some dependencies
 import mock
@@ -150,7 +149,7 @@
 
 # import them both locally and on rtd
 html_theme = 'sphinx_rtd_theme'  # 'alabaster' 'sphinx_rtd_theme'
-html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+# html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the

diff --git a/docs/content/News.rst b/docs/content/News.rst
@@ -1,6 +1,20 @@
 snakePipes News
 ===============
 
+snakePipes 3.1.0
+________________
+
+ * installation test for python 3.13
+ * bulk mode in makePairs wf
+
+snakePipes 3.0.0
+________________
+
+* clusteryaml omitted for profiles
+* All workflows have '-' removed from their name
+* toml file installation
+* makePairs mode introduced
+
 snakePipes 2.9.0
 ________________
 

diff --git a/docs/content/workflows/makePairs.rst b/docs/content/workflows/makePairs.rst
@@ -6,8 +6,8 @@ makePairs
 What it does
 ------------
 
-The snakePipes makePairs workflow allows users to process their HiC data from raw fastq files to HiC matrices in
-an allele-specific manner. The workflow utilized mapping by bwa, followed by analysis
+The snakePipes makePairs workflow allows users to process their HiC/uC data from raw fastq files to HiC matrices (in
+an allele-specific manner). The workflow utilizes mapping by bwa, followed by analysis
 using `pairtools <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9949071/>`__ . The workflow follows the `example workflow described in the documentation of pairtools <https://pairtools.readthedocs.io/en/latest/examples/pairtools_phase_walkthrough.html>`__ , 
 which explains each step in detail and would be useful for new users to have a look at. 
 Currently the output matrices are produced in the `.pairs <https://pairtools.readthedocs.io/en/latest/formats.html>`__ format.

diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 name = "snakePipes"
 description = 'Snakemake workflows and wrappers for NGS data processing from the MPI-IE'
 readme = "README.md"
-version = "3.0.0"
+version = "3.1.0"
 keywords = [
     "DNAmapping",
     "ChIPSeq",
@@ -24,8 +24,7 @@ authors = [
 ]
 
 classifiers = [
-    "Intended Audience :: Bioinformaticians",
-    "Intended Audience :: Biologists",
+    "Intended Audience :: Science/Research",
     "License :: OSI Approved :: MIT License",
     "Programming Language :: Python :: 3",   
 ]

diff --git a/snakePipes/parserCommon.py b/snakePipes/parserCommon.py
@@ -139,7 +139,7 @@ def snpArguments(defaults):
     snpargs = parser.add_argument_group('Allele-specific mapping arguments')
     snpargs.add_argument("--VCFfile",
                          default='',
-                         help="VCF file to create N-masked genomes (default: 'None')")
+                         help="VCF file to create N-masked genomes (default: 'None'). Note that for the makePairs workflow this file is assumed to be gzipped and indexed (with tabix).")
 
     snpargs.add_argument("--strains",
                          default='',

diff --git a/snakePipes/shared/defaults.yaml b/snakePipes/shared/defaults.yaml
@@ -19,4 +19,4 @@ max_thread: 25
 #print tools versions
 toolsVersion: True
 oldConfig:
-configMode: manual
+configMode: manual
diff --git a/snakePipes/shared/rules/envs/makePairs.yaml b/snakePipes/shared/rules/envs/makePairs.yaml
@@ -2,9 +2,9 @@ name: pairtools_phased
 channels:
   - conda-forge
   - bioconda
-  - defaults
 dependencies:
   - bwa
+  - cooler
   - samtools
   - tabix
   - bcftools

diff --git a/snakePipes/shared/rules/pairtools.snakefile b/snakePipes/shared/rules/pairtools.snakefile
@@ -1,199 +1,167 @@
-# based on https://github.com/caballero/snakemake-pairtools-phased/tree/df410ff
+rule generate_chromsizes:
+  input:
+    genome_index
+  output:
+    'genome/genome.chromsizes'
+  threads: 1
+  shell:'''
+  cut -f1,2 {input} > {output}
+  '''
 
-
-# Define function that returns pair files (phased or unphased), based on the reference.
-def ret_pair(wildcards):
-    if "diploid_genome" in wildcards.ref:
-        # Phased setting
-        return f"pairs/{wildcards.sample}.{wildcards.ref}_phased.pairs.gz"
-    else:
-        return f"pairs/{wildcards.sample}.{wildcards.ref}.pairs.gz"
-
-
-# different to bwa.snakefile
-# here we skip the expensive sorting with samtools after bwa mem
-# consider making this optional in bwa.snakefile
 rule bwa_mapping:
     input:
         fq1="FASTQ_fastp/{sample}_R1.fastq.gz",
-        fq2="FASTQ_fastp/{sample}_R2.fastq.gz",
-        ix="genome/{ref}.fa.gz.bwt",
+        fq2="FASTQ_fastp/{sample}_R2.fastq.gz"
     output:
-        bam="bam/{sample}.{ref}.bam",
+        bam="bam/{sample}.bam",
     threads: 30
     params:
-        bwathreads=config["alignerThreads"],
         bwaparams=config["alignerOptions"],
-        fna=lambda wildcards, input: Path(input.ix).with_suffix(""),
+        bwa_index = bwa_index
     resources:
         mem_mb=3000,
     benchmark:
-        "bam/.benchmark/bwa_mapping.{sample}.{ref}.benchmark"
+        "bam/.benchmark/bwa_mapping.{sample}.benchmark"
     conda:
         CONDA_MAKEPAIRS_ENV
     shell:
         """
         bwa mem \
             {params.bwaparams} \
-            -t {params.bwathreads} \
-            {params.fna} \
+            -t 22 \
+            {params.bwa_index} \
             {input.fq1} \
             {input.fq2} \
         | samtools view -@ 8 -b \
         > {output.bam}
         """
 
-
 rule pairtools_parse:
     input:
-        bam="bam/{sample}.{ref}.bam",
-        chr_sizes="genome/{ref}.chromsizes",
+        bam="bam/{sample}.bam",
+        chr_sizes='genome/genome.chromsizes'
     output:
-        pairs="pairs/{sample}.{ref}.pairs.gz",
+        pairs=temp("pairs/{sample}.unsorted.pairs.gz"),
     params:
-        minmapq=40,
-        cols=lambda wildcards: (
-            "--add-columns XB,AS,XS" if "diploid_genome" in wildcards.ref else ""
-        ),
+        minmapq=40
     threads: 12
     benchmark:
-        "pairs/.benchmark/pairtools_parse.{sample}.{ref}.benchmark"
+        "pairs/.benchmark/pairtools_parse.{sample}.benchmark"
     conda:
         CONDA_MAKEPAIRS_ENV
     shell:
         """
         pairtools parse \
             --min-mapq {params.minmapq} \
-            {params.cols} \
             --drop-sam \
             --walks-policy 5unique \
             -c {input.chr_sizes} \
             {input.bam} \
             -o {output.pairs}
         """
 
-
-rule pairtools_phase:
-    input:
-        pairs="pairs/{sample}.diploid_genome.pairs.gz",
-    output:
-        pairs="pairs/{sample}.diploid_genome_phased.pairs.gz",
-    params:
-        hap1=strains[0],
-        hap2=strains[1],
-    threads: 12
-    benchmark:
-        "pairs/.benchmark/pairtools_phase.{sample}.benchmark"
-    conda:
-        CONDA_MAKEPAIRS_ENV
-    shell:
-        """
-        pairtools phase \
-            --phase-suffixes _{params.hap1} _{params.hap2} \
-            --tag-mode XB \
-            --clean-output \
-            {input.pairs} -o {output.pairs}
-        """
-
-
 rule pairtools_sort:
     input:
-        ret_pair,
+        pairs = "pairs/{sample}.unsorted.pairs.gz",
     output:
-        pairs="pairs/{sample}.{ref}.pairs.sorted.gz",
+        pairs = "pairs/{sample}.pairs.gz",
     threads: 20
     benchmark:
-        "pairs/.benchmark/pairtools_sort.{sample}.{ref}.benchmark"
+        "pairs/.benchmark/pairtools_sort.{sample}.benchmark"
     conda:
         CONDA_MAKEPAIRS_ENV
     shell:
         """
         pairtools sort \
-            {input} \
+            {input.pairs} \
             -o {output.pairs} \
             --memory 20G
         """
 
-
 rule pairtools_dedup:
     input:
-        pairs="pairs/{sample}.{ref}.pairs.sorted.gz",
+        pairs="pairs/{sample}.pairs.gz",
     output:
-        pairs="pairs/{sample}.{ref}.pairs.dedup.gz",
-        stats="pairs/{sample}.{ref}.pairs.dedup.stats",
-    params:
-        extra_cols=lambda wildcards: (
-            "--extra-col-pair phase1 phase2" if "diploid" in wildcards.ref else ""
-        ),
+        pairs="pairs/{sample}.pairs.dedup.gz",
+        stats="pairs/{sample}.pairs.dedup.stats"
     threads: 12
     benchmark:
-        "pairs/.benchmark/pairtools_dedup.{sample}.{ref}.benchmark"
+        "pairs/.benchmark/pairtools_dedup.{sample}.benchmark"
     conda:
         CONDA_MAKEPAIRS_ENV
     shell:
         """
         pairtools dedup \
             --mark-dups \
-            {params.extra_cols} \
             --output-dups - \
             --output-unmapped - \
             --output-stats {output.stats} \
             -o {output.pairs} \
             {input.pairs}
         """
 
+rule pairix:
+    input:
+        pairs = 'pairs/{sample}.pairs.dedup.gz'
+    output:
+        ix = 'pairs/{sample}.pairs.dedup.gz.px2'
+    threads: 2
+    conda:
+        CONDA_MAKEPAIRS_ENV
+    shell:
+        """
+        pairix -f -p pairs {input.pairs}
+        """
 
-rule pairtools_filter_phased:
+rule cooler:
     input:
-        pairs="pairs/{sample}.diploid_genome.pairs.dedup.gz",
+        pairs = 'pairs/{sample}.pairs.dedup.gz',
+        ix = 'pairs/{sample}.pairs.dedup.gz.px2',
+        chromsizes = 'genome/genome.chromsizes'
     output:
-        stats="phase_stats/{sample}.diploid_genome_{phasetype}.pairs.stats",
-        pairs="phase_stats/{sample}.diploid_genome_{phasetype}.pairs.gz",
-    params:
-        filterparam=lambda wildcards: PHASEDIC[wildcards.phasetype],
-    resources:
-        mem_mb=1000,
-    benchmark:
-        "phase_stats/.benchmark/pairtools_filter_phased.{sample}.diploid_genome_{phasetype}.benchmark"
-    threads: 12
+        cool = 'cooler/{sample}.5000.cool'
+    threads: 20
     conda:
         CONDA_MAKEPAIRS_ENV
     shell:
         """
-        pairtools select \
-            '{params.filterparam}' \
-            {input.pairs} \
-            -o {output.pairs}
-        pairtools stats {output.pairs} -o {output.stats}
+        cooler cload pairix -p {threads} {input.chromsizes}:5000 {input.pairs} {output.cool}
+        cooler balance --nproc {threads} {output.cool}
         """
 
+rule mcool:
+    input:
+        cool = 'cooler/{sample}.5000.cool'
+    output:
+        mcool = 'cooler/{sample}.5000.mcool'
+    threads: 20
+    conda:
+        CONDA_MAKEPAIRS_ENV
+    shell:
+        """
+        cooler zoomify --resolutions 5000,10000,20000,40000,80000,120000 --balance --nproc {threads} {input.cool}
+        """
 
 rule multiqc:
     input:
-        stats=expand(
-            "pairs/{sample}.{ref}.pairs.dedup.stats", sample=samples, ref=REFERENCES
-        ),
-        phasedstats=expand(
-            "phase_stats/{sample}.diploid_genome_{phasetype}.pairs.stats",
-            sample=samples,
-            phasetype=PHASEDIC.keys(),
-        ),
+        cools=expand(
+            'cooler/{sample}.5000.mcool', sample=samples
+        )
     output:
-        html="multiqc/multiqc_report.html",
+        html="multiQC/multiqc_report.html",
     params:
-        odir="multiqc",
+        odir="multiQC",
     benchmark:
-        "multiqc/.benchmark/multiqc.benchmark"
+        "multiQC/.benchmark/multiqc.benchmark"
     threads: 1
     conda:
         CONDA_MAKEPAIRS_ENV
     shell:
         """
-        echo input: {input.phasedstats}
         multiqc \
             --module pairtools \
             --module fastqc \
             --module fastp \
             -o {params.odir} \
             .
-        """
+        """