nf-core · ziadbkh · Dec 4, 2024 · Feb 5, 2025 · Feb 5, 2025 · Feb 19, 2025
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -9,6 +9,7 @@ body:
 
         - [nf-core website: troubleshooting](https://nf-co.re/usage/troubleshooting)
         - [nf-core/proteinfold pipeline documentation](https://nf-co.re/proteinfold/usage)
+
   - type: textarea
     id: description
     attributes:

diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml
@@ -1,17 +1,15 @@
 name: nf-core branch protection
-# This workflow is triggered on PRs to `main`/`master` branch on the repository
-# It fails when someone tries to make a PR against the nf-core `main`/`master` branch instead of `dev`
+# This workflow is triggered on PRs to master branch on the repository
+# It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev`
 on:
   pull_request_target:
-    branches:
-      - main
-      - master
+    branches: [master]
 
 jobs:
   test:
     runs-on: ubuntu-latest
     steps:
-      # PRs to the nf-core repo main/master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches
+      # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches
       - name: Check PRs
         if: github.repository == 'nf-core/proteinfold'
         run: |
@@ -24,7 +22,7 @@ jobs:
         uses: mshick/add-pr-comment@b8f338c590a895d50bcbfa6c5859251edc8952fc # v2
         with:
           message: |
-            ## This PR is against the `${{github.event.pull_request.base.ref}}` branch :x:
+            ## This PR is against the `master` branch :x:
 
             * Do not close this PR
             * Click _Edit_ and change the `base` to `dev`
@@ -34,9 +32,9 @@ jobs:
 
             Hi @${{ github.event.pull_request.user.login }},
 
-            It looks like this pull-request is has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) ${{github.event.pull_request.base.ref}} branch.
-            The ${{github.event.pull_request.base.ref}} branch on nf-core repositories should always contain code from the latest release.
-            Because of this, PRs to ${{github.event.pull_request.base.ref}} are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch.
+            It looks like this pull-request has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `master` branch.
+            The `master` branch on nf-core repositories should always contain code from the latest release.
+            Because of this, PRs to `master` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch.
 
             You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page.
             Note that even after this, the test will continue to show as failing until you push a new commit.

diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Download lint results
-        uses: dawidd6/action-download-artifact@20319c5641d495c8a52e688b7dc5fada6c3a9fbc # v8
+        uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11 # v6
         with:
           workflow: linting.yml
           workflow_conclusion: completed

diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) The nf-core/proteinfold team
+Copyright (c) Athanasios Baltzis, Jose Espinosa-Carrasco, Harshil Patel
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/README.md b/README.md
@@ -45,6 +45,8 @@ On release, automated continuous integration tests run the pipeline on a full-si
 
    vii. [HelixFold3](https://github.com/PaddlePaddle/PaddleHelix/tree/dev/apps/protein_folding/helixfold3) - Regular HF3
 
+   viii. [Boltz](https://github.com/jwohlwend/boltz/) - Regular Boltz1
+
 ## Usage
 
 > [!NOTE]
@@ -166,6 +168,20 @@ The pipeline takes care of downloading the databases and parameters required by
       -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
   ```
 
+- The boltz mode can be run using the command below:
+
+  ```console
+  nextflow run nf-core/proteinfold \
+      --input samplesheet.csv \
+      --outdir <OUTDIR> \
+      --mode boltz \
+      --boltz_ccd_path <null (default) | PATH> \
+      --boltz_model_path <null (default) | PATH> \
+      --use_gpu <true/false> \
+      -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
+  ```
+
+
 > [!WARNING]
 > Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).
 

diff --git a/assets/nf-core-proteinfold_logo_light.png b/assets/nf-core-proteinfold_logo_light.png
diff --git a/bin/generate_report.py b/bin/generate_report.py
@@ -316,7 +316,8 @@ def pdb_to_lddt(struct_files, generate_tsv):
     "alphafold2": "AlphaFold2",
     "colabfold": "ColabFold",
     "rosettafold_all_atom": "Rosettafold_All_Atom",
-    "helixfold3": "HelixFold3"
+    "helixfold3": "HelixFold3",
+    "boltz": "Boltz1"
 }
 
 parser = argparse.ArgumentParser()

diff --git a/bin/msa_manager.py b/bin/msa_manager.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+import os
+import string
+import argparse
+
+
+MAX_MSA_SEQS = 16384
+MAX_PAIRED_SEQS = 8192
+ID_CHARS = list(string.ascii_uppercase) + list(string.ascii_lowercase) + [str(x) for x in range(10)]
+
+
+def get_sub_sequences(seq_lengths, whole_seq):
+    """Cut one concatenated alignment row into one piece per chain.
+
+    Upper-case letters and "-" count towards a chain's length; every other
+    character (e.g. lower-case letters) is kept in the output but not counted.
+
+    Args:
+        seq_lengths: Number of counted columns expected for each chain, in order.
+        whole_seq: The concatenated row covering all chains.
+
+    Returns:
+        A list with one sub-sequence string per entry of ``seq_lengths``.
+
+    Raises:
+        SystemExit: With code 1 (after printing a message) when the row does
+            not yield exactly ``len(seq_lengths)`` pieces, or when counted
+            columns remain after the last chain is complete.
+    """
+    out_seqs = []
+    curr_seq = ""
+    curr_seq_itr = 0
+    total_letters = 0
+    for letter in whole_seq:
+        is_counted = letter.isupper() or letter == "-"
+
+        if curr_seq_itr >= len(seq_lengths):
+            # Every chain is already complete. Uncounted trailing characters
+            # still belong to the row, so keep them on the last chain instead
+            # of indexing past the end of seq_lengths; extra counted columns
+            # mean the row is longer than the declared lengths.
+            if is_counted or not out_seqs:
+                print("Something wrong in the input file, could not generate the required number of sequences")
+                raise SystemExit(1)
+            out_seqs[-1] += letter
+            continue
+
+        curr_seq += letter
+        if is_counted:
+            total_letters += 1
+        if total_letters == seq_lengths[curr_seq_itr]:
+            out_seqs.append(curr_seq)
+            curr_seq = ""
+            curr_seq_itr += 1
+            total_letters = 0
+
+    if len(out_seqs) != len(seq_lengths):
+        print("Something wrong in the input file, could not generate the required number of sequences")
+        raise SystemExit(1)
+
+    return out_seqs
+
+
+def _store_homolog(sequence_groups, homologs_lengths, section_index, homolog):
+    """Split one concatenated row per chain and append it to the matching bucket."""
+    sub_sequences = get_sub_sequences(homologs_lengths, homolog)
+
+    if section_index == 0:
+        # Rows seen before any per-chain section header go to every chain's
+        # first list, capped at MAX_PAIRED_SEQS per chain.
+        for seq_index, sub_sequence in enumerate(sub_sequences):
+            if len(sequence_groups[seq_index][0]) < MAX_PAIRED_SEQS:
+                sequence_groups[seq_index][0].append(sub_sequence)
+        return
+
+    # Rows inside section k (1-based) only contribute to chain k - 1, and the
+    # chain's two lists together are capped at MAX_MSA_SEQS.
+    seq_index = section_index - 1
+    first_list, second_list = sequence_groups[seq_index]
+    if len(second_list) + len(first_list) < MAX_MSA_SEQS:
+        second_list.append(sub_sequences[seq_index])
+
+
+def parse_msa(msa_path, output_dir):
+    """Split a multi-section A3M file into one CSV file per chain.
+
+    The first line must start with "#"; its first whitespace-separated token
+    is read as comma-separated integer chain lengths. The second line is read
+    as a ">" header whose whitespace-separated tokens are the section headers
+    expected later in the file, one per chain.
+
+    Rows read before the first matching section header are stored for every
+    chain and written with keys 1, 2, ...; rows read after the k-th matching
+    header are stored for chain k only and written with key -1. The first row
+    following each matching section header is skipped.
+
+    Args:
+        msa_path: Path of the input A3M file.
+        output_dir: Directory for the CSV files (created if missing). Files are
+            named after ``ID_CHARS`` ("A.csv", "B.csv", ...).
+
+    Returns:
+        None. If the first line does not start with "#", an error is printed
+        and nothing is written.
+
+    Raises:
+        SystemExit: With code 1 when the file declares more chains than there
+            are entries in ``ID_CHARS``, or when a row cannot be split (see
+            ``get_sub_sequences``).
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    homolog = ""
+    section_index = 0
+
+    with open(msa_path, "r") as file:
+        first_line = file.readline()
+        if not first_line.startswith("#"):
+            print("Error: File might not have multiple A3M sections.")
+            return
+
+        homologs_lengths = [int(x.strip()) for x in first_line.replace("#", "").split()[0].split(",")]
+        if len(homologs_lengths) > len(ID_CHARS):
+            # Fail before writing anything rather than with an IndexError
+            # half-way through the output loop below.
+            print(f"Error: {len(homologs_lengths)} chains found but only {len(ID_CHARS)} chain IDs are available.")
+            raise SystemExit(1)
+        sequence_groups = [[[], []] for _ in range(len(homologs_lengths))]
+
+        header_line = file.readline().strip()[1:]
+        expected_section_headers = [x.strip() for x in header_line.split()]
+        # True right after a section header matched: the next row is skipped.
+        first_seq = False
+        for line in file:
+            line = line.strip()
+            if not line.startswith(">"):
+                homolog += line
+                continue
+
+            # A new header closes the row collected so far.
+            if homolog:
+                if first_seq and section_index > 0:
+                    first_seq = False
+                else:
+                    _store_homolog(sequence_groups, homologs_lengths, section_index, homolog)
+            homolog = ""
+
+            current_header = line[1:].strip()
+            # Bound by both lists so a header row with fewer entries than
+            # lengths cannot index past expected_section_headers.
+            if (
+                section_index < len(homologs_lengths)
+                and section_index < len(expected_section_headers)
+                and current_header == expected_section_headers[section_index]
+            ):
+                section_index += 1
+                first_seq = True
+
+        # Flush the last row of the file, which has no header after it.
+        if homolog and not (first_seq and section_index > 0):
+            _store_homolog(sequence_groups, homologs_lengths, section_index, homolog)
+
+    for seq_index in range(len(homologs_lengths)):
+        filename = os.path.join(output_dir, f"{ID_CHARS[seq_index]}.csv")
+        with open(filename, "w") as out_file:
+            out_file.write("key,sequence\n")
+            paired_sequences = sequence_groups[seq_index][0]
+            for i, seq in enumerate(paired_sequences, start=1):
+                out_file.write(f"{i},{seq}\n")
+
+            unpaired_sequences = sequence_groups[seq_index][1]
+            for seq in unpaired_sequences:
+                out_file.write(f"-1,{seq}\n")
+
+
+def main():
+    """Command-line entry point: read the A3M path and output directory, then run parse_msa."""
+    cli = argparse.ArgumentParser(description="Split multi-A3M file into CSV sequences per section.")
+    cli.add_argument("msa_path", help="Path to input .a3m file")
+    cli.add_argument("-o", "--output_dir", default="output_msa", help="Directory to write output CSVs")
+
+    options = cli.parse_args()
+    parse_msa(msa_path=options.msa_path, output_dir=options.output_dir)
+
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/conf/dbs.config b/conf/dbs.config
@@ -98,6 +98,11 @@ params {
     // Esmfold paths
     esmfold_params_path                         = "${params.esmfold_db}/*"
 
+    // boltz paths
+    boltz_ccd_link           = 'https://huggingface.co/boltz-community/boltz-1/resolve/main/ccd.pkl'
+    boltz_model_link         = 'https://huggingface.co/boltz-community/boltz-1/resolve/main/boltz1.ckpt'
+
+
     // Foldseek databases paths
     foldseek_db = null
     foldseek_db_path = null

diff --git a/conf/modules_boltz.config b/conf/modules_boltz.config
@@ -0,0 +1,45 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Config file for defining DSL2 per module options and publishing paths
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Available keys to override module options:
+        ext.args   = Additional arguments appended to command in module.
+        ext.args2  = Second set of arguments appended to command in module (multi-tool modules).
+        ext.args3  = Third set of arguments appended to command in module (multi-tool modules).
+        ext.prefix = File name prefix for output files.
+----------------------------------------------------------------------------------------
+*/
+
+// Per-process options for the boltz mode: tool arguments, publishing rules and fixed resources.
+process {
+    withName: 'RUN_BOLTZ' {
+        // Extra command-line flags exposed to the module through ext.args.
+        ext.args = '--write_full_pae --output_format pdb --use_msa_server'
+        publishDir = [
+            // Copy every output matching '*.*' except versions.yml.
+            [
+                path: { "${params.outdir}/boltz/default" },
+                mode: 'copy',
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+                pattern: '*.pdb' == null ? null : '*.*'
+            ],
+            // Copy '*.pdb' outputs a second time, renamed to "<meta.id>.pdb".
+            // NOTE(review): every matching file gets the same target name, so this
+            // presumably expects a single top-level .pdb per task — confirm.
+            [
+                path: { "${params.outdir}/boltz/default/top_ranked_structures" },
+                mode: 'copy',
+                saveAs: { "${meta.id}.pdb" },
+                pattern: '*.pdb'
+            ]
+        ]
+    }
+
+    withName: 'NFCORE_PROTEINFOLD:BOLTZ:MULTIQC' {
+        // Publish MultiQC outputs with a "boltz_" file-name prefix; versions.yml is not published.
+        publishDir = [
+            path: { "${params.outdir}/multiqc" },
+            mode: 'copy',
+            saveAs: { filename -> filename.equals('versions.yml') ? null : "boltz_$filename" }
+        ]
+    }
+
+    // Fixed, small resource requests for the three helper processes named in the selector.
+    withName: 'BOLTZ_FASTA|MULTIFASTA_TO_CSV|SPLIT_MSA' {
+        cpus   = 1
+        memory = 2.GB
+        time   = 1.h
+    }
+}
diff --git a/conf/test_full_boltz.config b/conf/test_full_boltz.config
@@ -0,0 +1,23 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running full-size tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a full size pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/proteinfold -profile test_full_boltz,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+// Parameters for the full-size boltz test profile.
+params {
+    config_profile_name        = 'Full test profile for boltz'
+    // This is the full-size profile, so it must not describe itself as the minimal test.
+    config_profile_description = 'Full test dataset to check pipeline function'
+
+    // Input data for full test of boltz
+    mode                    = 'boltz'
+    // NOTE(review): colabfold_* values are set although mode is 'boltz' — confirm the boltz workflow reads them.
+    colabfold_server        = 'local'
+    colabfold_model_preset  = 'alphafold2_ptm'
+    input                   = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.2/samplesheet.csv'
+    colabfold_db            = 's3://proteinfold-dataset/test-data/db/colabfold_mini'
+}
diff --git a/conf/test_helixfold3.config b/conf/test_helixfold3.config
@@ -8,7 +8,6 @@
 ----------------------------------------------------------------------------------------
 */
 
-stubRun = true
 
 // Limit resources so that this can run on GitHub Actions
 process {

diff --git a/dockerfiles/Dockerfile_nfcore-proteinfold_boltz b/dockerfiles/Dockerfile_nfcore-proteinfold_boltz
@@ -0,0 +1,18 @@
+# Image that provides the `boltz` command-line tool for the nf-core/proteinfold boltz mode.
+FROM python:3.9-slim
+
+LABEL authors="Ziad Al-Bkhetan <ziad.albkhetan@gmail.com>" \
+    title="nfcore/proteinfold_boltz" \
+    Version="0.3.0" \
+    description="Docker image containing all software requirements to run boltz using the nf-core/proteinfold pipeline"
+
+# System packages; the apt lists are removed in the same layer to keep the image small.
+# NOTE(review): procps presumably supplies `ps` for Nextflow task metrics — confirm.
+RUN  apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    procps \
+    && rm -rf /var/lib/apt/lists/*
+
+# --no-cache-dir keeps pip's download cache out of the image layer.
+# NOTE(review): boltz is installed unpinned, so rebuilds may pick up a different
+# release than the Version label above suggests — consider pinning.
+RUN pip install --no-cache-dir boltz
+
+CMD ["boltz"]
+
+
diff --git a/docs/images/nf-core-proteinfold_logo_light.png b/docs/images/nf-core-proteinfold_logo_light.png
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,7 +8,6 @@ @@
     ----------------------------------------------------------------------------------------
     */
-    stubRun = true
     // Limit resources so that this can run on GitHub Actions
     process {
@@ Expand Down @@