openpipelines-bio · dorien-er · Sep 6, 2024 · Mar 6, 2024 · Mar 6, 2024 · Mar 6, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,6 +26,8 @@
 
 * `scgpt/binning` component: Added a scGPT pre-processing binning component (PR #765).
 
+* `workflows/integration/scgpt_leiden` workflow: integrate with scGPT followed by Leiden clustering (PR #794).
+
 ## MINOR CHANGES
 
 * Added `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: download tiny bcl file with an ATAC experiment, download a motifs file, demultiplex bcl files to reads in fastq format (PR #726).

diff --git a/resources_test_scripts/scgpt.sh b/resources_test_scripts/scgpt.sh
diff --git a/src/workflows/integration/scgpt_leiden/config.vsh.yaml b/src/workflows/integration/scgpt_leiden/config.vsh.yaml
@@ -0,0 +1,170 @@
+# Viash configuration of the `scgpt_leiden` workflow. The arguments declared
+# here become the `state` entries that main.nf forwards to the components
+# listed under `dependencies`.
+functionality:
+  name: "scgpt_leiden"
+  namespace: "workflows/integration"
+  description: "Run scGPT integration (cell embedding generation) followed by neighbour calculations, leiden clustering and run umap on the result."
+  authors:
+    - __merge__: /src/authors/dorien_roosen.yaml
+      roles: [ author, maintainer ]
+  argument_groups:
+    - name: "Inputs"
+      arguments:
+        - name: "--id"
+          required: true
+          type: string
+          description: ID of the sample.
+          example: foo
+        - name: "--input"
+          type: file
+          required: true
+          description: Path to the input file.
+          example: input.h5mu
+        # Forwarded unchanged as `modality` to every component in the workflow.
+        - name: "--modality"
+          type: string
+          default: "rna"
+          required: false
+          description: Which modality from the input MuData file to process.
+        - name: "--model"
+          type: file
+          required: true
+          example: resources_test/scgpt/best_model.pt
+          description: |
+            Path to scGPT model file.
+        - name: "--model_vocab"
+          type: file
+          direction: input
+          required: true
+          example: resources_test/scgpt/vocab.json
+          description: |
+            Path to scGPT model vocabulary file.
+        - name: "--model_config"
+          type: file
+          direction: input
+          required: true
+          example: args.json
+          description: |
+            Path to scGPT model config file.
+        - name: "--input_layer"
+          type: string
+          required: false
+          description: |
+            Mudata layer (key from layers) to use as input data for hvg subsetting and binning; if not specified, X is used.
+        - name: "--var_gene_names"
+          type: string
+          required: false
+          description: |
+            The name of the adata var column containing gene names; when not provided, the var index will be used.
+        - name: "--obs_batch_label"
+          type: string
+          description: |
+            The name of the adata obs column containing the batch labels.
+
+    - name: "Outputs"
+      arguments:
+        - name: "--output"
+          type: file
+          required: true
+          direction: output
+          description: Output file path
+          example: output.h5mu
+        - name: "--output_compression"
+          type: string
+          example: "gzip"
+          required: false
+          choices: ["gzip", "lzf"]
+          description: |
+            The compression algorithm to use for the output h5mu file.
+
+    # Shared by the cross_check_genes, pad_tokenize and embedding steps.
+    - name: "Padding arguments"
+      arguments:
+        - name: "--pad_token"
+          type: string
+          default: "<pad>"
+          required: false
+          description: |
+            Token used for padding.
+        - name: "--pad_value"
+          type: integer
+          default: -2
+          required: false
+          description: |
+            The value of the padding token.
+
+    - name: "HVG subset arguments"
+      arguments:
+        - name: "--n_hvg"
+          type: integer
+          default: 1200
+          description: |
+            Number of highly variable genes to subset for.
+
+    - name: "Tokenization arguments"
+      arguments:
+        - name: "--max_seq_len"
+          type: integer
+          required: false
+          description: |
+            The maximum sequence length of the tokenized data.
+
+    - name: "Embedding arguments"
+      arguments:
+        - name: "--dsbn"
+          type: boolean
+          default: true
+          description: |
+            Apply domain-specific batch normalization
+        - name: "--batch_size"
+          type: integer
+          default: 64
+          description: |
+            The batch size to be used for embedding inference.
+
+    - name: "Binning arguments"
+      arguments:
+        # NOTE(review): a default of 51 is declared, so the "no value is
+        # provided" case in the description looks unreachable through this
+        # workflow - confirm whether the default or the text should change.
+        - name: "--n_input_bins"
+          type: integer
+          default: 51
+          required: false
+          min: 1
+          description: |
+            The number of bins to discretize the data into; When no value is provided, data won't be binned.
+        - name: "--seed"
+          type: integer
+          required: false
+          description: |
+            Seed for random number generation used for binning. If not set, no seed is used.
+
+    - name: "Clustering arguments"
+      arguments:
+        # Accepts several values (`multiple: true`); passed to the leiden
+        # component as `resolution`.
+        - name: "--leiden_resolution"
+          type: double
+          description: Control the coarseness of the clustering. Higher values lead to more clusters.
+          default: [1]
+          multiple: true
+
+  resources:
+    - type: nextflow_script
+      path: main.nf
+      entrypoint: run_wf
+
+  dependencies:
+    - name: scgpt/cross_check_genes
+    - name: scgpt/binning
+    - name: feature_annotation/highly_variable_features_scanpy
+    - name: filter/do_filter
+    - name: scgpt/pad_tokenize
+    - name: scgpt/embedding
+    - name: dimred/umap
+    - name: neighbors/find_neighbors
+    - name: cluster/leiden
+    - name: metadata/move_obsm_to_obs
+
+  # Both test entrypoints live in the same test.nf script.
+  test_resources:
+    - type: nextflow_script
+      path: test.nf
+      entrypoint: test_wf
+    - type: nextflow_script
+      path: test.nf
+      entrypoint: test_wf2
+    - path: /resources_test/scgpt
+
+platforms:
+  - type: nextflow
diff --git a/src/workflows/integration/scgpt_leiden/integration_test.sh b/src/workflows/integration/scgpt_leiden/integration_test.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+export NXF_VER=21.10.6
+
+viash ns build -q scgpt_leiden
+
+nextflow run . \
+  -main-script src/workflows/integration/scgpt_leiden/test.nf \
+  -profile docker,no_publish \
+  -entry test_wf \
+  -with-trace work/trace.txt \
+  -c src/workflows/utils/labels_ci.config \
+  -c src/workflows/utils/integration_tests.config
+
+nextflow run . \
+  -main-script src/workflows/integration/scgpt_leiden/test.nf \
+  -profile docker,no_publish \
+  -entry test_wf2 \
+  -resume \
+  -c src/workflows/utils/labels_ci.config \
+  -c src/workflows/utils/integration_tests.config \
diff --git a/src/workflows/integration/scgpt_leiden/main.nf b/src/workflows/integration/scgpt_leiden/main.nf
@@ -0,0 +1,166 @@
+// scGPT integration workflow.
+//
+// Takes a channel of [id, state] tuples and chains the following components,
+// each one reading the previous component's output through `state.input`:
+//   highly_variable_features_scanpy -> do_filter -> cross_check_genes
+//   -> binning -> pad_tokenize -> embedding -> find_neighbors
+//   -> leiden -> move_obsm_to_obs -> umap
+// Emits a channel of [id, [output: <file>]] tuples produced by the umap step.
+workflow run_wf {
+
+  take:
+    input_ch
+
+  main:
+    output_ch = input_ch
+    // Set aside the output for this workflow to avoid conflicts
+    | map {id, state ->
+      def new_state = state + ["workflow_output": state.output]
+      [id, new_state]
+    }
+    | highly_variable_features_scanpy.run(
+      fromState: {id, state ->
+      // Annotates the mudata object with highly variable genes.
+        [
+          "input": state.input,
+          "layer": state.input_layer,
+          "modality": state.modality,
+          "var_name_filter": "filter_with_hvg",
+          "n_top_features": state.n_hvg,
+          "flavor": "seurat_v3"
+        ]
+      },
+      toState: ["input": "output"]
+    )
+    | do_filter.run(
+      fromState: {id, state ->
+        // do_filter does not need a layer argument because it filters all layers
+        // from a modality.
+        // filters the mudata object based on the HVG
+        [
+          "input": state.input,
+          "modality": state.modality,
+          "var_filter": "filter_with_hvg"
+        ]
+      },
+      toState: ["input": "output"]
+    )
+    | cross_check_genes.run(
+      fromState: { id, state ->
+      // Check whether the genes are part of the provided vocabulary. Subsets for genes present in vocab only.
+        [
+          "input": state.input,
+          "modality": state.modality,
+          "vocab_file": state.model_vocab,
+          "var_gene_names": state.var_gene_names,
+          "output": state.output,
+          "pad_token": state.pad_token
+        ]
+      },
+      toState: ["input": "output"]
+    )
+    | binning.run(
+      // Bins the data into a fixed number of bins.
+        fromState: {id, state -> [
+            "input": state.input,
+            "modality": state.modality,
+            "input_layer": state.input_layer,
+            "n_input_bins": state.n_input_bins,
+            "binned_layer": "binned",
+            "output": state.output,
+            "seed": state.seed
+          ]
+        },
+        toState: ["input": "output"]
+    )
+    | pad_tokenize.run(
+      // Padding and tokenization of gene count values.
+      // Reads the "binned" layer written by the binning step.
+       fromState: {id, state -> [
+            "input": state.input,
+            "modality": state.modality,
+            "model_vocab": state.model_vocab,
+            "input_layer": "binned",
+            "var_gene_names": state.var_gene_names,
+            "pad_token": state.pad_token,
+            "pad_value": state.pad_value,
+            "max_seq_len": state.max_seq_len,
+            "obsm_gene_tokens": "gene_id_tokens",
+            "obsm_tokenized_values": "values_tokenized",
+            "obsm_padding_mask": "padding_mask",
+            "output": state.output
+          ]
+        },
+        toState: ["input": "output"]
+    )
+    | embedding.run(
+      // Generation of cell embeddings from the tokenized gene counts values.
+      // Uses the same obsm keys that pad_tokenize was asked to write.
+      fromState: {id, state -> [
+          "input": state.input,
+          "modality": state.modality,
+          "model": state.model,
+          "model_vocab": state.model_vocab,
+          "model_config": state.model_config,
+          "obsm_gene_tokens": "gene_id_tokens",
+          "obsm_tokenized_values": "values_tokenized",
+          "obsm_padding_mask": "padding_mask",
+          "var_gene_names": state.var_gene_names,
+          "obs_batch_label": state.obs_batch_label,
+          "pad_token": state.pad_token,
+          "pad_value": state.pad_value,
+          "dsbn": state.dsbn,
+          "batch_size": state.batch_size,
+          "obsm_embeddings": "X_scGPT",
+          "output": state.output
+        ]
+      },
+      toState: ["input": "output"]
+    )
+
+    | find_neighbors.run(
+      // Neighbour calculation on the X_scGPT embedding.
+      fromState: {id, state -> [
+          "input": state.input,
+          "uns_output": "scGPT_integration_neighbors",
+          "obsp_distances": "scGPT_integration_distances",
+          "obsp_connectivities": "scGPT_integration_connectivities",
+          "obsm_input": "X_scGPT",
+          "modality": state.modality
+        ]
+      },
+      toState: ["input": "output"]
+    )
+
+    | leiden.run(
+      // Only cluster when at least one resolution was requested. The gate
+      // must use a key that this workflow actually defines, otherwise the
+      // step is skipped for every input.
+      runIf: {id, state -> state.leiden_resolution},
+      fromState: {id, state -> [
+        "input": state.input,
+        "obsp_connectivities": "scGPT_integration_connectivities",
+        "obsm_name": "scGPT_integration_leiden",
+        "resolution": state.leiden_resolution,
+        "modality": state.modality,
+        ]
+      },
+      toState: ["input": "output"]
+    )
+
+    | move_obsm_to_obs.run(
+      // Same gate as leiden: the obsm key below only exists when clustering ran.
+      runIf: {id, state -> state.leiden_resolution},
+      fromState: {id, state -> [
+          "input": state.input,
+          "obsm_key": "scGPT_integration_leiden",
+          "modality": state.modality,
+        ]
+      },
+      toState: ["input": "output"]
+    )
+
+    | umap.run(
+      // Last step: writes to the workflow output path that was set aside in
+      // the first map, and is the only step that is published.
+      fromState: {id, state -> [
+          "input": state.input,
+          "uns_neighbors": "scGPT_integration_neighbors",
+          "obsm_output": "X_scGPT_umap",
+          "modality": state.modality,
+          "output_compression": state.output_compression,
+          "output": state.workflow_output
+        ]
+      },
+      // Replace the whole state: only the final output file is emitted.
+      toState: { id, output, state ->
+        [ output: output.output ]
+      },
+      auto: [ publish: true ]
+    )
+
+  emit:
+    output_ch
+}
diff --git a/src/workflows/integration/scgpt_leiden/nextflow.config b/src/workflows/integration/scgpt_leiden/nextflow.config
@@ -0,0 +1,10 @@
+// Nextflow configuration for the scgpt_leiden workflow.
+manifest {
+  // The leading '!' makes Nextflow abort instead of warn when the running
+  // version does not satisfy the constraint.
+  nextflowVersion = '!>=20.12.1-edge'
+}
+
+params {
+  // Absolute, normalized path four directories above $projectDir; used below
+  // as the base for locating the shared workflow utilities.
+  rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
+}
+
+// include common settings
+includeConfig("${params.rootDir}/src/workflows/utils/labels.config")