Merged
Commits
75 commits
78954c0
preproc script
dorien-er Mar 6, 2024
9068e7a
preproc script
dorien-er Mar 6, 2024
dbe5204
tokenize and pad script
dorien-er Mar 6, 2024
89a9c6a
tokenize and pad script
dorien-er Mar 6, 2024
9e446f8
embedding script
dorien-er Mar 6, 2024
94dd10c
test resourcers and evaluation script
dorien-er Mar 11, 2024
3edf3c0
cross check gene set
dorien-er Mar 11, 2024
085cdc4
pad_tokenize module
dorien-er Mar 12, 2024
724427e
updat image
dorien-er Mar 12, 2024
f9aadfa
remove test resources, update inputs
dorien-er Mar 13, 2024
33c9ffe
use pytorch image
dorien-er Mar 13, 2024
0c6316d
remove integration component
dorien-er Mar 13, 2024
47f5dda
remove nvidia reqs
dorien-er Mar 13, 2024
9d2ffd0
Merge branch 'main' of github.com:openpipelines-bio/openpipeline into…
jakubmajercik Mar 15, 2024
0f74ebd
remove load_model option
dorien-er Mar 18, 2024
52fb38c
Fix retag for viash-hub not using correct namespace separator (#745)
DriesSchaumont Mar 15, 2024
accf980
CI - Build: Fix second occurance of namespace separator (#746)
DriesSchaumont Mar 15, 2024
b1dd6ce
script to download scgpt test data
dorien-er Mar 18, 2024
18db6d6
remove test resources script
dorien-er Mar 18, 2024
6c3fec0
adjust preprocessing script
dorien-er Mar 19, 2024
acd3600
add scgpt full preproc module
dorien-er Mar 19, 2024
3e31204
integration submodule
dorien-er Mar 19, 2024
b5d1970
integration submodule and add normalize_total flag
dorien-er Mar 19, 2024
ec326f8
add params
dorien-er Mar 19, 2024
2dddc1c
Merge pull request #751 from openpipelines-bio/scgpt-preprocessor
dorien-er Mar 19, 2024
dbb0ea5
Add script to download scgpt test resources (#750)
dorien-er Mar 20, 2024
adcd6f0
embedding module
dorien-er Mar 20, 2024
bd7a32f
Merge pull request #755 from openpipelines-bio/scgpt-dev
dorien-er Mar 20, 2024
154ef26
add unit tests
dorien-er Mar 21, 2024
a7e08bc
undo subsampling test data
dorien-er Mar 21, 2024
1fe1386
update tests
dorien-er Mar 22, 2024
bfea411
update tests
dorien-er Mar 22, 2024
4eee70b
update memory requirements
dorien-er Mar 22, 2024
283de5b
update tests
dorien-er Mar 22, 2024
0496513
update changelog
dorien-er Mar 22, 2024
21be79a
update component name
dorien-er Mar 22, 2024
b7587ee
fix tests, update changelog
dorien-er Mar 22, 2024
045126a
run tests on subsampled data
dorien-er Mar 22, 2024
cf9da6e
adjust shm size
dorien-er Mar 22, 2024
c72575a
update test
dorien-er Mar 22, 2024
779006a
update memory requirements nextflow
dorien-er Mar 22, 2024
1e12613
update test
dorien-er Mar 22, 2024
b460c17
update test
dorien-er Mar 22, 2024
3cb4682
update test
dorien-er Mar 22, 2024
992cae7
expand unit tests, update script with loggers and todo
dorien-er Mar 24, 2024
9ccc4a3
Add ATAC demux (#726)
VladimirShitov Mar 25, 2024
5a2822a
run tests with subsampled data
dorien-er Mar 25, 2024
ab9a182
use specific model input files instead of directory
dorien-er Mar 26, 2024
418687a
update test data
dorien-er Mar 26, 2024
41b60be
Remove muon as test dependency for concatenate_h5mu. (#773)
DriesSchaumont Mar 27, 2024
7ec3ba4
scGPT binning component (#765)
dorien-er Mar 28, 2024
5f2e092
Merge branch 'develop' into scgpt
DriesSchaumont Mar 28, 2024
9e1d35a
update embedding dependencies and gene name layer handling
dorien-er Mar 28, 2024
4e9d916
update input handling
dorien-er Apr 3, 2024
c11db8c
include dsbn logic
dorien-er Apr 4, 2024
e3faf4b
update unit tests
dorien-er Apr 4, 2024
0ba4e9c
update config
dorien-er Apr 4, 2024
350ba33
expand unit tests, fix dsbn
dorien-er Apr 5, 2024
86ff4ef
Update CHANGELOG.md
dorien-er Apr 19, 2024
8fb4a68
Update src/scgpt/embedding/config.vsh.yaml
dorien-er Apr 19, 2024
e12b2e4
update required, remove shared memory docker
dorien-er Apr 19, 2024
b6083f0
Merge branch 'scgpt' into embed
dorien-er Apr 19, 2024
5bec37a
Add scGPT padding and tokenization component (#754)
dorien-er Apr 19, 2024
832d754
enable gpu device option
dorien-er Apr 19, 2024
e0ee58c
update dsbn
dorien-er Apr 19, 2024
5d6ef32
Merge branch 'scgpt' into embed
dorien-er Apr 19, 2024
d224787
remove temporary, unused components
dorien-er Apr 19, 2024
c3e159a
update error messages, remove device param
dorien-er Apr 24, 2024
2daa6f6
remove dropout param
dorien-er Apr 24, 2024
6ddd7c1
fix typo
dorien-er Apr 24, 2024
0ae8cdb
fix typo
dorien-er Apr 24, 2024
a5977ea
Merge pull request #761 from openpipelines-bio/embed
dorien-er Apr 25, 2024
638ac50
Generate scgpt cross check genes module (#758)
jakubmajercik Apr 25, 2024
94b955c
Merge branch 'main' into scgpt
dorien-er Jun 14, 2024
49285b0
undo concat changes
dorien-er Jun 14, 2024
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -18,6 +18,12 @@

* `reference/cellranger_mkgtf` component: Added cellranger mkgtf as a standalone component (PR #771).

* `scgpt/cross_check_genes` component: Added a gene-model cross check component for scGPT (PR #758).

* `scgpt/embedding` component: Added scGPT embedding component (PR #761).

* `scgpt/tokenize_pad` component: Added scGPT padding and tokenization component (PR #754).

* `scgpt/binning` component: Added a scGPT pre-processing binning component (PR #765).

## MINOR CHANGES
87 changes: 87 additions & 0 deletions src/scgpt/cross_check_genes/config.vsh.yaml
@@ -0,0 +1,87 @@
functionality:
name: cross_check_genes
namespace: "scgpt"
description: |
Cross-check genes with pre-trained scGPT model.
authors:
- __merge__: /src/authors/jakub_majercik.yaml
roles: [ maintainer, author ]
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ maintainer, author ]

argument_groups:
- name: Inputs
arguments:
- name: "--input"
type: file
direction: input
required: true
example: input.h5mu
description: |
The input h5mu file containing pre-processed data.
- name: "--modality"
type: string
default: "rna"
required: false
description: |
The modality key of the MuData object containing the RNA AnnData object.
- name: "--vocab_file"
type: file
direction: input
required: true
example: resources_test/scgpt/vocab.json
description: |
Model vocabulary file path.
- name: "--input_var_gene_names"
type: string
example: "gene_name"
required: false
description: |
The name of the adata.var column containing gene names. By default the .var index will be used.
- name: Outputs
arguments:
- name: "--output"
type: file
direction: output
required: true
example: output.h5mu
description: |
The output h5mu file containing the cross-checked genes.
- name: "--output_compression"
type: string
choices: ["gzip", "lzf"]
required: false
example: "gzip"
- name: Arguments
arguments:
- name: "--pad_token"
type: string
default: "<pad>"
required: false
description: |
The padding token used in the model.
resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset.h5mu
- path: /resources_test/scgpt/source/vocab.json

platforms:
- type: docker
image: nvcr.io/nvidia/pytorch:23.09-py3
setup:
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
- type: python
packages:
- scgpt==0.2.1
test_setup:
- type: python
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]
- type: nextflow
directives:
label: [ lowmem, lowcpu ]
68 changes: 68 additions & 0 deletions src/scgpt/cross_check_genes/script.py
@@ -0,0 +1,68 @@
import mudata as mu
import numpy as np
from scgpt.tokenizer.gene_tokenizer import GeneVocab

## VIASH START
par = {
"input": "resources_test/scgpt/test_resources/Kim2020_Lung_subset.h5mu",
"output": "output.h5mu",
"modality": "rna",
"input_var_gene_names": None,
"pad_token": "<pad>",
"vocab_file": "resources_test/scgpt/source/vocab.json"
}
## VIASH END

# START TEMPORARY WORKAROUND setup_logger
# reason: resources aren't available when using Nextflow fusion
# from setup_logger import setup_logger
def setup_logger():
import logging
from sys import stdout

logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)

return logger
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()
# Read in data
logger.info(f"Reading {par['input']}")
mudata = mu.read_h5mu(par["input"])
adata = mudata.mod[par["modality"]].copy()

pad_token = par["pad_token"]
special_tokens = [pad_token, "<cls>", "<eoc>"]

# Fetching gene names
if not par["input_var_gene_names"]:
genes = adata.var.index.astype(str).tolist()
elif par["input_var_gene_names"] not in adata.var.columns:
raise ValueError(f"Gene name column '{par['input_var_gene_names']}' not found in .mod['{par['modality']}'].var.")
else:
genes = adata.var[par["input_var_gene_names"]].astype(str).tolist()

# Cross-check genes with pre-trained model
logger.info(f"Loading model vocab from {par['vocab_file']}")
vocab_file = par["vocab_file"]
vocab = GeneVocab.from_file(vocab_file)
for token in special_tokens:
    if token not in vocab:
        vocab.append_token(token)

logger.info("Filtering genes based on model vocab")
adata.var["id_in_vocab"] = [1 if gene in vocab else -1 for gene in genes]

gene_ids_in_vocab = np.array(adata.var["id_in_vocab"])

logger.info("Subsetting input data based on genes present in model vocab")
adata = adata[:, adata.var["id_in_vocab"] >= 0]

mudata.mod[par["modality"]] = adata

logger.info(f"Writing to {par['output']}")
mudata.write_h5mu(par["output"], compression=par["output_compression"])
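The core of the cross-check step above — flagging genes by vocabulary membership and subsetting to the genes the model knows — can be sketched without scGPT or MuData. The vocabulary and gene names below are made up for illustration; the real component checks membership against a `GeneVocab` loaded from `vocab.json`:

```python
import numpy as np

# Hypothetical minimal vocabulary mimicking GeneVocab membership checks.
vocab = {"TP53": 0, "BRCA1": 1, "EGFR": 2, "<pad>": 3, "<cls>": 4, "<eoc>": 5}

genes = ["TP53", "FAKE1", "EGFR", "FAKE2"]

# 1 if the gene is in the vocab, -1 otherwise (same convention as the script above).
id_in_vocab = np.array([1 if g in vocab else -1 for g in genes])

# Keep only genes known to the model, mirroring adata[:, id_in_vocab >= 0].
kept = [g for g, flag in zip(genes, id_in_vocab) if flag >= 0]
print(kept)  # → ['TP53', 'EGFR']
```

In the component the same flags are stored in `adata.var["id_in_vocab"]`, which is what the unit tests below inspect.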
55 changes: 55 additions & 0 deletions src/scgpt/cross_check_genes/test.py
@@ -0,0 +1,55 @@
import pytest
import subprocess
from mudata import read_h5mu
import re
import sys

## VIASH START
meta = {
'executable': './target/docker/scgpt/cross_check_genes/cross_check_genes',
'resources_dir': './resources_test/scgpt/',
'config': './src/scgpt/cross_check_genes/config.vsh.yaml'
}
## VIASH END

input_path = meta["resources_dir"] + "Kim2020_Lung_subset.h5mu"
vocab_path = meta["resources_dir"] + "vocab.json"

def test_cross_check(run_component, random_path):
output_path = random_path(extension="h5mu")
args = [
"--input", input_path,
"--output", output_path,
"--modality", "rna",
"--vocab_file", vocab_path,
"--output_compression", "gzip"
]
run_component(args)

output_mudata = read_h5mu(output_path)
input_mudata = read_h5mu(input_path)

# Check added columns
assert {"gene_name", "id_in_vocab"}.issubset(set(output_mudata.mod["rna"].var.columns)), "Gene columns were not added."
# Check if genes were filtered
assert all(output_mudata.mod["rna"].var["id_in_vocab"] == 1), "Genes were not filtered."
# Check if number of observations is the same
assert output_mudata.mod["rna"].n_obs == input_mudata.mod["rna"].n_obs, "Number of observations changed."
assert output_mudata.n_obs == input_mudata.n_obs, "Number of observations changed."

def test_cross_check_invalid_gene_layer_raises(run_component, random_path):
output_path = random_path(extension="h5mu")
args = [
"--input", input_path,
"--output", output_path,
"--vocab_file", vocab_path,
"--input_var_gene_names", "dummy_var",
]

with pytest.raises(subprocess.CalledProcessError) as err:
run_component(args)
assert re.search(r"ValueError: Gene name column 'dummy_var' not found",
err.value.stdout.decode('utf-8'))

if __name__ == '__main__':
sys.exit(pytest.main([__file__]))
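The embedding component defined next consumes padded token ids and a padding mask (`--obsm_gene_tokens`, `--obsm_padding_mask`), which the `scgpt/tokenize_pad` component from the changelog produces. A toy sketch of that padding convention, with made-up token ids and a hypothetical pad id:

```python
import numpy as np

pad_id = 0  # hypothetical id of the "<pad>" token
seqs = [[5, 9, 3], [7, 2], [4]]  # per-cell gene token ids of unequal length

# Right-pad every sequence to the longest length in the batch.
max_len = max(len(s) for s in seqs)
tokens = np.full((len(seqs), max_len), pad_id, dtype=np.int64)
for i, s in enumerate(seqs):
    tokens[i, : len(s)] = s

# True where a position is padding, so the model can ignore it.
padding_mask = tokens == pad_id

print(tokens.tolist())        # → [[5, 9, 3], [7, 2, 0], [4, 0, 0]]
print(padding_mask.tolist())  # → [[False, False, False], [False, False, True], [False, True, True]]
```

The actual component stores these arrays in `.obsm` of the h5mu file under the keys configured below.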
143 changes: 143 additions & 0 deletions src/scgpt/embedding/config.vsh.yaml
@@ -0,0 +1,143 @@
functionality:
name: embedding
namespace: scgpt
description: |
Generation of cell embeddings for the integration of single cell transcriptomic count data using scGPT.
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ maintainer, author ]

argument_groups:
- name: Inputs
arguments:
- name: "--input"
type: file
direction: input
required: true
example: input.h5mu
description: |
The input h5mu file containing tokenized gene and count data.
- name: "--modality"
type: string
default: "rna"
required: false
- name: "--model"
type: file
direction: input
required: true
example: best_model.pt
description: |
Path to scGPT model file.
- name: "--model_vocab"
type: file
direction: input
required: true
example: vocab.json
description: |
Path to scGPT model vocabulary file.
- name: "--model_config"
type: file
direction: input
required: true
example: args.json
description: |
Path to scGPT model config file.
- name: "--obsm_gene_tokens"
type: string
default: "gene_id_tokens"
description: |
The key of the .obsm array containing the gene token ids.
- name: "--obsm_tokenized_values"
type: string
default: values_tokenized
description: |
The key of the .obsm array containing the count values of the tokenized genes
- name: "--obsm_padding_mask"
type: string
default: padding_mask
description: |
The key of the .obsm array containing the padding mask.
- name: "--var_gene_names"
type: string
description: |
The name of the .var column containing gene names. When not provided, the .var index will be used.
- name: "--obs_batch_label"
type: string
description: |
The name of the adata.obs column containing the batch labels. Must be provided when 'dsbn' is set to True.

- name: Outputs
arguments:
- name: "--output"
type: file
required: true
description: |
Path to the output h5mu file containing the pre-processed data as well as the scGPT embeddings.
direction: output
example: output.h5mu
- name: "--output_compression"
type: string
example: "gzip"
required: false
choices: ["gzip", "lzf"]
description: |
The compression algorithm to use for the output h5mu file.
- name: "--obsm_embeddings"
type: string
default: "X_scGPT"
description: |
The name of the adata.obsm array to which scGPT embeddings will be written.

- name: Arguments
arguments:
- name: "--pad_token"
type: string
default: "<pad>"
description: |
The token to be used for padding.
- name: "--pad_value"
type: integer
default: -2
description: |
The value of the padding token.
- name: "--batch_size"
type: integer
default: 64
description: |
The batch size to be used for inference.
- name: "--dsbn"
type: boolean
default: true
description: |
Whether to apply domain-specific batch normalization for generating embeddings. When set to True, 'obs_batch_label' must be set as well.

resources:
- type: python_script
path: script.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/scgpt/source
- path: /resources_test/scgpt/test_resources/Kim2020_Lung_subset.h5mu

platforms:
- type: docker
image: nvcr.io/nvidia/pytorch:23.09-py3
setup:
- type: python
__merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ]
- type: python
packages:
- scgpt==0.2.1
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml ]
- type: nextflow
directives:
label: [ midmem ]
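The `--dsbn` flag above toggles domain-specific batch normalization during embedding: conceptually, each batch label (the values of `--obs_batch_label`) is normalized with its own statistics rather than one global set. A toy NumPy sketch of the idea — not the scGPT implementation, and with made-up data:

```python
import numpy as np

# Two "domains" (batches) with very different scales.
x = np.array([[1.0], [2.0], [3.0], [10.0], [20.0], [30.0]])
domains = np.array([0, 0, 0, 1, 1, 1])  # e.g. values of --obs_batch_label

# Normalize each domain with its own mean and std.
out = np.empty_like(x)
for d in np.unique(domains):
    mask = domains == d
    mu = x[mask].mean(axis=0)
    sigma = x[mask].std(axis=0)
    out[mask] = (x[mask] - mu) / (sigma + 1e-8)

# Each domain now has approximately zero mean and unit variance,
# removing the per-batch scale difference before embedding.
```

This is why the config requires `--obs_batch_label` whenever `--dsbn` is true: without batch labels there is no grouping to normalize over.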