more docs
nebfield committed Feb 2, 2022
1 parent 1e1a7f5 commit ecc92a8
Showing 24 changed files with 539 additions and 279 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
Original file line number Diff line number Diff line change
@@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/setup-python@v2
- run: pip install sphinx-book-theme sphinx-jsonschema
- run: pip install sphinx-book-theme sphinx-jsonschema sphinxemoji
- uses: actions/checkout@master
with:
fetch-depth: 0 # otherwise, you will fail to push refs to dest repo
3 changes: 3 additions & 0 deletions .nf-core.yml
@@ -14,6 +14,9 @@ lint:
# docs
- docs/images/nf-core-pgscatalog/pgsc_calc_logo_light.png
- docs/images/nf-core-pgscatalog/pgsc_calc_logo_dark.png
- docs/output.md
- docs/README.md
- docs/usage.md
# assets
- assets/nf-core-pgscatalog/pgsc_calc_logo_light.png
- assets/email_template.html
7 changes: 0 additions & 7 deletions assets/README.md

This file was deleted.

16 changes: 7 additions & 9 deletions assets/api_examples/input.json
@@ -1,9 +1,7 @@
{
"target_genomes": [
{
"sample": "example",
"vcf_path": "/path/to/genome.vcf.gz",
"chrom": 22
}
]
}
[
{
"sample": "example",
"vcf_path": "/path/to/genome.vcf.gz",
"chrom": 22
}
]
6 changes: 2 additions & 4 deletions assets/api_examples/params.json
@@ -1,6 +1,4 @@
{
"nxf_params_file": {
"scorefile": "/path/to/scorefile.txt",
"format": "json"
}
"scorefile": "/path/to/scorefile.txt",
"format": "json"
}
1 change: 1 addition & 0 deletions assets/examples/example_data/samplesheet.json
@@ -0,0 +1 @@
[{"sample":"cineca_synthetic_subset","vcf_path":null,"chrom":22,"bed":"https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bed","bim":"https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.bim","fam":"https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/cineca_synthetic_subset.fam"}]
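The single-line samplesheet above is hard to eyeball, and the `bed`/`bim`/`fam` keys are easy to swap by accident. A quick sketch (standard library only; the URLs below are placeholders, not the real test-dataset locations) parses a samplesheet of the same shape, checks that each plink fileset key points at a file with the matching extension, and pretty-prints it:

```python
import json

# Illustrative samplesheet in the same shape as the committed one;
# the URLs are placeholders, not the real test-dataset locations.
raw = (
    '[{"sample":"cineca_synthetic_subset","vcf_path":null,"chrom":22,'
    '"bed":"https://example.org/cineca_synthetic_subset.bed",'
    '"bim":"https://example.org/cineca_synthetic_subset.bim",'
    '"fam":"https://example.org/cineca_synthetic_subset.fam"}]'
)
samplesheet = json.loads(raw)

# Each plink 1 fileset key should point at a file with the matching extension
for entry in samplesheet:
    for key in ("bed", "bim", "fam"):
        assert entry[key].endswith("." + key), f"'{key}' points at the wrong file"

# Pretty-print for human inspection
print(json.dumps(samplesheet, indent=2))
```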
105 changes: 70 additions & 35 deletions assets/schema_input.json
@@ -1,39 +1,74 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/nf-core/pgscalc/master/assets/schema_input.json",
"title": "nf-core/pgscalc pipeline - params.input schema",
"description": "Schema for the file provided with params.input",
"type": "array",
"items": {
"type": "object",
"properties": {
"sample": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Sample name must be provided and cannot contain spaces"
},
"fastq_1": {
"type": "string",
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
},
"fastq_2": {
"errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'",
"anyOf": [
{
"type": "string",
"pattern": "^\\S+\\.f(ast)?q\\.gz$"
},
{
"type": "string",
"maxLength": 0
}
]
}
},
"required": [
"sample",
"fastq_1"
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/pgscatalog/pgsc_calc/master/assets/schema_input.json",
"title": "Input schema",
"description": "Schema for the file provided with params.input",
"type": "array",
"uniqueItems": true,
"minItems": 1,
"items": {
"type": "object",
"properties": {
"sample": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Sample name must be provided and cannot contain spaces"
},
"vcf_path": {
"errorMessage": "VCF path must end with .vcf.gz, mutually exclusive with bfile_path",
"anyOf": [
{
"type": "string",
"pattern": "^\\S+\\.vcf\\.gz$"
},
{
"type": "string",
"maxLength": 0
},
{
"type": "null"
}
]
},
"bfile_path": {
"errorMessage": "Plink 1 binary fileset prefix, must not end with bed / bim / fam, mutually exclusive with vcf_path",
"anyOf": [
{
"type": "string",
"pattern": "^\\S+$"
},
{
"type": "string",
"maxLength": 0
}
]
},
"chrom": {
"errorMessage": "Chromosome must be null or an integer between 1 and 22",
"anyOf": [
{
"type": "null"
},
{
"type": "integer",
"minimum": 1,
"maximum": 22
}
]
}
},
"required": [
"sample",
"chrom"
],
"not": {
"anyOf": [
{
"required": [
"vcf_path",
"bfile_path"
]
}
]
}
}
}
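One way to exercise this schema is with the third-party Python ``jsonschema`` package. The sketch below inlines a trimmed, slightly simplified copy of the schema (error messages dropped, the mutual-exclusivity rule collapsed to a bare ``not``/``required``) purely for illustration; the pipeline itself loads the full file from ``assets/schema_input.json``:

```python
import json
from jsonschema import Draft7Validator  # third-party: pip install jsonschema

# Trimmed, simplified copy of the input schema above, inlined for illustration
schema = {
    "$schema": "http://json-schema.org/draft-07/schema",
    "type": "array",
    "minItems": 1,
    "items": {
        "type": "object",
        "properties": {
            "sample": {"type": "string", "pattern": "^\\S+$"},
            "vcf_path": {
                "anyOf": [
                    {"type": "string", "pattern": "^\\S+\\.vcf\\.gz$"},
                    {"type": "string", "maxLength": 0},
                    {"type": "null"},
                ]
            },
            "bfile_path": {
                "anyOf": [
                    {"type": "string", "pattern": "^\\S+$"},
                    {"type": "string", "maxLength": 0},
                ]
            },
            "chrom": {
                "anyOf": [
                    {"type": "null"},
                    {"type": "integer", "minimum": 1, "maximum": 22},
                ]
            },
        },
        "required": ["sample", "chrom"],
        # vcf_path and bfile_path are mutually exclusive
        "not": {"required": ["vcf_path", "bfile_path"]},
    },
}

validator = Draft7Validator(schema)

good = [{"sample": "example", "vcf_path": "/path/to/genome.vcf.gz", "chrom": 22}]
bad = [{"sample": "example", "vcf_path": "a.vcf.gz", "bfile_path": "a", "chrom": 22}]

print(validator.is_valid(good))  # valid entry
print(validator.is_valid(bad))   # rejected: both path types set at once
```

The same ``not``/``required`` trick used in the full schema is what rejects ``bad``: an item that declares both ``vcf_path`` and ``bfile_path`` matches the forbidden ``required`` clause and fails validation.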
4 changes: 2 additions & 2 deletions assets/schema_k8s.json
@@ -1,8 +1,8 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://raw.githubusercontent.com/PGScatalog/pgsc_calc/master/assets/schema_k8s.json",
"title": "PGS Catalog Calculator Parameters",
"description": "Parameters required to launch an instance of the PGS Catalog Calculator from IGS4EU backend on Embassy cloud",
"title": "PGS Catalog Calculator API Parameters",
"description": "Parameters required to launch an instance of the PGS Catalog Calculator from K8s cluster on Embassy cloud",
"type": "object",
"properties": {
"target_genomes": {
3 changes: 3 additions & 0 deletions conf/k8s.config
@@ -11,6 +11,9 @@ k8s {
pod = [[volumeClaim: "example-dataset", mountPath: "/data"]]
}

// clean up work directory
cleanup = true

docker {
enabled = true
}
13 changes: 0 additions & 13 deletions docs/README.md

This file was deleted.

9 changes: 5 additions & 4 deletions docs/api.rst
@@ -46,7 +46,8 @@ Specifying workflow parameters with JSON
:language: JSON

Some other parameters need to be set for the workflow to run, which are
specified in a simple JSON object. This object can be complex, because many
specified in a simple JSON object. Nextflow supports setting parameters via JSON
with the ``-params-file`` flag. This object can be complex, because many
optional parameters can be set here. A minimal workflow parameter object must
contain:

@@ -55,7 +56,7 @@ contain:
scorefile)
- The format must be "json"

The :ref:`JSON schema` specifies optional parameters in full.
The JSON :ref:`schema` specifies optional parameters in full.
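As a sketch of the minimal parameter object described above (the scorefile path is a placeholder), the file can be written from Python and then handed to Nextflow's ``-params-file`` flag:

```python
import json

# Minimal workflow parameter object; the scorefile path is a placeholder
params = {
    "scorefile": "/path/to/scorefile.txt",  # path to a scoring file
    "format": "json",                       # must be "json"
}

with open("params.json", "w") as f:
    json.dump(params, f, indent=2)

# The resulting file is passed to Nextflow at launch, e.g.:
#   nextflow run <pipeline> -params-file params.json
```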

API call
~~~~~~~~
@@ -77,7 +78,7 @@ This documentation is useful for a human, but not a computer, so we wrote a
document (`a JSON schema`_) that describes the data format. The schema is used
to automatically validate data submitted to the workflow via the API.

.. _a JSON schema: https://raw.githubusercontent.com/PGScatalog/pgsc_calc/master/assets/schema_k8s.json
.. _a JSON schema: https://json-schema.org/

.. jsonschema:: ../assets/schema_k8s.json

@@ -89,7 +90,7 @@ Events`_. Briefly, a sensor constantly listens on a Kubernetes cluster for Kafka
messages to launch the pipeline. Once the message is received, a nextflow driver
pod is created and the workflow is executed using the `K8S executor`_. The
status of the workflow instance is reported using Nextflow's `weblog`_ and
a second sensor.
a second sensor; Nextflow Tower was not available when this infrastructure was built.

.. _Argo Events: https://argoproj.github.io/argo-events/
.. _K8S executor: https://www.nextflow.io/docs/latest/kubernetes.html
5 changes: 4 additions & 1 deletion docs/conf.py
@@ -31,9 +31,12 @@
'sphinx.ext.githubpages',
'sphinx.ext.autosectionlabel',
'sphinx.ext.autodoc',
'sphinx-jsonschema'
'sphinx-jsonschema',
'sphinxemoji.sphinxemoji'
]

sphinxemoji_style = 'twemoji'

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

69 changes: 50 additions & 19 deletions docs/glossary.rst
@@ -2,35 +2,41 @@ Glossary
========

.. glossary::
accession
A unique and stable identifier

polygenic score
A `polygenic score`_ (PGS) aggregates the effects of many genetic variants
into a single number which predicts genetic predisposition for a
phenotype. PGS are typically composed of hundreds-to-millions of genetic
variants (usually SNPs) which are combined using a weighted sum of allele
dosages multiplied by their corresponding effect sizes, as estimated from
a relevant genome-wide association study (GWAS).
CSV
Comma-separated values, a popular plain text file format. `CSVs are
good`_. Please don't use ``.xlsx`` (Excel), it makes bioinformaticians
sad.

.. _polygenic score: https://www.pgscatalog.org/about/
JSON
JavaScript Object Notation, a popular plain-text data interchange
format.

PGS Catalog
The `PGS Catalog`_ is an open database of published polygenic scores
(PGS). If you develop and publish polygenic scores, please consider
`submitting them`_ to the Catalog!

.. _PGS Catalog: https://www.pgscatalog.org
.. _submitting them: https://www.pgscatalog.org/submit/

PGS Catalog Calculator
This cool workflow
``pgsc_calc`` - this cool workflow!

SNP
A `single nucleotide polymorphism`_

Scoring file
A file for scoring
A file containing risk alleles and derived weights for a specific
phenotype. Weights are typically calculated with 1) GWAS summary
statistics and 2) a large population of people with known phenotypes
(e.g. the `UK BioBank`_). These files are hopefully published in the
PGS Catalog.

SNP
A single nucleotide polymorphism
VCF
Variant Call Format. A popular `standard file format`_ used to store
genetic variants.

accession
A unique and stable identifier. PGS Catalog accessions start with the
prefix PGS, e.g. `PGS000001`_

driver pod
pod
@@ -41,7 +47,32 @@ Glossary
a workflow instance. The driver pod will monitor and submit each job in
the workflow as a separate worker pod.

.. _A pod: https://kubernetes.io/docs/concepts/workloads/pods/
polygenic score
A `polygenic score`_ (PGS) aggregates the effects of many genetic variants
into a single number which predicts genetic predisposition for a
phenotype. PGS are typically composed of hundreds-to-millions of genetic
variants (usually SNPs) which are combined using a weighted sum of allele
dosages multiplied by their corresponding effect sizes, as estimated from
a relevant genome-wide association study (GWAS).

target genomic data
Genomes that you want to calculate polygenic scores for. Scores are
calculated from an existing scoring file that contains risk alleles and
associated weights. These genomes are distinct from those used to
create the polygenic scoring file originally (i.e., those used to
derive the risk alleles and weights).

worker pods
Pods unite! You have nothing to lose but your chains.
A pod, managed by the nextflow driver pod, that is responsible for
executing an atomic process in the workflow. They are created and
destroyed automatically by the driver pod.

.. _CSVs are good: https://www.gov.uk/guidance/using-csv-file-format
.. _A pod: https://kubernetes.io/docs/concepts/workloads/pods/
.. _single nucleotide polymorphism: https://en.wikipedia.org/wiki/Single-nucleotide_polymorphism
.. _UK BioBank: https://www.ukbiobank.ac.uk/
.. _PGS Catalog: https://www.pgscatalog.org
.. _submitting them: https://www.pgscatalog.org/submit/
.. _PGS000001: https://www.pgscatalog.org/score/PGS000001/
.. _standard file format: https://samtools.github.io/hts-specs/VCFv4.2.pdf
.. _polygenic score: https://www.pgscatalog.org/about/
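The weighted-sum definition of a polygenic score in the glossary above can be made concrete with a tiny worked example (illustrative numbers only, not taken from any real scoring file):

```python
# Toy illustration of a polygenic score as a weighted sum:
# PGS = sum over variants of (allele dosage * effect weight).
# Dosages count copies of the effect allele (0, 1, or 2) per variant;
# weights are made-up effect sizes, not from any real scoring file.
dosages = [2, 1, 0, 1]
weights = [0.25, 0.5, 0.75, 1.0]

pgs = sum(d * w for d, w in zip(dosages, weights))
print(pgs)  # -> 2.0  (2*0.25 + 1*0.5 + 0*0.75 + 1*1.0)
```

In the real workflow the dosages come from the target genomic data (VCF or plink fileset) and the weights from a scoring file, but the arithmetic per sample is exactly this dot product.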