update docs

PGScatalog · May 26, 2022 · bc7b26a · bc7b26a
1 parent dfdadf0
commit bc7b26a
Show file tree

Hide file tree

Showing 22 changed files with 136 additions and 160 deletions.
diff --git a/assets/schemas/target_genomes.json b/assets/schemas/target_genomes.json
@@ -12,12 +12,12 @@
       "sample": {
         "type": "string",
         "pattern": "^\\S+$",
-        "errorMessage": "Sample name must be provided and cannot contain spaces"
+        "description": "Sample name must be provided and cannot contain spaces"
       },
       "vcf_path": {
         "type": "string",
         "pattern": "^\\S+\\.vcf\\.gz$",
-        "errorMessage": "VCF path must end with .vcf.gz, mutually exclusive with other formats",
+        "description": "VCF path must end with .vcf.gz, mutually exclusive with other formats",
         "anyOf": [
           {
             "type": "string",
@@ -31,7 +31,7 @@
         ]
       },
       "bfile_path": {
-        "errorMessage": "Plink 1 binary fileset prefix, must not end with bed / bim / fam, mutually exclusive with other formats",
+        "description": "Plink 1 binary fileset prefix, must not end with bed / bim / fam, mutually exclusive with other formats",
         "anyOf": [
           {
             "type": "string",
@@ -45,7 +45,7 @@
         ]
       },
       "pfile_path": {
-        "errorMessage": "Plink 2 binary fileset prefix, must not end with pvar / pgen / psam, mutually exclusive with other formats. Must not be zstd compressed.",
+        "description": "Plink 2 binary fileset prefix, must not end with pvar / pgen / psam, mutually exclusive with other formats. Must not be zstd compressed.",
         "anyOf": [
           {
             "type": "string",
@@ -59,7 +59,7 @@
         ]
       },
       "chrom": {
-        "errorMessage": "Chromosome of associated genomic data. If multiple chromosomes are in the data, set to an empty string",
+        "description": "Chromosome of associated genomic data. If multiple chromosomes are in the data, set to an empty string",
         "anyOf": [
           {
             "type": "null"

diff --git a/docs/_templates/globaltoc.html b/docs/_templates/globaltoc.html
@@ -6,6 +6,7 @@ <h3>Contents</h3>
   <li><a href="{{ pathto('how-to/index') }}">How-to guides</a></li>
   <li><a href="{{ pathto('reference/index') }}">Reference guide</a></li>
   <li><a href="{{ pathto('explanation/index') }}">Explanation</a></li>
+  <li><a href="{{ pathto('troubleshooting') }}">Troubleshooting</a></li>
   <li><a href="{{ pathto('glossary') }}">Glossary</a></li>
 </ul>
 

diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -8,6 +8,27 @@ will only occur in major versions with changes noted in this changelog.
 
 .. _`semantic versioning`: https://semver.org/
 
+
+pgsc_calc v1.0.0 (2022-05-24)
+--------------------------------
+
+This release produces scores that should be biologically meaningful. Significant
+effort has been made to validate calculated scores on different datasets. In the
+next release we'll add score validation to our test suite to make sure
+calculated scores stay valid in the future.
+
+Features
+~~~~~~~~
+
+- Add support for PLINK2 format (samplesheet structure changed)
+- Add support for allosomes (e.g. X, Y)
+- Improve PGS Catalog compatibility (e.g. missing other allele)
+- Add automatic liftover of scoring files to match target genome build
+- Performance improvements to support UK BioBank scale data (500,000 genomes)
+- Support calculation of multiple scores in parallel
+- Significantly improved test coverage (> 80%)
+- Lots of other small changes to improve correctness and handling edge cases
+
 pgsc_calc v0.1.3dev (2022-02-04)
 --------------------------------
 

diff --git a/docs/explanation/calculating.rst b/docs/explanation/calculating.rst
diff --git a/docs/explanation/goodpractices.rst b/docs/explanation/goodpractices.rst
diff --git a/docs/explanation/index.rst b/docs/explanation/index.rst
@@ -3,15 +3,8 @@
 Explanation
 ===========
 
-(These explanations haven't been written yet, sorry!)
-
 .. toctree::
    :maxdepth: 1
 
-   calculating
-   packaging
-   testing
-   platform   
-   goodpractices
-   troubleshooting
+   output
 
diff --git a/docs/explanation/output.rst b/docs/explanation/output.rst
@@ -0,0 +1,4 @@
+Understanding workflow output
+=============================
+
+
diff --git a/docs/explanation/packaging.rst b/docs/explanation/packaging.rst
diff --git a/docs/explanation/platform.rst b/docs/explanation/platform.rst
diff --git a/docs/getting-started.rst b/docs/getting-started.rst
@@ -56,7 +56,7 @@ workflow with bundled test data:
     [7c/5cca6c] process > PGSC_CALC:PGSCALC:MAKE_COMPATIBLE:PLINK2_BFILE (cineca_synthetic_subset)   [100%] 1 of 1 ✔
     [3b/ce0e39] process > PGSC_CALC:PGSCALC:MAKE_COMPATIBLE:MATCH_VARIANTS (cineca_synthetic_subset) [100%] 1 of 1 ✔
     [2e/fb3233] process > PGSC_CALC:PGSCALC:APPLY_SCORE:PLINK2_SCORE (cineca_synthetic_subset)       [100%] 1 of 1 ✔
-    [b5/fc5b1e] process > PGSC_CALC:PGSCALC:APPLY_SCORE:MAKE_REPORT (1)                              [100%] 1 of 1 ✔
+    [b5/fc5b1e] process > PGSC_CALC:PGSCALC:APPLY_SCORE:SCORE_REPORT (1)                             [100%] 1 of 1 ✔
     [03/009cb6] process > PGSC_CALC:PGSCALC:DUMPSOFTWAREVERSIONS (1)                                 [100%] 1 of 1 ✔
     -[pgscatalog/pgsc_calc] Pipeline completed successfully-
     

diff --git a/docs/glossary.rst b/docs/glossary.rst
@@ -40,15 +40,6 @@ Glossary
          A unique and stable identifier. PGS Catalog accessions start with the
          prefix PGS, e.g. `PGS000001`_
 
-     driver pod
-     pod
-         `A pod`_ is a description of one or more containers and its associated
-         computing resources (e.g. number of processes and RAM, but it's more
-         complicated than that). Kubernetes takes this description and tries to
-         make it exist on the cluster. The driver pod is responsible for managing
-         a workflow instance. The driver pod will monitor and submit each job in
-         the workflow as a separate worker pod.
-
      polygenic score
          A `polygenic score`_ (PGS) aggregates the effects of many genetic variants
          into a single number which predicts genetic predisposition for a
@@ -64,13 +55,7 @@ Glossary
          create the polygenic scoring file originally (i.e., those used to
          derive the risk alleles and weights).
 
-     worker pods
-         A pod, managed by the nextflow driver pod, that is responsible for
-         executing an atomic process in the workflow. They are created and
-         destroyed automatically by the driver pod.
-
 .. _CSVs are good: https://www.gov.uk/guidance/using-csv-file-format
-.. _A pod: https://kubernetes.io/docs/concepts/workloads/pods/
 .. _single nucleotide polymorphism: https://en.wikipedia.org/wiki/Single-nucleotide_polymorphism
 .. _UK BioBank: https://www.ukbiobank.ac.uk/    
 .. _PGS Catalog: https://www.pgscatalog.org

diff --git a/docs/how-to/bigjob.rst b/docs/how-to/bigjob.rst
@@ -116,10 +116,8 @@ Nextflow also supports submitting jobs to platforms like:
 - Google cloud
 - Azure cloud
 - Amazon cloud
-
-Check the `nextflow documentation`_ for configuration specifics. pgsc_calc is
-deployed and tested on a `local Kubernetes cluster`_, but it's not a recommended
-way of running the pipeline for normal users.
+- Kubernetes
+
+Check the `nextflow documentation`_ for configuration specifics.
 
 .. _`nextflow documentation`: https://nextflow.io/docs/latest/google.html
-.. _`local Kubernetes cluster`: https://github.com/PGScatalog/pgsc_calc/blob/master/conf/k8s.config
diff --git a/docs/how-to/index.rst b/docs/how-to/index.rst
@@ -23,13 +23,6 @@ Making genomic data and scorefiles compatible
    liftover
    effecttype
 
-Understanding workflow output
------------------------------
-
-.. toctree::
-   :maxdepth: 1
-
-   interpret
 
 Working with big data
 ---------------------

diff --git a/docs/how-to/interpret.rst b/docs/how-to/interpret.rst
diff --git a/docs/index.rst b/docs/index.rst
@@ -47,7 +47,7 @@ The workflow should output:
     [7c/5cca6c] process > PGSC_CALC:PGSCALC:MAKE_COMPATIBLE:PLINK2_BFILE (cineca_synthetic_subset)   [100%] 1 of 1 ✔
     [3b/ce0e39] process > PGSC_CALC:PGSCALC:MAKE_COMPATIBLE:MATCH_VARIANTS (cineca_synthetic_subset) [100%] 1 of 1 ✔
     [2e/fb3233] process > PGSC_CALC:PGSCALC:APPLY_SCORE:PLINK2_SCORE (cineca_synthetic_subset)       [100%] 1 of 1 ✔
-    [b5/fc5b1e] process > PGSC_CALC:PGSCALC:APPLY_SCORE:MAKE_REPORT (1)                              [100%] 1 of 1 ✔
+    [b5/fc5b1e] process > PGSC_CALC:PGSCALC:APPLY_SCORE:SCORE_REPORT (1)                             [100%] 1 of 1 ✔
     [03/009cb6] process > PGSC_CALC:PGSCALC:DUMPSOFTWAREVERSIONS (1)                                 [100%] 1 of 1 ✔
     -[pgscatalog/pgsc_calc] Pipeline completed successfully-
                 

diff --git a/docs/reference/api.rst b/docs/reference/api.rst
diff --git a/docs/reference/index.rst b/docs/reference/index.rst
@@ -8,4 +8,3 @@ Reference guides
 
    input
    params
-   api   
diff --git a/docs/reference/input.rst b/docs/reference/input.rst
@@ -1,8 +1,10 @@
-Input schema
-============
+Input (samplesheet) schema
+==========================
 
-The documentation below is automatically generated from the input schema and
-contains additional technical detail. 
+The documentation below is automatically generated from the schema. The JSON
+file contains additional technical detail not shown in the table below.
 
-.. jsonschema:: ../../assets/schema_input.json
-.. _`example`: https://github.com/PGScatalog/pgsc_calc/blob/master/assets/api_examples/input.json
+Each row in a samplesheet can only have a single genomic data format (i.e. the
+formats are mutually exclusive).
+
+.. jsonschema:: ../../assets/schemas/target_genomes.json
diff --git a/docs/reference/params.rst b/docs/reference/params.rst
@@ -6,4 +6,13 @@ Parameter reference
 The documentation below is automatically generated from the input schema and
 contains additional technical detail.
 
-.. jsonschema:: ../../nextflow_schema.json
+**Parameters in bold** are required and must be set by the user.
+
+.. jsonschema:: ../../nextflow_schema.json 
+    :lift_description:
+    :lift_definitions:
+    :auto_target:
+    :auto_reference:       
+
+
+
diff --git a/docs/explanation/troubleshooting.rst → docs/troubleshooting.rst b/docs/explanation/troubleshooting.rst → docs/troubleshooting.rst
@@ -1,3 +1,5 @@
+:orphan:
+
 .. _troubleshoot:
 
 Troubleshooting

diff --git a/modules/local/score_report.nf b/modules/local/score_report.nf
@@ -0,0 +1,34 @@
+// SCORE_REPORT: renders an R Markdown report with R.
+// The script copies the `report` input to report.rmd, renders it with
+// rmarkdown::render (self_contained=TRUE), and writes the R version to
+// versions.yml.
+process SCORE_REPORT {
+    label 'process_high_memory'
+
+    // A conda environment is requested only when params.enable_conda is set;
+    // otherwise the directive receives null.
+    conda (params.enable_conda ? "conda-forge::r-tidyverse=1.3.1 conda-forge::r-rsqlite=2.1.1" : null)
+    // The oras:// image is used when the container engine is singularity and
+    // task.ext.singularity_pull_docker_container is not set; in every other
+    // case the Docker image is used.
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'oras://dockerhub.ebi.ac.uk/gdp-public/pgsc_calc/singularity/mulled-v2-e5054a4b868f4ffd21311d4e05426694e2c7fb5e:17fe01267c936fedcbd51470941b075c42b08c23-0' :
+        'dockerhub.ebi.ac.uk/gdp-public/pgsc_calc/mulled-v2-e5054a4b868f4ffd21311d4e05426694e2c7fb5e:17fe01267c936fedcbd51470941b075c42b08c23-0' }"
+
+    input:
+    // Only `report` is referenced by the script below. scorefiles, logo and db
+    // are declared but never named in the script; presumably the R Markdown
+    // document reads them itself (TODO confirm against the report template).
+    path scorefiles
+    path report
+    path logo
+    path db
+
+    output:
+    // The .html and .txt outputs are matched by glob; the script never names
+    // them explicitly. Presumably they are produced while rendering the
+    // report (TODO confirm). versions.yml is written by the heredoc below.
+    path "*.html"      , emit: report
+    path "*.txt"       , emit: scores
+    path "versions.yml", emit: versions
+
+    script:
+    // NOTE(review): `args` is defined here but not used anywhere in the script
+    // string below.
+    def args = task.ext.args ?: ''
+    // The script's own comment describes the `cp` as a workaround for symlink
+    // and output-directory handling in rmarkdown: the report is copied to a
+    // regular file (report.rmd) before rendering.
+    """
+    # dumb workaround symlink & out_dir (rmarkdown)
+    cp $report report.rmd
+
+    R -e 'rmarkdown::render("report.rmd", \
+        output_options = list(self_contained=TRUE))'
+
+    cat <<-END_VERSIONS > versions.yml
+    ${task.process.tokenize(':').last()}:
+        R: \$(echo \$(R --version 2>&1) | head -n 1 | cut -f 3 -d ' ')
+    END_VERSIONS
+    """
+}