From bc7b26a6de6a6d19c274c047aac4b4798d52429e Mon Sep 17 00:00:00 2001
From: Benjamin Wingfield <bwingfield@ebi.ac.uk>
Date: Tue, 24 May 2022 14:53:59 +0100
Subject: [PATCH] update docs

---
 assets/schemas/target_genomes.json         | 10 +--
 docs/_templates/globaltoc.html             |  1 +
 docs/changelog.rst                         | 21 ++++++
 docs/explanation/calculating.rst           |  2 -
 docs/explanation/goodpractices.rst         |  2 -
 docs/explanation/index.rst                 |  9 +--
 docs/explanation/output.rst                |  4 ++
 docs/explanation/packaging.rst             |  4 --
 docs/explanation/platform.rst              |  2 -
 docs/getting-started.rst                   |  2 +-
 docs/glossary.rst                          | 15 ----
 docs/how-to/bigjob.rst                     |  8 +--
 docs/how-to/index.rst                      |  7 --
 docs/how-to/interpret.rst                  | 13 ----
 docs/index.rst                             |  2 +-
 docs/reference/api.rst                     | 53 ---------------
 docs/reference/index.rst                   |  1 -
 docs/reference/input.rst                   | 14 ++--
 docs/reference/params.rst                  | 11 ++-
 docs/{explanation => }/troubleshooting.rst |  2 +
 modules/local/score_report.nf              | 34 ++++++++++
 nextflow_schema.json                       | 79 ++++++++++++----------
 22 files changed, 136 insertions(+), 160 deletions(-)
 delete mode 100644 docs/explanation/calculating.rst
 delete mode 100644 docs/explanation/goodpractices.rst
 create mode 100644 docs/explanation/output.rst
 delete mode 100644 docs/explanation/packaging.rst
 delete mode 100644 docs/explanation/platform.rst
 delete mode 100644 docs/how-to/interpret.rst
 delete mode 100644 docs/reference/api.rst
 rename docs/{explanation => }/troubleshooting.rst (98%)
 create mode 100644 modules/local/score_report.nf

diff --git a/assets/schemas/target_genomes.json b/assets/schemas/target_genomes.json
index a1bcb884..516506be 100644
--- a/assets/schemas/target_genomes.json
+++ b/assets/schemas/target_genomes.json
@@ -12,12 +12,12 @@
       "sample": {
         "type": "string",
         "pattern": "^\\S+$",
-        "errorMessage": "Sample name must be provided and cannot contain spaces"
+        "description": "Sample name must be provided and cannot contain spaces"
       },
       "vcf_path": {
         "type": "string",
         "pattern": "^\\S+\\.vcf\\.gz$",
-        "errorMessage": "VCF path must end with .vcf.gz, mutually exclusive with other formats",
+        "description": "VCF path must end with .vcf.gz, mutually exclusive with other formats",
         "anyOf": [
           {
             "type": "string",
@@ -31,7 +31,7 @@
         ]
       },
       "bfile_path": {
-        "errorMessage": "Plink 1 binary fileset prefix, must not end with bed / bim / fam, mutually exclusive with other formats",
+        "description": "Plink 1 binary fileset prefix, must not end with bed / bim / fam, mutually exclusive with other formats",
         "anyOf": [
           {
             "type": "string",
@@ -45,7 +45,7 @@
         ]
       },
       "pfile_path": {
-        "errorMessage": "Plink 2 binary fileset prefix, must not end with pvar / pgen / psam, mutually exclusive with other formats. Must not be zstd compressed.",
+        "description": "Plink 2 binary fileset prefix, must not end with pvar / pgen / psam, mutually exclusive with other formats. Must not be zstd compressed.",
         "anyOf": [
           {
             "type": "string",
@@ -59,7 +59,7 @@
         ]
       },
       "chrom": {
-        "errorMessage": "Chromosome of associated genomic data. If multiple chromosomes are in the data, set to an empty string",
+        "description": "Chromosome of associated genomic data. If multiple chromosomes are in the data, set to an empty string",
         "anyOf": [
           {
             "type": "null"
diff --git a/docs/_templates/globaltoc.html b/docs/_templates/globaltoc.html
index 2dec734d..07be32d0 100644
--- a/docs/_templates/globaltoc.html
+++ b/docs/_templates/globaltoc.html
@@ -6,6 +6,7 @@ <h3>Contents</h3>
   <li><a href="{{ pathto('how-to/index') }}">How-to guides</a></li>
   <li><a href="{{ pathto('reference/index') }}">Reference guide</a></li>
   <li><a href="{{ pathto('explanation/index') }}">Explanation</a></li>
+  <li><a href="{{ pathto('troubleshooting') }}">Troubleshooting</a></li>
   <li><a href="{{ pathto('glossary') }}">Glossary</a></li>
 </ul>
 
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 774f5d3c..9b087af5 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -8,6 +8,27 @@ will only occur in major versions with changes noted in this changelog.
 
 .. _`semantic versioning`: https://semver.org/
 
+
+pgsc_calc v1.0.0 (2022-05-24)
+--------------------------------
+
+This release produces scores that should be biologically meaningful. Significant
+effort has been made to validate calculated scores on different datasets. In the
+next release we'll add score validation to our test suite to make sure
+calculated scores stay valid in the future.
+
+Features
+~~~~~~~~
+
+- Add support for PLINK2 format (samplesheet structure changed)
+- Add support for allosomes (e.g. X, Y)
+- Improve PGS Catalog compatibility (e.g. missing other allele)
+- Add automatic liftover of scoring files to match target genome build
+- Performance improvements to support UK BioBank scale data (500,000 genomes)
+- Support calculation of multiple scores in parallel
+- Significantly improved test coverage (> 80%)
+- Lots of other small changes to improve correctness and handling edge cases
+
 pgsc_calc v0.1.3dev (2022-02-04)
 --------------------------------
 
diff --git a/docs/explanation/calculating.rst b/docs/explanation/calculating.rst
deleted file mode 100644
index be1b7e6f..00000000
--- a/docs/explanation/calculating.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Calculating polygenic scores
-============================
diff --git a/docs/explanation/goodpractices.rst b/docs/explanation/goodpractices.rst
deleted file mode 100644
index 4c6ee0c1..00000000
--- a/docs/explanation/goodpractices.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Good practices for polygenic score analysis
-===========================================
diff --git a/docs/explanation/index.rst b/docs/explanation/index.rst
index b8cebf3e..7fa8f3f1 100644
--- a/docs/explanation/index.rst
+++ b/docs/explanation/index.rst
@@ -3,15 +3,8 @@
 Explanation
 ===========
 
-(These explanations haven't been written yet, sorry!)
-
 .. toctree::
    :maxdepth: 1
 
-   calculating
-   packaging
-   testing
-   platform   
-   goodpractices
-   troubleshooting
+   output
 
diff --git a/docs/explanation/output.rst b/docs/explanation/output.rst
new file mode 100644
index 00000000..12d41231
--- /dev/null
+++ b/docs/explanation/output.rst
@@ -0,0 +1,4 @@
+Understanding workflow output
+=============================
+
+
diff --git a/docs/explanation/packaging.rst b/docs/explanation/packaging.rst
deleted file mode 100644
index 07a77e6d..00000000
--- a/docs/explanation/packaging.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-.. _containers:
-
-Packaging bioinformatics software with containers
-=================================================
diff --git a/docs/explanation/platform.rst b/docs/explanation/platform.rst
deleted file mode 100644
index 938f32c9..00000000
--- a/docs/explanation/platform.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Building a web platform for polygenic risk score calculation
-============================================================
diff --git a/docs/getting-started.rst b/docs/getting-started.rst
index 74859ab8..93a30d9a 100644
--- a/docs/getting-started.rst
+++ b/docs/getting-started.rst
@@ -56,7 +56,7 @@ workflow with bundled test data:
     [7c/5cca6c] process > PGSC_CALC:PGSCALC:MAKE_COMPATIBLE:PLINK2_BFILE (cineca_synthetic_subset)   [100%] 1 of 1 ✔
     [3b/ce0e39] process > PGSC_CALC:PGSCALC:MAKE_COMPATIBLE:MATCH_VARIANTS (cineca_synthetic_subset) [100%] 1 of 1 ✔
     [2e/fb3233] process > PGSC_CALC:PGSCALC:APPLY_SCORE:PLINK2_SCORE (cineca_synthetic_subset)       [100%] 1 of 1 ✔
-    [b5/fc5b1e] process > PGSC_CALC:PGSCALC:APPLY_SCORE:MAKE_REPORT (1)                              [100%] 1 of 1 ✔
+    [b5/fc5b1e] process > PGSC_CALC:PGSCALC:APPLY_SCORE:SCORE_REPORT (1)                             [100%] 1 of 1 ✔
     [03/009cb6] process > PGSC_CALC:PGSCALC:DUMPSOFTWAREVERSIONS (1)                                 [100%] 1 of 1 ✔
     -[pgscatalog/pgsc_calc] Pipeline completed successfully-
     
diff --git a/docs/glossary.rst b/docs/glossary.rst
index 94a2b642..c037f0ac 100644
--- a/docs/glossary.rst
+++ b/docs/glossary.rst
@@ -40,15 +40,6 @@ Glossary
          A unique and stable identifier. PGS Catalog accessions start with the
          prefix PGS, e.g. `PGS000001`_
 
-     driver pod
-     pod
-         `A pod`_ is a description of one or more containers and its associated
-         computing resources (e.g. number of processes and RAM, but it's more
-         complicated than that). Kubernetes takes this description and tries to
-         make it exist on the cluster. The driver pod is responsible for managing
-         a workflow instance. The driver pod will monitor and submit each job in
-         the workflow as a separate worker pod.
-
      polygenic score
          A `polygenic score`_ (PGS) aggregates the effects of many genetic variants
          into a single number which predicts genetic predisposition for a
@@ -64,13 +55,7 @@ Glossary
          create the polygenic scoring file originally (i.e., those used to
          derive the risk alleles and weights).
 
-     worker pods
-         A pod, managed by the nextflow driver pod, that is responsible for
-         executing an atomic process in the workflow. They are created and
-         destroyed automatically by the driver pod.
-
 .. _CSVs are good: https://www.gov.uk/guidance/using-csv-file-format
-.. _A pod: https://kubernetes.io/docs/concepts/workloads/pods/
 .. _single nucleotide polymorphism: https://en.wikipedia.org/wiki/Single-nucleotide_polymorphism
 .. _UK BioBank: https://www.ukbiobank.ac.uk/    
 .. _PGS Catalog: https://www.pgscatalog.org
diff --git a/docs/how-to/bigjob.rst b/docs/how-to/bigjob.rst
index 5dbbfa7a..98c11e4c 100644
--- a/docs/how-to/bigjob.rst
+++ b/docs/how-to/bigjob.rst
@@ -116,10 +116,8 @@ Nextflow also supports submitting jobs platforms like:
 - Google cloud
 - Azure cloud
 - Amazon cloud
-
-Check the `nextflow documentation`_ for configuration specifics. pgsc_calc is
-deployed and tested on a `local Kubernetes cluster`_, but it's not a recommended
-way of running the pipeline for normal users.
+- Kubernetes
+  
+Check the `nextflow documentation`_ for configuration specifics.
 
 .. _`nextflow documentation`: https://nextflow.io/docs/latest/google.html
-.. _`local Kubernetes cluster`: https://github.com/PGScatalog/pgsc_calc/blob/master/conf/k8s.config
diff --git a/docs/how-to/index.rst b/docs/how-to/index.rst
index af8903ef..12b8857d 100644
--- a/docs/how-to/index.rst
+++ b/docs/how-to/index.rst
@@ -23,13 +23,6 @@ Making genomic data and scorefiles compatible
    liftover
    effecttype
    
-Understanding workflow output
------------------------------
-
-.. toctree::
-   :maxdepth: 1
-
-   interpret
 
 Working with big data
 ---------------------
diff --git a/docs/how-to/interpret.rst b/docs/how-to/interpret.rst
deleted file mode 100644
index fa64cd5c..00000000
--- a/docs/how-to/interpret.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-.. _interpret:
-
-How to interpret workflow output
-================================
-
-The most interesting output is published by default to ``output/make``,
-including:
-
-- A report summarising the score calculation process
-- A text file containing aggregated scores
-
-
-
diff --git a/docs/index.rst b/docs/index.rst
index 891c941a..20414fb5 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -47,7 +47,7 @@ The workflow should output:
     [7c/5cca6c] process > PGSC_CALC:PGSCALC:MAKE_COMPATIBLE:PLINK2_BFILE (cineca_synthetic_subset)   [100%] 1 of 1 ✔
     [3b/ce0e39] process > PGSC_CALC:PGSCALC:MAKE_COMPATIBLE:MATCH_VARIANTS (cineca_synthetic_subset) [100%] 1 of 1 ✔
     [2e/fb3233] process > PGSC_CALC:PGSCALC:APPLY_SCORE:PLINK2_SCORE (cineca_synthetic_subset)       [100%] 1 of 1 ✔
-    [b5/fc5b1e] process > PGSC_CALC:PGSCALC:APPLY_SCORE:MAKE_REPORT (1)                              [100%] 1 of 1 ✔
+    [b5/fc5b1e] process > PGSC_CALC:PGSCALC:APPLY_SCORE:SCORE_REPORT (1)                             [100%] 1 of 1 ✔
     [03/009cb6] process > PGSC_CALC:PGSCALC:DUMPSOFTWAREVERSIONS (1)                                 [100%] 1 of 1 ✔
     -[pgscatalog/pgsc_calc] Pipeline completed successfully-
                 
diff --git a/docs/reference/api.rst b/docs/reference/api.rst
deleted file mode 100644
index b31e8fa1..00000000
--- a/docs/reference/api.rst
+++ /dev/null
@@ -1,53 +0,0 @@
-API reference
-=============
-
-``pgsc_calc`` has two main use cases:
-
-- A bioinformatician or data scientist wants to calculate some polygenic scores
-  using an Unixy operating system and a terminal
-- A normal person (e.g. a biologist or other researcher) wants to calculate some
-  polygenic scores using a web browser
-
-To simplify the second use case, the workflow is designed to be launched
-programmatically on a `private cloud`_ using an API. API parameters are specified
-using JSON. The web platform is still under development.
-
-.. _private cloud: http://www.embassycloud.org/
-
-Specifying target genomes with JSON
------------------------------------
-
-.. literalinclude:: ../../assets/api_examples/input.json
-  :language: JSON
-
-             
-Specifying workflow parameters with JSON
-----------------------------------------
-
-.. literalinclude:: ../assets/api_examples/params.json
-  :language: JSON
-
-Complete API call
------------------
-
-.. literalinclude:: ../../assets/api_examples/call.json
-
-API call schema
----------------------------
-
-.. jsonschema:: ../../assets/schema_k8s.json
-
-Implementation details
-----------------------
-
-The API is designed using an event-driven approach with `Argo
-Events`_. Briefly, a sensor constantly listens on a Kubernetes cluster for Kafka
-messages to launch the pipeline. Once the message is received, a nextflow driver
-pod is created and the workflow is executed using the `K8S executor`_. The
-status of the workflow instance is reported using Nextflow's `weblog`_ and
-a second sensor. We didn't have Nextflow Tower at the time.
-
-.. _Argo Events: https://argoproj.github.io/argo-events/
-.. _K8S executor: https://www.nextflow.io/docs/latest/kubernetes.html
-.. _weblog: https://www.nextflow.io/docs/latest/tracing.html#weblog-via-http
-
diff --git a/docs/reference/index.rst b/docs/reference/index.rst
index 7b60d7f2..488ecbad 100644
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@@ -8,4 +8,3 @@ Reference guides
 
    input
    params
-   api   
diff --git a/docs/reference/input.rst b/docs/reference/input.rst
index ad7ec11f..16f03bfe 100644
--- a/docs/reference/input.rst
+++ b/docs/reference/input.rst
@@ -1,8 +1,10 @@
-Input schema
-============
+Input (samplesheet) schema
+==========================
 
-The documentation below is automatically generated from the input schema and
-contains additional technical detail. 
+The documentation below is automatically generated from the schema. The JSON
+file contains additional technical detail not shown in the table below.
 
-.. jsonschema:: ../../assets/schema_input.json
-.. _`example`: https://github.com/PGScatalog/pgsc_calc/blob/master/assets/api_examples/input.json
+Each row in a samplesheet can only have a single genomic data format (i.e. the
+formats are mutually exclusive).
+
+.. jsonschema:: ../../assets/schemas/target_genomes.json
diff --git a/docs/reference/params.rst b/docs/reference/params.rst
index ff62d9d6..e2c51d6c 100644
--- a/docs/reference/params.rst
+++ b/docs/reference/params.rst
@@ -6,4 +6,13 @@ Parameter reference
 The documentation below is automatically generated from the input schema and
 contains additional technical detail.
 
-.. jsonschema:: ../../nextflow_schema.json
+**Parameters in bold** are required and must be set by the user.
+
+.. jsonschema:: ../../nextflow_schema.json 
+    :lift_description:
+    :lift_definitions:
+    :auto_target:
+    :auto_reference:       
+ 
+
+ 
diff --git a/docs/explanation/troubleshooting.rst b/docs/troubleshooting.rst
similarity index 98%
rename from docs/explanation/troubleshooting.rst
rename to docs/troubleshooting.rst
index f824f5bb..ea3e87c5 100644
--- a/docs/explanation/troubleshooting.rst
+++ b/docs/troubleshooting.rst
@@ -1,3 +1,5 @@
+:orphan:
+   
 .. _troubleshoot:
 
 Troubleshooting
diff --git a/modules/local/score_report.nf b/modules/local/score_report.nf
new file mode 100644
index 00000000..c634baef
--- /dev/null
+++ b/modules/local/score_report.nf
@@ -0,0 +1,34 @@
+process SCORE_REPORT {  // Renders the R Markdown report template with R and records the R version in versions.yml
+    label 'process_high_memory'
+
+    conda (params.enable_conda ? "conda-forge::r-tidyverse=1.3.1 conda-forge::r-rsqlite=2.1.1" : null)  // conda env is only requested when params.enable_conda is set
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'oras://dockerhub.ebi.ac.uk/gdp-public/pgsc_calc/singularity/mulled-v2-e5054a4b868f4ffd21311d4e05426694e2c7fb5e:17fe01267c936fedcbd51470941b075c42b08c23-0' :
+        'dockerhub.ebi.ac.uk/gdp-public/pgsc_calc/mulled-v2-e5054a4b868f4ffd21311d4e05426694e2c7fb5e:17fe01267c936fedcbd51470941b075c42b08c23-0' }"
+
+    input:
+    path scorefiles  // staged into the task directory; not referenced by name in the script below
+    path report      // R Markdown template; copied to report.rmd and rendered
+    path logo        // staged only; presumably read by the report template -- TODO confirm
+    path db          // staged only; presumably read by the report template -- TODO confirm
+
+    output:
+    path "*.html"      , emit: report    // presumably the rendered report -- TODO confirm the file name the template produces
+    path "*.txt"       , emit: scores    // not written by the script itself; presumably created while rendering -- TODO confirm
+    path "versions.yml", emit: versions  // written by the heredoc at the end of the script
+
+    script:
+    def args = task.ext.args ?: ''  // NOTE(review): args is never interpolated into the script below
+    """
+    # dumb workaround symlink & out_dir (rmarkdown)
+    cp $report report.rmd
+
+    R -e 'rmarkdown::render("report.rmd", \
+        output_options = list(self_contained=TRUE))'
+
+    cat <<-END_VERSIONS > versions.yml
+    ${task.process.tokenize(':').last()}:
+        R: \$(echo \$(R --version 2>&1) | head -n 1 | cut -f 3 -d ' ')
+    END_VERSIONS
+    """
+}
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 7324a925..d233a00b 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -13,19 +13,14 @@
             "properties": {
                 "input": {
                     "type": "string",
-                    "format": "file-path",
-                    "mimetype": "text/csv",
-                    "schema": "assets/schema_input.json",
-                    "description": "Path to comma-separated file containing information about the samples (a sample sheet)",
-                    "help_text": "You will need to create a design file with information about the samples before running the pipeline. Use this parameter to specify its location. See usage docs for a more detailed explanation.",
-                    "fa_icon": "fas fa-file-csv",
-                    "pattern": ".csv$|.json$"
+                    "default": "None",
+                    "description": "Path to input samplesheet"
                 },
                 "format": {
                     "type": "string",
                     "default": "csv",
                     "fa_icon": "fas fa-cog",
-                    "description": "Format of the input",
+                    "description": "Format of input samplesheet",
                     "enum": [
                         "csv",
                         "json"
@@ -38,15 +33,26 @@
                 },
                 "accession": {
                     "type": "string",
-                    "description": "A PGS Catalog accession",
+                    "description": "A PGS Catalog accession; multiple accessions can be specified in a comma-separated list",
                     "pattern": "(PGS\\d+)(,\\s*PGS\\d+)*",
                     "fa_icon": "fas fa-star"
                 },
-                "allelic_freq": {
+                "target_build": {
                     "type": "string",
-                    "default": "NO_FILE",
-                    "description": "(Optional) Path to a plink 2 allelic frequency file (see --read-freq)",
-                    "fa_icon": "fas fa-file-alt"
+                    "enum": [
+                        "GRCh37",
+                        "GRCh38"
+                    ],
+                    "description": "Genome build of input data"
+                },
+                "ref": {
+                    "type": "string",
+                    "default": "https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/reference_data/pgsc_calc_ref.sqlar",
+                    "description": "Path to reference database"
+                },
+                "copy_genomes": {
+                    "type": "boolean",
+                    "description": "Copy harmonised genomes to outdir"
                 },
                 "outdir": {
                     "type": "string",
@@ -64,7 +70,8 @@
             },
             "required": [
                 "input",
-                "format"
+                "format",
+                "target_build"
             ]
         },
         "compatibility_options": {
@@ -73,11 +80,32 @@
             "description": "",
             "default": "",
             "properties": {
+                "liftover": {
+                    "type": "boolean",
+                    "description": "Lift scoring files to match your target genomes. Requires build information in the header of the scoring files."
+                },
+                "min_lift": {
+                    "type": "number",
+                    "default": 0.95,
+                    "description": "Minimum proportion of variants required to successfully remap a scoring file to a different genome build",
+                    "minimum": 0,
+                    "maximum": 1
+                },
+                "keep_multiallelic": {
+                    "type": "boolean",
+                    "description": "Keep multiallelic variants?"
+                },
+                "keep_ambiguous": {
+                    "type": "boolean",
+                    "description": "Keep ambiguous matches"
+                },
                 "min_overlap": {
                     "type": "number",
                     "default": 0.75,
                     "description": "Minimum proportion of variants present in both the score file and input target genomic data",
-                    "fa_icon": "fas fa-cog"
+                    "fa_icon": "fas fa-cog",
+                    "minimum": 0,
+                    "maximum": 1
                 }
             },
             "fa_icon": "fas fa-user-cog"
@@ -273,22 +301,5 @@
         {
             "$ref": "#/definitions/generic_options"
         }
-    ],
-    "properties": {
-        "liftover": {
-            "type": "boolean"
-        },
-        "target_build": {
-            "type": "string"
-        },
-        "only_input": {
-            "type": "boolean"
-        },
-        "only_compatible": {
-            "type": "boolean"
-        },
-        "only_score": {
-            "type": "boolean"
-        }
-    }
-}
\ No newline at end of file
+    ]
+}