From bc7b26a6de6a6d19c274c047aac4b4798d52429e Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Tue, 24 May 2022 14:53:59 +0100 Subject: [PATCH] update docs --- assets/schemas/target_genomes.json | 10 +-- docs/_templates/globaltoc.html | 1 + docs/changelog.rst | 21 ++++++ docs/explanation/calculating.rst | 2 - docs/explanation/goodpractices.rst | 2 - docs/explanation/index.rst | 9 +-- docs/explanation/output.rst | 4 ++ docs/explanation/packaging.rst | 4 -- docs/explanation/platform.rst | 2 - docs/getting-started.rst | 2 +- docs/glossary.rst | 15 ---- docs/how-to/bigjob.rst | 8 +-- docs/how-to/index.rst | 7 -- docs/how-to/interpret.rst | 13 ---- docs/index.rst | 2 +- docs/reference/api.rst | 53 --------------- docs/reference/index.rst | 1 - docs/reference/input.rst | 14 ++-- docs/reference/params.rst | 11 ++- docs/{explanation => }/troubleshooting.rst | 2 + modules/local/score_report.nf | 34 ++++++++++ nextflow_schema.json | 79 ++++++++++++---------- 22 files changed, 136 insertions(+), 160 deletions(-) delete mode 100644 docs/explanation/calculating.rst delete mode 100644 docs/explanation/goodpractices.rst create mode 100644 docs/explanation/output.rst delete mode 100644 docs/explanation/packaging.rst delete mode 100644 docs/explanation/platform.rst delete mode 100644 docs/how-to/interpret.rst delete mode 100644 docs/reference/api.rst rename docs/{explanation => }/troubleshooting.rst (98%) create mode 100644 modules/local/score_report.nf diff --git a/assets/schemas/target_genomes.json b/assets/schemas/target_genomes.json index a1bcb884..516506be 100644 --- a/assets/schemas/target_genomes.json +++ b/assets/schemas/target_genomes.json @@ -12,12 +12,12 @@ "sample": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces" + "description": "Sample name must be provided and cannot contain spaces" }, "vcf_path": { "type": "string", "pattern": "^\\S+\\.vcf\\.gz$", - "errorMessage": "VCF path must end with 
.vcf.gz, mutually exclusive with other formats", + "description": "VCF path must end with .vcf.gz, mutually exclusive with other formats", "anyOf": [ { "type": "string", @@ -31,7 +31,7 @@ ] }, "bfile_path": { - "errorMessage": "Plink 1 binary fileset prefix, must not end with bed / bim / fam, mutually exclusive with other formats", + "description": "Plink 1 binary fileset prefix, must not end with bed / bim / fam, mutually exclusive with other formats", "anyOf": [ { "type": "string", @@ -45,7 +45,7 @@ ] }, "pfile_path": { - "errorMessage": "Plink 2 binary fileset prefix, must not end with pvar / pgen / psam, mutually exclusive with other formats. Must not be zstd compressed.", + "description": "Plink 2 binary fileset prefix, must not end with pvar / pgen / psam, mutually exclusive with other formats. Must not be zstd compressed.", "anyOf": [ { "type": "string", @@ -59,7 +59,7 @@ ] }, "chrom": { - "errorMessage": "Chromosome of associated genomic data. If multiple chromosomes are in the data, set to an empty string", + "description": "Chromosome of associated genomic data. If multiple chromosomes are in the data, set to an empty string", "anyOf": [ { "type": "null" diff --git a/docs/_templates/globaltoc.html b/docs/_templates/globaltoc.html index 2dec734d..07be32d0 100644 --- a/docs/_templates/globaltoc.html +++ b/docs/_templates/globaltoc.html @@ -6,6 +6,7 @@

Contents

  • How-to guides
  • Reference guide
  • Explanation
  • +
  • Troubleshooting
  • Glossary
  • diff --git a/docs/changelog.rst b/docs/changelog.rst index 774f5d3c..9b087af5 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -8,6 +8,27 @@ will only occur in major versions with changes noted in this changelog. .. _`semantic versioning`: https://semver.org/ + +pgsc_calc v1.0.0 (2022-05-24) +-------------------------------- + +This release produces scores that should be biologically meaningful. Significant +effort has been made to validate calculated scores on different datasets. In the +next release we'll add score validation to our test suite to make sure +calculated scores stay valid in the future. + +Features +~~~~~~~~ + +- Add support for PLINK2 format (samplesheet structure changed) +- Add support for allosomes (e.g. X, Y) +- Improve PGS Catalog compatibility (e.g. missing other allele) +- Add automatic liftover of scoring files to match target genome build +- Performance improvements to support UK BioBank scale data (500,000 genomes) +- Support calculation of multiple scores in parallel +- Significantly improved test coverage (> 80%) +- Lots of other small changes to improve correctness and edge case handling + pgsc_calc v0.1.3dev (2022-02-04) -------------------------------- diff --git a/docs/explanation/calculating.rst b/docs/explanation/calculating.rst deleted file mode 100644 index be1b7e6f..00000000 --- a/docs/explanation/calculating.rst +++ /dev/null @@ -1,2 +0,0 @@ -Calculating polygenic scores -============================ diff --git a/docs/explanation/goodpractices.rst b/docs/explanation/goodpractices.rst deleted file mode 100644 index 4c6ee0c1..00000000 --- a/docs/explanation/goodpractices.rst +++ /dev/null @@ -1,2 +0,0 @@ -Good practices for polygenic score analysis -=========================================== diff --git a/docs/explanation/index.rst b/docs/explanation/index.rst index b8cebf3e..7fa8f3f1 100644 --- a/docs/explanation/index.rst +++ b/docs/explanation/index.rst @@ -3,15 +3,8 @@ Explanation =========== -(These 
explanations haven't been written yet, sorry!) - .. toctree:: :maxdepth: 1 - calculating - packaging - testing - platform - goodpractices - troubleshooting + output diff --git a/docs/explanation/output.rst b/docs/explanation/output.rst new file mode 100644 index 00000000..12d41231 --- /dev/null +++ b/docs/explanation/output.rst @@ -0,0 +1,4 @@ +Understanding workflow output +============================= + + diff --git a/docs/explanation/packaging.rst b/docs/explanation/packaging.rst deleted file mode 100644 index 07a77e6d..00000000 --- a/docs/explanation/packaging.rst +++ /dev/null @@ -1,4 +0,0 @@ -.. _containers: - -Packaging bioinformatics software with containers -================================================= diff --git a/docs/explanation/platform.rst b/docs/explanation/platform.rst deleted file mode 100644 index 938f32c9..00000000 --- a/docs/explanation/platform.rst +++ /dev/null @@ -1,2 +0,0 @@ -Building a web platform for polygenic risk score calculation -============================================================ diff --git a/docs/getting-started.rst b/docs/getting-started.rst index 74859ab8..93a30d9a 100644 --- a/docs/getting-started.rst +++ b/docs/getting-started.rst @@ -56,7 +56,7 @@ workflow with bundled test data: [7c/5cca6c] process > PGSC_CALC:PGSCALC:MAKE_COMPATIBLE:PLINK2_BFILE (cineca_synthetic_subset) [100%] 1 of 1 ✔ [3b/ce0e39] process > PGSC_CALC:PGSCALC:MAKE_COMPATIBLE:MATCH_VARIANTS (cineca_synthetic_subset) [100%] 1 of 1 ✔ [2e/fb3233] process > PGSC_CALC:PGSCALC:APPLY_SCORE:PLINK2_SCORE (cineca_synthetic_subset) [100%] 1 of 1 ✔ - [b5/fc5b1e] process > PGSC_CALC:PGSCALC:APPLY_SCORE:MAKE_REPORT (1) [100%] 1 of 1 ✔ + [b5/fc5b1e] process > PGSC_CALC:PGSCALC:APPLY_SCORE:SCORE_REPORT (1) [100%] 1 of 1 ✔ [03/009cb6] process > PGSC_CALC:PGSCALC:DUMPSOFTWAREVERSIONS (1) [100%] 1 of 1 ✔ -[pgscatalog/pgsc_calc] Pipeline completed successfully- diff --git a/docs/glossary.rst b/docs/glossary.rst index 94a2b642..c037f0ac 100644 --- 
a/docs/glossary.rst +++ b/docs/glossary.rst @@ -40,15 +40,6 @@ Glossary A unique and stable identifier. PGS Catalog accessions start with the prefix PGS, e.g. `PGS000001`_ - driver pod - pod - `A pod`_ is a description of one or more containers and its associated - computing resources (e.g. number of processes and RAM, but it's more - complicated than that). Kubernetes takes this description and tries to - make it exist on the cluster. The driver pod is responsible for managing - a workflow instance. The driver pod will monitor and submit each job in - the workflow as a separate worker pod. - polygenic score A `polygenic score`_ (PGS) aggregates the effects of many genetic variants into a single number which predicts genetic predisposition for a @@ -64,13 +55,7 @@ Glossary create the polygenic scoring file originally (i.e., those used to derive the risk alleles and weights). - worker pods - A pod, managed by the nextflow driver pod, that is responsible for - executing an atomic process in the workflow. They are created and - destroyed automatically by the driver pod. - .. _CSVs are good: https://www.gov.uk/guidance/using-csv-file-format -.. _A pod: https://kubernetes.io/docs/concepts/workloads/pods/ .. _single nucleotide polymorphism: https://en.wikipedia.org/wiki/Single-nucleotide_polymorphism .. _UK BioBank: https://www.ukbiobank.ac.uk/ .. _PGS Catalog: https://www.pgscatalog.org diff --git a/docs/how-to/bigjob.rst b/docs/how-to/bigjob.rst index 5dbbfa7a..98c11e4c 100644 --- a/docs/how-to/bigjob.rst +++ b/docs/how-to/bigjob.rst @@ -116,10 +116,8 @@ Nextflow also supports submitting jobs platforms like: - Google cloud - Azure cloud - Amazon cloud - -Check the `nextflow documentation`_ for configuration specifics. pgsc_calc is -deployed and tested on a `local Kubernetes cluster`_, but it's not a recommended -way of running the pipeline for normal users. +- Kubernetes + +Check the `nextflow documentation`_ for configuration specifics. .. 
_`nextflow documentation`: https://nextflow.io/docs/latest/google.html -.. _`local Kubernetes cluster`: https://github.com/PGScatalog/pgsc_calc/blob/master/conf/k8s.config diff --git a/docs/how-to/index.rst b/docs/how-to/index.rst index af8903ef..12b8857d 100644 --- a/docs/how-to/index.rst +++ b/docs/how-to/index.rst @@ -23,13 +23,6 @@ Making genomic data and scorefiles compatible liftover effecttype -Understanding workflow output ------------------------------ - -.. toctree:: - :maxdepth: 1 - - interpret Working with big data --------------------- diff --git a/docs/how-to/interpret.rst b/docs/how-to/interpret.rst deleted file mode 100644 index fa64cd5c..00000000 --- a/docs/how-to/interpret.rst +++ /dev/null @@ -1,13 +0,0 @@ -.. _interpret: - -How to interpret workflow output -================================ - -The most interesting output is published by default to ``output/make``, -including: - -- A report summarising the score calculation process -- A text file containing aggregated scores - - - diff --git a/docs/index.rst b/docs/index.rst index 891c941a..20414fb5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -47,7 +47,7 @@ The workflow should output: [7c/5cca6c] process > PGSC_CALC:PGSCALC:MAKE_COMPATIBLE:PLINK2_BFILE (cineca_synthetic_subset) [100%] 1 of 1 ✔ [3b/ce0e39] process > PGSC_CALC:PGSCALC:MAKE_COMPATIBLE:MATCH_VARIANTS (cineca_synthetic_subset) [100%] 1 of 1 ✔ [2e/fb3233] process > PGSC_CALC:PGSCALC:APPLY_SCORE:PLINK2_SCORE (cineca_synthetic_subset) [100%] 1 of 1 ✔ - [b5/fc5b1e] process > PGSC_CALC:PGSCALC:APPLY_SCORE:MAKE_REPORT (1) [100%] 1 of 1 ✔ + [b5/fc5b1e] process > PGSC_CALC:PGSCALC:APPLY_SCORE:SCORE_REPORT (1) [100%] 1 of 1 ✔ [03/009cb6] process > PGSC_CALC:PGSCALC:DUMPSOFTWAREVERSIONS (1) [100%] 1 of 1 ✔ -[pgscatalog/pgsc_calc] Pipeline completed successfully- diff --git a/docs/reference/api.rst b/docs/reference/api.rst deleted file mode 100644 index b31e8fa1..00000000 --- a/docs/reference/api.rst +++ /dev/null @@ -1,53 +0,0 @@ -API 
reference -============= - -``pgsc_calc`` has two main use cases: - -- A bioinformatician or data scientist wants to calculate some polygenic scores - using an Unixy operating system and a terminal -- A normal person (e.g. a biologist or other researcher) wants to calculate some - polygenic scores using a web browser - -To simplify the second use case, the workflow is designed to be launched -programmatically on a `private cloud`_ using an API. API parameters are specified -using JSON. The web platform is still under development. - -.. _private cloud: http://www.embassycloud.org/ - -Specifying target genomes with JSON ------------------------------------ - -.. literalinclude:: ../../assets/api_examples/input.json - :language: JSON - - -Specifying workflow parameters with JSON ----------------------------------------- - -.. literalinclude:: ../assets/api_examples/params.json - :language: JSON - -Complete API call ------------------ - -.. literalinclude:: ../../assets/api_examples/call.json - -API call schema ---------------------------- - -.. jsonschema:: ../../assets/schema_k8s.json - -Implementation details ----------------------- - -The API is designed using an event-driven approach with `Argo -Events`_. Briefly, a sensor constantly listens on a Kubernetes cluster for Kafka -messages to launch the pipeline. Once the message is received, a nextflow driver -pod is created and the workflow is executed using the `K8S executor`_. The -status of the workflow instance is reported using Nextflow's `weblog`_ and -a second sensor. We didn't have Nextflow Tower at the time. - -.. _Argo Events: https://argoproj.github.io/argo-events/ -.. _K8S executor: https://www.nextflow.io/docs/latest/kubernetes.html -.. 
_weblog: https://www.nextflow.io/docs/latest/tracing.html#weblog-via-http - diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 7b60d7f2..488ecbad 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -8,4 +8,3 @@ Reference guides input params - api diff --git a/docs/reference/input.rst b/docs/reference/input.rst index ad7ec11f..16f03bfe 100644 --- a/docs/reference/input.rst +++ b/docs/reference/input.rst @@ -1,8 +1,10 @@ -Input schema -============ +Input (samplesheet) schema +========================== -The documentation below is automatically generated from the input schema and -contains additional technical detail. +The documentation below is automatically generated from the schema. The JSON +file contains additional technical detail not shown in the table below. -.. jsonschema:: ../../assets/schema_input.json -.. _`example`: https://github.com/PGScatalog/pgsc_calc/blob/master/assets/api_examples/input.json +Each row in a samplesheet can only have a single genomic data format (i.e. the +format columns are mutually exclusive). + +.. jsonschema:: ../../assets/schemas/target_genomes.json diff --git a/docs/reference/params.rst b/docs/reference/params.rst index ff62d9d6..e2c51d6c 100644 --- a/docs/reference/params.rst +++ b/docs/reference/params.rst @@ -6,4 +6,13 @@ Parameter reference The documentation below is automatically generated from the input schema and contains additional technical detail. -.. jsonschema:: ../../nextflow_schema.json +**Parameters in bold** are required and must be set by the user. + +.. 
jsonschema:: ../../nextflow_schema.json + :lift_description: + :lift_definitions: + :auto_target: + :auto_reference: + + + diff --git a/docs/explanation/troubleshooting.rst b/docs/troubleshooting.rst similarity index 98% rename from docs/explanation/troubleshooting.rst rename to docs/troubleshooting.rst index f824f5bb..ea3e87c5 100644 --- a/docs/explanation/troubleshooting.rst +++ b/docs/troubleshooting.rst @@ -1,3 +1,5 @@ +:orphan: + .. _troubleshoot: Troubleshooting diff --git a/modules/local/score_report.nf b/modules/local/score_report.nf new file mode 100644 index 00000000..c634baef --- /dev/null +++ b/modules/local/score_report.nf @@ -0,0 +1,34 @@ +process SCORE_REPORT { + label 'process_high_memory' + + conda (params.enable_conda ? "conda-forge::r-tidyverse=1.3.1 conda-forge::r-rsqlite=2.1.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'oras://dockerhub.ebi.ac.uk/gdp-public/pgsc_calc/singularity/mulled-v2-e5054a4b868f4ffd21311d4e05426694e2c7fb5e:17fe01267c936fedcbd51470941b075c42b08c23-0' : + 'dockerhub.ebi.ac.uk/gdp-public/pgsc_calc/mulled-v2-e5054a4b868f4ffd21311d4e05426694e2c7fb5e:17fe01267c936fedcbd51470941b075c42b08c23-0' }" + + input: + path scorefiles + path report + path logo + path db + + output: + path "*.html" , emit: report + path "*.txt" , emit: scores + path "versions.yml", emit: versions + + script: + def args = task.ext.args ?: '' + """ + # dumb workaround symlink & out_dir (rmarkdown) + cp $report report.rmd + + R -e 'rmarkdown::render("report.rmd", \ + output_options = list(self_contained=TRUE))' + + cat <<-END_VERSIONS > versions.yml + ${task.process.tokenize(':').last()}: + R: \$(echo \$(R --version 2>&1) | head -n 1 | cut -f 3 -d ' ') + END_VERSIONS + """ +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 7324a925..d233a00b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -13,19 +13,14 @@ "properties": { "input": { "type": "string", - 
"format": "file-path", - "mimetype": "text/csv", - "schema": "assets/schema_input.json", - "description": "Path to comma-separated file containing information about the samples (a sample sheet)", - "help_text": "You will need to create a design file with information about the samples before running the pipeline. Use this parameter to specify its location. See usage docs for a more detailed explanation.", - "fa_icon": "fas fa-file-csv", - "pattern": ".csv$|.json$" + "default": "None", + "description": "Path to input samplesheet" }, "format": { "type": "string", "default": "csv", "fa_icon": "fas fa-cog", - "description": "Format of the input", + "description": "Format of input samplesheet", "enum": [ "csv", "json" @@ -38,15 +33,26 @@ }, "accession": { "type": "string", - "description": "A PGS Catalog accession", + "description": "A PGS Catalog accession, multiple accessions can be specified in a comma separated list", "pattern": "(PGS\\d+)(,\\s*PGS\\d+)*", "fa_icon": "fas fa-star" }, - "allelic_freq": { + "target_build": { "type": "string", - "default": "NO_FILE", - "description": "(Optional) Path to a plink 2 allelic frequency file (see --read-freq)", - "fa_icon": "fas fa-file-alt" + "enum": [ + "GRCh37", + "GRCh38" + ], + "description": "Genome build of input data" + }, + "ref": { + "type": "string", + "default": "https://gitlab.ebi.ac.uk/nebfield/test-datasets/-/raw/master/pgsc_calc/reference_data/pgsc_calc_ref.sqlar", + "description": "Path to reference database" + }, + "copy_genomes": { + "type": "boolean", + "description": "Copy harmonised genomes to outdir" }, "outdir": { "type": "string", @@ -64,7 +70,8 @@ }, "required": [ "input", - "format" + "format", + "target_build" ] }, "compatibility_options": { @@ -73,11 +80,32 @@ "description": "", "default": "", "properties": { + "liftover": { + "type": "boolean", + "description": "Lift scoring files to match your target genomes. Requires build information in the header of the scoring files." 
+ }, + "min_lift": { + "type": "number", + "default": 0.95, + "description": "Minimum proportion of variants required to successfully remap a scoring file to a different genome build", + "minimum": 0, + "maximum": 1 + }, + "keep_multiallelic": { + "type": "boolean", + "description": "Keep multiallelic alleles?" + }, + "keep_ambiguous": { + "type": "boolean", + "description": "Keep ambiguous matches" + }, "min_overlap": { "type": "number", "default": 0.75, "description": "Minimum proportion of variants present in both the score file and input target genomic data", - "fa_icon": "fas fa-cog" + "fa_icon": "fas fa-cog", + "minimum": 0, + "maximum": 1 } }, "fa_icon": "fas fa-user-cog" @@ -273,22 +301,5 @@ { "$ref": "#/definitions/generic_options" } - ], - "properties": { - "liftover": { - "type": "boolean" - }, - "target_build": { - "type": "string" - }, - "only_input": { - "type": "boolean" - }, - "only_compatible": { - "type": "boolean" - }, - "only_score": { - "type": "boolean" - } - } -} \ No newline at end of file + ] +}