From faeabdf35a040aba7216af390aa261cd10b7f822 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Thu, 28 Mar 2024 13:45:17 -0700
Subject: [PATCH] ingest/config.yaml: Add all NCBI Datasets fields
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Provides an easy way for first-time users to get the uncurated metadata
from NCBI Datasets commands by running the ingest workflow with the
specified target `data/ncbi_dataset_report.tsv`.

Afterwards, users can easily remove fields that are not needed as part
of the workflow to reduce the file size and save space.

Prompted by @jameshadfield in review of the tutorial¹ and
resolves https://github.com/nextstrain/pathogen-repo-guide/issues/30.

¹ https://github.com/nextstrain/docs.nextstrain.org/pull/195#discussion_r1540226016
---
 ingest/README.md            | 16 +++++++++++
 ingest/defaults/config.yaml | 56 ++++++++++++++++++++++++++++++-------
 2 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/ingest/README.md b/ingest/README.md
index 079fc78..4c4a4cd 100644
--- a/ingest/README.md
+++ b/ingest/README.md
@@ -25,6 +25,22 @@ This produces the default outputs of the ingest workflow:
 - metadata      = results/metadata.tsv
 - sequences     = results/sequences.fasta
 
+### Uncurated metadata
+
+To see the uncurated metadata from NCBI Datasets, you can specify the intermediate
+target of the workflow with:
+
+```
+nextstrain build ingest data/ncbi_dataset_report.tsv --notemp
+```
+
+> [!Note]
+> This outputs the computer-friendly mnemonics instead of the human-readable
+> names of the [NCBI Dataset fields](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields).
+
+Inspect the `ingest/data/ncbi_dataset_report.tsv` file to see the uncurated
+metadata from NCBI Datasets.
+
 ## Defaults
 
 The defaults directory contains all of the default configurations for the ingest workflow.
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index f7e7b64..96d6ccf 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -13,24 +13,60 @@ ncbi_taxon_id: ""
 # The list of NCBI Datasets fields to include from NCBI Datasets output
 # These need to be the mneumonics of the NCBI Datasets fields, see docs for full list of fields
 # https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
+#
+# The default includes all available fields to be able to easily see the
+# uncurated metadata by running the workflow with the target `data/ncbi_dataset_report.tsv`.
+# Remove any fields that are not needed in your workflow to reduce file size and save space.
 # Note: the "accession" field MUST be provided to match with the sequences
 ncbi_datasets_fields:
   - accession
-  - sourcedb
-  - sra-accs
-  - isolate-lineage
-  - geo-region
+  - bioprojects
+  - biosample-acc
+  - completeness
+  - gene-count
   - geo-location
-  - isolate-collection-date
-  - release-date
-  - update-date
-  - length
+  - geo-region
+  - host-common-name
+  - host-infraspecific-breed
+  - host-infraspecific-cultivar
+  - host-infraspecific-ecotype
+  - host-infraspecific-isolate
+  - host-infraspecific-sex
+  - host-infraspecific-strain
   - host-name
+  - host-pangolin
+  - host-tax-id
+  - is-annotated
+  - is-complete
+  - is-lab-host
+  - is-vaccine-strain
+  - isolate-collection-date
+  - isolate-lineage
   - isolate-lineage-source
-  - biosample-acc
-  - submitter-names
+  - lab-host
+  - length
+  - matpeptide-count
+  - mol-type
+  - nucleotide-completeness
+  - protein-count
+  - purpose-of-sampling
+  - release-date
+  - sourcedb
+  - sra-accs
   - submitter-affiliation
   - submitter-country
+  - submitter-names
+  - update-date
+  - virus-common-name
+  - virus-infraspecific-breed
+  - virus-infraspecific-cultivar
+  - virus-infraspecific-ecotype
+  - virus-infraspecific-isolate
+  - virus-infraspecific-sex
+  - virus-infraspecific-strain
+  - virus-name
+  - virus-pangolin
+  - virus-tax-id
 
 # Config parameters related to the curate pipeline
 curate: