From faeabdf35a040aba7216af390aa261cd10b7f822 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Thu, 28 Mar 2024 13:45:17 -0700 Subject: [PATCH] ingest/config.yaml: Add all NCBI Datasets fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provides an easy way for first-time users to get the uncurated metadata from NCBI Datasets commands by running the ingest workflow with the specified target `data/ncbi_dataset_report.tsv`. Afterwards, users can easily remove fields that are not needed as part of the workflow to reduce the file size and save space. Prompted by @jameshadfield in review of the tutorial¹ and resolves https://github.com/nextstrain/pathogen-repo-guide/issues/30. ¹ https://github.com/nextstrain/docs.nextstrain.org/pull/195#discussion_r1540226016 --- ingest/README.md | 16 +++++++++++ ingest/defaults/config.yaml | 56 ++++++++++++++++++++++++++++++------- 2 files changed, 62 insertions(+), 10 deletions(-) diff --git a/ingest/README.md b/ingest/README.md index 079fc78..4c4a4cd 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -25,6 +25,22 @@ This produces the default outputs of the ingest workflow: - metadata = results/metadata.tsv - sequences = results/sequences.fasta +### Uncurated metadata + +To see the uncurated metadata from NCBI Datasets, you can specify the intermediate +target of the workflow with: + +``` +nextstrain build ingest data/ncbi_dataset_report.tsv --notemp +``` + +> [!Note] +> This outputs the computer-friendly mnemonics instead of the human-readable +> names of the [NCBI Datasets fields](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields). + +Inspect the `ingest/data/ncbi_dataset_report.tsv` file to see the uncurated +metadata from NCBI Datasets. + ## Defaults The defaults directory contains all of the default configurations for the ingest workflow. 
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index f7e7b64..96d6ccf 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -13,24 +13,60 @@ ncbi_taxon_id: "" # The list of NCBI Datasets fields to include from NCBI Datasets output # These need to be the mneumonics of the NCBI Datasets fields, see docs for full list of fields # https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields +# +# The default includes all available fields to be able to easily see the +# uncurated metadata by running the workflow with the target `data/ncbi_dataset_report.tsv` +# Remove any fields that are not needed in your workflow to reduce file size and save space. # Note: the "accession" field MUST be provided to match with the sequences ncbi_datasets_fields: - accession - - sourcedb - - sra-accs - - isolate-lineage - - geo-region + - bioprojects + - biosample-acc + - completeness + - gene-count - geo-location - - isolate-collection-date - - release-date - - update-date - - length + - geo-region + - host-common-name + - host-infraspecific-breed + - host-infraspecific-cultivar + - host-infraspecific-ecotype + - host-infraspecific-isolate + - host-infraspecific-sex + - host-infraspecific-strain - host-name + - host-pangolin + - host-tax-id + - is-annotated + - is-complete + - is-lab-host + - is-vaccine-strain + - isolate-collection-date + - isolate-lineage - isolate-lineage-source - - biosample-acc - - submitter-names + - lab-host + - length + - matpeptide-count + - mol-type + - nucleotide-completeness + - protein-count + - purpose-of-sampling + - release-date + - sourcedb + - sra-accs - submitter-affiliation - submitter-country + - submitter-names + - update-date + - virus-common-name + - virus-infraspecific-breed + - virus-infraspecific-cultivar + - virus-infraspecific-ecotype + - virus-infraspecific-isolate + - virus-infraspecific-sex + - virus-infraspecific-strain + - 
virus-name + - virus-pangolin + - virus-tax-id # Config parameters related to the curate pipeline curate: