From 4212c40a73409e3f4d50223adec60ea82c749fd5 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Wed, 4 Sep 2024 11:34:25 -0700 Subject: [PATCH] ingest: Merge Nextclade metadata with `augur merge` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This construction reads a bit more clearly and cleanly. It's also a good example of how to use `augur merge`. The limitation of `augur merge` on non-seekable streams means the Nextclade metadata is now written to a temporary file instead of being piped in, so the rule uses additional transient disk space, but that typically shouldn't be an issue. Augur's slow startup time also affects `augur merge` and contributes to a longer rule execution time, but it should be negligible in the context of the larger workflow and presumably we'll fix the slow startup eventually.¹ The output is semantically identical but has some syntactic changes re: quoting. It's worth noting that the pre-existing TSV format was _not_ IANA TSV, despite it (still) being treated as such in a few places, but was (and remains) a CSV-like TSV with some quoted fields (and some mangled quotes², e.g. the "institution" column for accession KJ556895). We really need to sort out our TSV formats³, but that's for a larger project.
¹ ² ³ --- ingest/rules/nextclade.smk | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 39148ca..6f37fe7 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -62,6 +62,7 @@ rule join_metadata_and_nextclade: metadata="data/subset_metadata.tsv", output: metadata="results/metadata.tsv", + nextclade_metadata=temp("results/nextclade_metadata.csv"), params: metadata_id_field=config["curate"]["output_id_field"], nextclade_id_field=config["nextclade"]["id_field"], @@ -75,13 +76,15 @@ rule join_metadata_and_nextclade: --field-map {params.nextclade_field_map:q} \ --output-metadata - \ | tsv-select --header --fields {params.nextclade_fields:q} \ - | tsv-join -H \ - --filter-file - \ - --key-fields {params.nextclade_id_field} \ - --data-fields {params.metadata_id_field} \ - --append-fields '*' \ - --write-all ? \ - {input.metadata} \ - | tsv-select -H --exclude {params.nextclade_id_field} \ - > {output.metadata} - """ + > {output.nextclade_metadata:q} + + augur merge \ + --metadata \ + metadata={input.metadata:q} \ + nextclade={output.nextclade_metadata:q} \ + --metadata-id-columns \ + metadata={params.metadata_id_field:q} \ + nextclade={params.nextclade_id_field:q} \ + --output-metadata {output.metadata:q} \ + --no-source-columns + """