refactor: serotypes

Since serotype is annotated as a column in metadata, simplify intermediate filenames like `data/sequences_{serotype}.fasta` and `data/metadata_{serotype}.tsv` to `data/sequences.fasta` and `data/metadata.tsv`.
nextstrain · Dec 16, 2022 · fc12467
1 parent 370f6b8
commit fc12467
Show file tree

Hide file tree

Showing 5 changed files with 22 additions and 28 deletions.
diff --git a/ingest/README.md b/ingest/README.md
@@ -10,7 +10,7 @@ This is the ingest pipeline for Ebola virus sequences.
 Fetch sequences with
 
 ```sh
-nextstrain build --cpus 1 . data/sequences_all.ndjson
+nextstrain build --cpus 1 . data/sequences.ndjson
 ```
 
 Run the complete ingest pipeline with
@@ -21,8 +21,8 @@ nextstrain build --cpus 1 .
 
 This will produce two files (within the `ingest` directory):
 
-- data/metadata_all.tsv
-- data/sequences_all.fasta
+- data/metadata.tsv
+- data/sequences.fasta
 
 Run the complete ingest pipeline and upload results to AWS S3 with
 

diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -9,7 +9,7 @@ serotypes = ['all']
 
 def _get_all_targets(wildcards):
     # Default targets are the metadata TSV and sequences FASTA files
-    all_targets = expand(["data/sequences_{serotype}.fasta.zst", "data/metadata_{serotype}.tsv.zst"], serotype=serotypes)
+    all_targets = ["data/sequences.fasta.zst", "data/metadata.tsv.zst"]
 
     # Add additional targets based on upload config
     upload_config = config.get("upload", {})

diff --git a/ingest/config/optional.yaml b/ingest/config/optional.yaml
@@ -7,13 +7,13 @@ upload:
     dst: 's3://nextstrain-data/files/workflows/ebola/test'
     # Files to upload to S3 that are in the `data` directory
     files_to_upload: [
-      'metadata_all.tsv.zst',
-      'sequences_all.fasta.zst',
+      'metadata.tsv.zst',
+      'sequences.fasta.zst',
     ]
     # Remote file names for the files to upload, must be in the same order as local files above
     remote_file_names: [
-      'metadata_all.tsv.zst',
-      'sequences_all.fasta.zst',
+      'metadata.tsv.zst',
+      'sequences.fasta.zst',
     ]
     cloudfront_domain: 'data.nextstrain.org'
 

diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -13,17 +13,11 @@ Produces final output as
 
 """
 
-def download_serotype(w):
-    serotype = {
-        'all': '186536', # Returns 3530 records, check if we need a more specific Taxon ID
-    }
-    return serotype[w.serotype]
-
 rule fetch_from_genbank:
     output:
-        genbank_ndjson=temp("data/genbank_{serotype}.ndjson"),
+        genbank_ndjson=temp("data/genbank.ndjson"),
     params:
-        serotype_tax_id=download_serotype,
+        serotype_tax_id='186536', # Returns 3530 records, check if we need a more specific Taxon ID
         csv_to_ndjson_url="https://raw.githubusercontent.com/nextstrain/monkeypox/master/ingest/bin/csv-to-ndjson",
         fetch_from_genbank_url="https://raw.githubusercontent.com/nextstrain/dengue/new_ingest/ingest/bin/fetch-from-genbank",
         genbank_url_url="https://raw.githubusercontent.com/nextstrain/dengue/new_ingest/ingest/bin/genbank-url", # Update if dengue merged
@@ -43,14 +37,14 @@ rule fetch_from_genbank:
 
 
 def _get_all_sources(wildcards):
-    return [f"data/{source}_{wildcards.serotype}.ndjson" for source in config["sources"]]
+    return [f"data/{source}.ndjson" for source in config["sources"]]
 
 
 rule fetch_all_sequences:
     input:
         all_sources=_get_all_sources,
     output:
-        sequences_ndjson=temp("data/sequences_{serotype}.ndjson"),
+        sequences_ndjson=temp("data/sequences.ndjson"),
     shell:
         """
         cat {input.all_sources} > {output.sequences_ndjson}

diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk
@@ -38,13 +38,13 @@ rule concat_geolocation_rules:
 
 rule transform:
     input:
-        sequences_ndjson="data/sequences_{serotype}.ndjson",
+        sequences_ndjson="data/sequences.ndjson",
         all_geolocation_rules="data/all-geolocation-rules.tsv",
     output:
-        metadata=temp("data/raw_metadata_{serotype}.tsv"),
-        sequences="data/sequences_{serotype}.fasta",
+        metadata=temp("data/raw_metadata.tsv"),
+        sequences="data/sequences.fasta",
     log:
-        "logs/transform_{serotype}.txt",
+        "logs/transform.txt",
     params:
         field_map=config["transform"]["field_map"],
         strain_regex=config["transform"]["strain_regex"],
@@ -123,9 +123,9 @@ rule transform:
 
 rule post_process_metadata:
     input:
-        metadata="data/raw_metadata_{serotype}.tsv",
+        metadata="data/raw_metadata.tsv",
     output:
-        metadata="data/metadata_{serotype}.tsv",
+        metadata="data/metadata.tsv",
     params:
         post_process_metadata_url="https://raw.githubusercontent.com/nextstrain/zika/ingest/ingest/bin/post_process_metadata.py",
 
@@ -144,11 +144,11 @@ rule post_process_metadata:
 
 rule compress:
     input:
-        sequences="data/sequences_{serotype}.fasta",
-        metadata="data/metadata_{serotype}.tsv",
+        sequences="data/sequences.fasta",
+        metadata="data/metadata.tsv",
     output:
-        sequences="data/sequences_{serotype}.fasta.zst",
-        metadata="data/metadata_{serotype}.tsv.zst",
+        sequences="data/sequences.fasta.zst",
+        metadata="data/metadata.tsv.zst",
     shell:
         """
         zstd -T0 -o {output.sequences} {input.sequences}