nextstrain · j23414 · Nov 17, 2022 · Mar 28, 2023 · Nov 17, 2022 · Nov 17, 2022
diff --git a/.gitignore b/.gitignore
@@ -10,6 +10,8 @@ environment*
 
 # Snakemake state dir
 /.snakemake
+ingest/.snakemake
+ingest/logs
 
 # Local config overrides
 /config_local.yaml

diff --git a/README.md b/README.md
@@ -54,7 +54,14 @@ This build starts by pulling preprocessed sequence and metadata files from:
 * https://data.nextstrain.org/files/dengue/sequences_denv4.fasta.zst
 * https://data.nextstrain.org/files/dengue/metadata_denv4.tsv.zst
 
-The above datasets have been preprocessed and cleaned from GenBank and are updated at regular intervals. 
+The above datasets have been preprocessed and cleaned from GenBank and are updated at regular intervals by the workflow in the `ingest` folder, which can also be run directly:
+
+```
+nextstrain build ingest
+
+# Upload final dataset and trigger slack notifications
+nextstrain build ingest --configfiles config/config.yaml config/optional.yaml
+```
 
 ### Using example data
 

diff --git a/Snakefile b/Snakefile
@@ -1,9 +1,34 @@
+from snakemake.utils import min_version
+min_version("6.0")  # presumably required for the `module` statement used further down — confirm
+
+# Use default pathogen build config if no configs are provided
+if not config:
+    configfile: "config/config_dengue.yaml"
+# Use default ingest config if no `transform` config is provided
+if not config.get("transform"):
+    configfile: "ingest/config/config.yaml"
+
+# Add the hard-coded ingest basedir to the workflow config so that we can
+# pass it to the module ingest workflow. This will allow shell scripts to
+# use the proper paths for local script invocation since we cannot set the
+# workdir separately for module workflows.
+# This workaround is based on https://stackoverflow.com/a/66890412
+config["ingest_basedir"] = f"{workflow.current_basedir}/ingest"
+
 serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4']
 
 rule all:
     input:
         auspice_json = expand("auspice/dengue_{serotype}.json", serotype=serotypes)
 
+module ingest_workflow:
+    snakefile:
+        "ingest/Snakefile"
+    config: config  # hand the module this workflow's config, including the ingest_basedir entry set above
+    prefix: "ingest"  # presumably places the module's relative input/output paths under ingest/ (mv_ingest_data reads ingest/data/...) — confirm
+
+use rule * from ingest_workflow as ingest_*
+
 rule files:
     params:
         dropped_strains = "config/dropped_strains.txt",
@@ -52,35 +77,65 @@ def clade_defs(w):
     }
     return defs[w.serotype]
 
-rule download:
-    """Downloading sequences and metadata from data.nextstrain.org"""
-    output:
-        sequences = "data/sequences_{serotype}.fasta.zst",
-        metadata = "data/metadata_{serotype}.tsv.zst"
 
-    params:
-        sequences_url = "https://data.nextstrain.org/files/dengue/sequences_{serotype}.fasta.zst",
-        metadata_url = "https://data.nextstrain.org/files/dengue/metadata_{serotype}.tsv.zst"
+rule mv_ingest_data:
+    input:
+        sequences="ingest/data/sequences_{serotype}.fasta.zst",
+        metadata="ingest/data/metadata_{serotype}.tsv.zst",
+    output:
+        sequences="data/sequences_{serotype}.fasta.zst",
+        metadata="data/metadata_{serotype}.tsv.zst",
     shell:
         """
-        curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
-        curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata}
+        mv {input.sequences} {output.sequences}
+        mv {input.metadata} {output.metadata}
         """
 
+if config.get("s3_src"):  # optional: only when an s3_src base URL is configured
+    ruleorder: download > mv_ingest_data
+
+    rule download:
+        """Downloading sequences and metadata from the configured s3_src URL (same outputs as mv_ingest_data, preferred via ruleorder)"""
+        output:
+            sequences = "data/sequences_{serotype}.fasta.zst",
+            metadata = "data/metadata_{serotype}.tsv.zst"
+
+        params:
+            sequences_url = f"{config.get('s3_src')}/sequences_{{serotype}}.fasta.zst",  # doubled braces keep a literal {serotype} wildcard in the f-string
+            metadata_url = f"{config.get('s3_src')}/metadata_{{serotype}}.tsv.zst"
+        shell:
+            """
+            curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
+            curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata}
+            """
+
 rule decompress:
     """Parsing fasta into sequences and metadata"""
     input:
         sequences = "data/sequences_{serotype}.fasta.zst",
         metadata = "data/metadata_{serotype}.tsv.zst"
     output:
-        sequences = "results/sequences_{serotype}.fasta",
-        metadata = "results/metadata_{serotype}.tsv"
+        sequences = "data/sequences_{serotype}.fasta",
+        metadata = "data/metadata_{serotype}.tsv"
     shell:
         """
         zstd -d -c {input.sequences} > {output.sequences}
         zstd -d -c {input.metadata} > {output.metadata}
         """
 
+rule wrangle_metadata:
+    input:
+        metadata="data/metadata_{serotype}.tsv",
+    output:
+        metadata="results/wrangled_metadata_{serotype}.tsv",  # original `strain` column kept as `strain_original`; new `strain` column derived from the strain_id column
+    params:
+        strain_id=config.get("strain_id_field", "strain"), # e.g. accession. NOTE(review): the "strain" default appears to name a column the rename below has already turned into strain_original — confirm
+    shell:
+        """
+        csvtk -t rename -f strain -n strain_original {input.metadata} \
+          | csvtk -t mutate -f {params.strain_id} -n strain > {output.metadata}
+        """
+
 rule filter:
     """
     Filtering to
@@ -90,8 +145,8 @@ rule filter:
       - excluding strains with missing region, country or date metadata
     """
     input:
-        sequences = "results/sequences_{serotype}.fasta",
-        metadata = "results/metadata_{serotype}.tsv",
+        sequences = "data/sequences_{serotype}.fasta",
+        metadata = "results/wrangled_metadata_{serotype}.tsv",
         exclude = files.dropped_strains
     output:
         sequences = "results/filtered_{serotype}.fasta"
@@ -122,6 +177,8 @@ rule align:
         reference = files.reference
     output:
         alignment = "results/aligned_{serotype}.fasta"
+    params:
+        threads = 1
     shell:
         """
         augur align \
@@ -130,7 +187,7 @@ rule align:
             --output {output.alignment} \
             --fill-gaps \
             --remove-reference \
-            --nthreads 1
+            --nthreads {params.threads}
         """
 
 rule tree:
@@ -158,7 +215,7 @@ rule refine:
     input:
         tree = "results/tree-raw_{serotype}.nwk",
         alignment = "results/aligned_{serotype}.fasta",
-        metadata = "results/metadata_{serotype}.tsv"
+        metadata = "results/wrangled_metadata_{serotype}.tsv"
     output:
         tree = "results/tree_{serotype}.nwk",
         node_data = "results/branch-lengths_{serotype}.json"
@@ -223,7 +280,7 @@ rule traits:
     """
     input:
         tree = "results/tree_{serotype}.nwk",
-        metadata = "results/metadata_{serotype}.tsv"
+        metadata = "results/wrangled_metadata_{serotype}.tsv"
     output:
         node_data = "results/traits_{serotype}.json",
     params:
@@ -262,15 +319,16 @@ rule export:
     """Exporting data files for for auspice"""
     input:
         tree = "results/tree_{serotype}.nwk",
-        metadata = "results/metadata_{serotype}.tsv",
+        metadata = "results/wrangled_metadata_{serotype}.tsv",
         branch_lengths = "results/branch-lengths_{serotype}.json",
         traits = "results/traits_{serotype}.json",
         clades = "results/clades_{serotype}.json",
         nt_muts = "results/nt-muts_{serotype}.json",
         aa_muts = "results/aa-muts_{serotype}.json",
         auspice_config = files.auspice_config
     output:
-        auspice_json = "auspice/dengue_{serotype}.json"
+        auspice_json = "results/raw_dengue_{serotype}.json",
+        root_sequence = "results/raw_dengue_{serotype}_root-sequence.json",
     shell:
         """
         augur export v2 \
@@ -282,6 +340,26 @@ rule export:
             --output {output.auspice_json}
         """
 
+rule final_strain_name:
+    input:
+        auspice_json="results/raw_dengue_{serotype}.json",
+        metadata="results/wrangled_metadata_{serotype}.tsv",
+        root_sequence="results/raw_dengue_{serotype}_root-sequence.json",
+    output:
+        auspice_json="auspice/dengue_{serotype}.json",  # raw export with tree node names swapped by bin/set_final_strain_name.py
+        root_sequence="auspice/dengue_{serotype}_root-sequence.json",  # plain copy of the raw root-sequence file
+    params:
+        display_strain_field=config.get("display_strain_field", "strain"),  # metadata column passed as --display-strain-name
+    shell:
+        """
+        python3 bin/set_final_strain_name.py \
+            --metadata {input.metadata} \
+            --input-auspice-json {input.auspice_json} \
+            --display-strain-name {params.display_strain_field} \
+            --output {output.auspice_json}
+        cp {input.root_sequence} {output.root_sequence}
+        """
+
 rule clean:
     """Removing directories: {params}"""
     params:

diff --git a/bin/set_final_strain_name.py b/bin/set_final_strain_name.py
@@ -0,0 +1,36 @@
+import pandas as pd
+import json, argparse
+
+def replace_name_recursive(node, lookup):
+    """Rename in place every node at or below `node` whose "name" is a key of the dict `lookup`; returns None."""
+    pending = [node]  # explicit stack instead of recursion, so very deep trees cannot hit the recursion limit
+    while pending:
+        current = pending.pop()
+        current["name"] = lookup.get(current["name"], current["name"])
+        pending.extend(current.get("children", ()))
+
+if __name__=="__main__":  # CLI entry point: rename Auspice tree nodes using a metadata column
+    parser = argparse.ArgumentParser(
+        description="Swaps out the strain names in the Auspice JSON with the final strain name",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument('--input-auspice-json', type=str, required=True, help="input auspice_json")
+    parser.add_argument('--metadata', type=str, required=True, help="input data")
+    parser.add_argument('--display-strain-name', type=str, required=True, help="field to use as strain name in auspice")
+    parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
+    args = parser.parse_args()
+
+    metadata = pd.read_csv(args.metadata, sep='\t')
+    # Map each tree node name (the `strain` column) to its value in the display column.
+    # A missing display value falls back to the row's own strain id, so the node keeps a
+    # unique name rather than every such node being renamed to the column's name.
+    name_lookup = dict(zip(metadata['strain'], metadata[args.display_strain_name].fillna(metadata['strain'])))
+
+    with open(args.input_auspice_json, 'r') as fh:
+        data = json.load(fh)
+
+    replace_name_recursive(data['tree'], name_lookup)
+
+    with open(args.output, 'w') as fh:
+        json.dump(data, fh)
diff --git a/config/config_dengue.yaml b/config/config_dengue.yaml
@@ -0,0 +1,3 @@
+strain_id_field: "accession"  # metadata column copied into `strain` by the wrangle_metadata rule
+display_strain_field: "strain_original"  # metadata column used for node names in the final Auspice JSON
+# s3_src: 'https://data.nextstrain.org/files/dengue'  # uncomment to download inputs from this URL instead of using the ingest output
diff --git a/config/dropped_strains.txt b/config/dropped_strains.txt
@@ -1,69 +1,41 @@
-DENV/SPAIN/EEB17/2009
-DENV1/FRANCE/00475/2008
-DENV1/MALAYSIA/P1244/1972
-DENV1/VIETNAM/BIDV3990/2008
-DENV1/VIETNAM/BIDV992/2006
-DENV2/AUSTRALIA/QML22/2015
-DENV2/BURKINA_FASO/DAKAR2039/1980
-DENV2/BURKINA_FASO/DAKARA2022/1980
-DENV2/COTE_D_IVOIRE/DAKAR510/1980
-DENV2/COTE_D_IVOIRE/DAKAR578/1980
-DENV2/COTE_D_IVOIRE/DAKARA1247/1980
-DENV2/GUINEA/PM33974/1981
-DENV2/HAITI/DENGUEVIRUS2HOMOSAPIENS1/2016
-DENV2/MALAYSIA/DKD811/2008
-DENV2/MALAYSIA/P81407/1970
-DENV2/MALAYSIA/SAB/2015
-DENV2/NIGERIA/IBH11208/1966
-DENV2/NIGERIA/IBH11234/1966
-DENV2/NIGERIA/IBH11664/1966
-DENV2/SENEGAL/0674/1970
-DENV2/SENEGAL/DAKAR0761/1974
-DENV2/SENEGAL/DAKAR141069/1999
-DENV2/SENEGAL/DAKAR141070/1999
-DENV2/SENEGAL/DAKARD75505/1999
-DENV2/TRINIDAD_AND_TOBAGO/NA/1953
-DENV4/MALAYSIA/P215/1975
-DENV4/MALAYSIA/P514/1975
-DENV4/MALAYSIA/P731120/1973
-D2Sab2015 # miscategorized
-QML22 # miscategorized
-DAK_Ar_A1247 # sylvatic
-Dak_Ar_2039 # sylvatic
-Dak_Ar_578 # sylvatic
-DAK_Ar_510 # sylvatic
-PM33974 # sylvatic
-Dak_Ar_A2022 # sylvatic
-Dak_Ar_141069 # sylvatic
-Dak_Ar_141070 # sylvatic
-Dak_Ar_D75505 # sylvatic
-Dak_HD_10674 # sylvatic
-Dak_Ar_D20761 # sylvatic
-IBH11664 # sylvatic
-IBH11208 # sylvatic
-IBH11234 # sylvatic
-P8_1407 # sylvatic
-P75_514 # sylvatic
-P73_1120 # sylvatic
-P75_215 # sylvatic
-DKD811 # sylvatic
-ZS01/01 # metadata issue
-Vero # cell line
-MS13002673 # too divergent
-MS11011405 # too divergent
-V43257 # too divergent
-KDC0574A2_06/02/2011 # too divergent
-00178/03 # too divergent
-00759/12 # too divergent
-00988/11 # too divergent
-01113/10 # too divergent
-01224/04 # too divergent
-01231/10 # too divergent
-01488/09 # too divergent
-01542/04 # too divergent
-dev1 # too divergent
-DKE_121 # too divergent
-SENDAK_HD_10674 # sylvatic
-DENV2_1_DAK_HD_76395 # sylvatic
-DENV3/PUERTORICO/1963/PRS_228762_AC27 # too divergent
-PR_6 # too divergent
+KY923048 # D2Sab2015 # miscategorized
+KX274130 # QML22 # miscategorized
+EF105383 # DAK_Ar_A1247 # sylvatic
+EF105382 # Dak_Ar_2039 # sylvatic
+EF105380 # Dak_Ar_578 # sylvatic
+EF105381 # DAK_Ar_510 # sylvatic
+EF105378 # PM33974 # sylvatic
+EF105386 # Dak_Ar_A2022 # sylvatic
+EF105389 # Dak_Ar_141069 # sylvatic
+EF105390 # Dak_Ar_141070 # sylvatic
+EF457904 # Dak_Ar_D75505 # sylvatic
+EF105384 # Dak_HD_10674 # sylvatic
+EF105385 # Dak_Ar_D20761 # sylvatic
+EF105388 # IBH11664 # sylvatic
+EF105387 # IBH11208 # sylvatic
+EU003591 # IBH11234 # sylvatic
+EF105379 # P8_1407 # sylvatic
+JF262779 # P75_514 # sylvatic
+JF262780 # P73_1120 # sylvatic
+EF457906 # P75_215 # sylvatic
+FJ467493 # DKD811 # sylvatic
+EF051521 # ZS01/01 # metadata issue
+MT929160 # Vero # cell line
+MH048676 # MS13002673 # too divergent
+MH048674 # MS11011405 # too divergent
+MT597439 # V43257 # too divergent
+MN448607 # KDC0574A2_06/02/2011 # too divergent
+ON046268 # 00178/03 # too divergent
+ON046278 # 00759/12 # too divergent
+ON046276 # 00988/11 # too divergent
+ON046273 # 01113/10 # too divergent
+ON046270 # 01224/04 # too divergent
+ON046274 # 01231/10 # too divergent
+ON046272 # 01488/09 # too divergent
+ON046271 # 01542/04 # too divergent
+MZ284953 # dev1 # too divergent
+MZ215848 # DKE_121 # too divergent
+MW946564 # SENDAK_HD_10674 # sylvatic
+OK605757 # DENV2_1_DAK_HD_76395 # sylvatic
+MW945427 # DENV3/PUERTORICO/1963/PRS_228762_AC27 # too divergent
+OM258630 # PR_6 # too divergent
diff --git a/example_data/sequences_all.fasta.zst b/example_data/sequences_all.fasta.zst
diff --git a/example_data/sequences_denv1.fasta.zst b/example_data/sequences_denv1.fasta.zst
diff --git a/example_data/sequences_denv2.fasta.zst b/example_data/sequences_denv2.fasta.zst
diff --git a/example_data/sequences_denv3.fasta.zst b/example_data/sequences_denv3.fasta.zst
diff --git a/example_data/sequences_denv4.fasta.zst b/example_data/sequences_denv4.fasta.zst