From 3b1685410fbb706b8d1025a76fe43f7eba51e3c0 Mon Sep 17 00:00:00 2001
From: James Hadfield
Date: Fri, 4 Aug 2023 15:49:48 +1200
Subject: [PATCH] DROP! test datasets

A Python script to post-process an nCoV dataset to create annotations
which more closely match the biology. There may be some off-by-one
errors, and it's incomplete, but it's very useful for testing. Of
course we would use Nextclade to actually generate the translations.

P.S. The get-data script is run for Heroku review-apps.
---
 post-process-ncov.py | 156 +++++++++++++++++++++++++++++++++++++++++++
 scripts/get-data.sh  |  74 ++------------------
 2 files changed, 160 insertions(+), 70 deletions(-)
 create mode 100644 post-process-ncov.py

diff --git a/post-process-ncov.py b/post-process-ncov.py
new file mode 100644
index 000000000..8434e8e2c
--- /dev/null
+++ b/post-process-ncov.py
@@ -0,0 +1,156 @@
+import json
+
+NEW_ANNOTATION = {
+    "nuc": {
+        "start": 1,
+        "end": 29903,
+        "strand": "+"
+    },
+    "ORF1ab": {
+        "gene": "ORF1ab",
+        "strand": "+",
+        "segments": [
+            {"start": 266, "end": 13468, "name": "ORF1a"},
+            {"start": 13468, "end": 21555, "name": "ORF1b"}
+        ],
+        "display_name": "AKA polyprotein PP1ab. -1 ribosomal frameshift. Cleaved to yield 15 nonstructural proteins (NSP1-10, 12-16)"
+    },
+    "PP1a": {
+        "gene": "ORF1ab",
+        "start": 266,
+        "end": 13483,
+        "display_name": "Polyprotein PP1a. Cleaved to yield 11 nonstructural proteins (NSP1-11)"
+    },
+    "NSP3": {
+        "gene": "ORF1ab",
+        "color": "#2c7fb8",
+        # PP1ab aa coords 819..2763, converted to 1-based nucleotides
+        "start": 266 + (819-1)*3,
+        "end": 266 + (2763-1)*3 - 1,
+        "display_name": "Cleaved from short + long polyproteins",
+        "strand": "+",
+    },
+    "RdRp": {
+        "gene": "ORF1ab",
+        "color": "#41b6c4",
+        # Length is 2796nt (932aa)
+        "segments": [
+            {  # first segment is before the slip
+                "start": 266 + (4393-1)*3,  # 13442
+                "end": 13468,
+            },
+            {
+                "start": 13468,
+                "end": 13468 + 2796 - 1
+            }
+        ],
+        "display_name": "NSP12; Cleaved from long polyprotein only; I'm not sure if the coordinates are correct, BTW!!!",
+        "strand": "+",
+    },
+    "S": {
+        "gene": "Spike",
+        "end": 25384,
+        "display_name": "structural protein; spike protein; surface glycoprotein",
+        "start": 21563,
+        "strand": "+",
+    },
+    "E": {
+        "end": 26472,
+        "display_name": "ORF4; structural protein; E protein",
+        "start": 26245,
+        "strand": "+",
+        "type": "CDS"
+    },
+    "M": {
+        "end": 27191,
+        "start": 26523,
+        "strand": "+",
+        "gene": "M",
+        "display_name": "ORF5; structural protein (membrane glycoprotein)"
+    },
+    "N": {
+        "end": 29533,
+        "display_name": "nucleocapsid phosphoprotein (ORF9)",
+        "start": 28274,
+        "strand": "+",
+    },
+    "ORF3a": {
+        "end": 26220,
+        "start": 25393,
+        "strand": "+",
+    },
+    "ORF6": {
+        "end": 27387,
+        "start": 27202,
+        "strand": "+",
+    },
+    "ORF7a": {
+        "end": 27759,
+        "start": 27394,
+        "strand": "+",
+    },
+    "ORF7b": {
+        "end": 27887,
+        "start": 27756,
+        "strand": "+",
+    },
+    "ORF8": {
+        "end": 28259,
+        "start": 27894,
+        "strand": "+",
+    },
+    "ORF9b": {
+        "end": 28577,
+        "start": 28284,
+        "strand": "+",
+    },
+}
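+
+# Rough sanity check: assuming Nextstrain-style 1-based, end-inclusive
+# coordinates, every CDS above (summed across its segments) should span a
+# whole number of codons. Worth asserting given the hand-computed
+# coordinates may harbour off-by-one errors.
+for _name, _annot in NEW_ANNOTATION.items():
+    if _name == "nuc":
+        continue
+    _length = sum(s["end"] - s["start"] + 1 for s in _annot.get("segments", [_annot]))
+    assert _length % 3 == 0, f"{_name}: {_length}nt is not a whole number of codons"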
+
+def a_pos_b(m):
+    """Split a mutation string such as 'S123T' into ('S', 123, 'T')."""
+    return (m[0], int(m[1:-1]), m[-1])
+
+def recurse(node):
+    """Rewrite this node's ORF1a/ORF1b mutations onto the annotations above, then recurse into its children."""
+    mutations = node.get('branch_attrs', {}).get('mutations', {})
+    if 'ORF1a' in mutations:
+        # ORF1a -> ORF1ab is no-change
+        mutations['ORF1ab'] = [*mutations['ORF1a']]
+        mutations['PP1a'] = [*mutations['ORF1a']]
+        del mutations['ORF1a']
+    if 'ORF1b' in mutations:
+        if 'ORF1ab' not in mutations:
+            mutations['ORF1ab'] = []
+        for m in mutations['ORF1b']:
+            # ORF1b is in phase with ORF1a; shift by ORF1a's 4401 aa to get PP1ab coordinates
+            a, pos, b = a_pos_b(m)
+            mutations['ORF1ab'].append(f"{a}{pos+4401}{b}")
+        del mutations['ORF1b']
+
+    # Extract mutations which fall in NSP3
+    if 'ORF1ab' in mutations:
+        mutations['NSP3'] = []
+        for m in mutations['ORF1ab']:
+            a, pos, b = a_pos_b(m)
+            # relative to PP1ab the coords are 819..2763 (in aa space)
+            if 819 <= pos <= 2763:
+                mutations['NSP3'].append(f"{a}{pos-819+1}{b}")
+
+    # Extract mutations which fall in RdRp
+    if 'ORF1ab' in mutations:
+        mutations['RdRp'] = []
+        for m in mutations['ORF1ab']:
+            a, pos, b = a_pos_b(m)
+            # relative to PP1ab the coords are 4393..5324 (in aa space, so don't need to worry about -1 slippage)
+            if 4393 <= pos <= 5324:
+                mutations['RdRp'].append(f"{a}{pos-4393+1}{b}")
+
+    for child in node.get("children", []):
+        recurse(child)
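+
+# Worked example of the remapping above: 'S123T' in ORF1b sits 123 aa into
+# ORF1b, i.e. 123 + 4401 = 4524 aa into PP1ab, which also falls inside RdRp
+# (4524 - 4393 + 1 = 132). These checks inherit any off-by-one errors in the
+# offsets used by recurse().
+_example = {'branch_attrs': {'mutations': {'ORF1b': ['S123T']}}}
+recurse(_example)
+assert _example['branch_attrs']['mutations']['ORF1ab'] == ['S4524T']
+assert _example['branch_attrs']['mutations']['RdRp'] == ['S132T']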
"flu_seasonal_h1n1pdm_na_2y_tip-frequencies.json" \ - "flu_seasonal_h1n1pdm_na_3y.json" "flu_seasonal_h1n1pdm_na_3y_tip-frequencies.json" \ - "flu_seasonal_h1n1pdm_na_6y.json" "flu_seasonal_h1n1pdm_na_6y_tip-frequencies.json" \ - "flu_seasonal_h1n1pdm_na_12y.json" "flu_seasonal_h1n1pdm_na_12y_tip-frequencies.json" \ - "flu_seasonal_h1n1pdm_na_pandemic_tree.json" "flu_seasonal_h1n1pdm_na_pandemic_meta.json" "flu_seasonal_h1n1pdm_na_pandemic_tip-frequencies.json" \ - "flu_seasonal_vic_ha_2y.json" "flu_seasonal_vic_ha_2y_tip-frequencies.json" "flu_seasonal_vic_ha_2y_root-sequence.json" \ - "flu_seasonal_vic_ha_3y.json" "flu_seasonal_vic_ha_3y_tip-frequencies.json" "flu_seasonal_vic_ha_3y_root-sequence.json" \ - "flu_seasonal_vic_ha_6y.json" "flu_seasonal_vic_ha_6y_tip-frequencies.json" "flu_seasonal_vic_ha_6y_root-sequence.json" \ - "flu_seasonal_vic_ha_12y.json" "flu_seasonal_vic_ha_12y_tip-frequencies.json" "flu_seasonal_vic_ha_12y_root-sequence.json" \ - "flu_seasonal_vic_na_2y.json" "flu_seasonal_vic_na_2y_tip-frequencies.json" "flu_seasonal_vic_na_2y_root-sequence.json" \ - "flu_seasonal_vic_na_3y.json" "flu_seasonal_vic_na_3y_tip-frequencies.json" "flu_seasonal_vic_na_3y_root-sequence.json" \ - "flu_seasonal_vic_na_6y.json" "flu_seasonal_vic_na_6y_tip-frequencies.json" "flu_seasonal_vic_na_6y_root-sequence.json" \ - "flu_seasonal_vic_na_12y.json" "flu_seasonal_vic_na_12y_tip-frequencies.json" "flu_seasonal_vic_na_12y_root-sequence.json" \ - "flu_seasonal_yam_ha_2y.json" "flu_seasonal_yam_ha_2y_tip-frequencies.json" "flu_seasonal_yam_ha_2y_root-sequence.json" \ - "flu_seasonal_yam_ha_3y.json" "flu_seasonal_yam_ha_3y_tip-frequencies.json" "flu_seasonal_yam_ha_3y_root-sequence.json" \ - "flu_seasonal_yam_ha_6y.json" "flu_seasonal_yam_ha_6y_tip-frequencies.json" "flu_seasonal_yam_ha_6y_root-sequence.json" \ - "flu_seasonal_yam_ha_12y.json" "flu_seasonal_yam_ha_12y_tip-frequencies.json" "flu_seasonal_yam_ha_12y_root-sequence.json" \ - "flu_seasonal_yam_na_2y.json" "flu_seasonal_yam_na_2y_tip-frequencies.json" "flu_seasonal_yam_na_2y_root-sequence.json" \ - "flu_seasonal_yam_na_3y.json" "flu_seasonal_yam_na_3y_tip-frequencies.json" "flu_seasonal_yam_na_3y_root-sequence.json" \ - "flu_seasonal_yam_na_6y.json" "flu_seasonal_yam_na_6y_tip-frequencies.json" "flu_seasonal_yam_na_6y_root-sequence.json" \ - "flu_seasonal_yam_na_12y.json" "flu_seasonal_yam_na_12y_tip-frequencies.json" "flu_seasonal_yam_na_12y_root-sequence.json" \ - ############## LATEST CORE SARS-CoV-2 (COVID-19) BUILDS ############## - "ncov_gisaid_global.json" "ncov_gisaid_global_tip-frequencies.json" \ - "ncov_gisaid_africa.json" "ncov_gisaid_africa_tip-frequencies.json" \ - "ncov_gisaid_asia.json" "ncov_gisaid_asia_tip-frequencies.json" \ - "ncov_gisaid_europe.json" "ncov_gisaid_europe_tip-frequencies.json" \ - "ncov_gisaid_north-america.json" "ncov_gisaid_north-america_tip-frequencies.json" \ - "ncov_gisaid_oceania.json" "ncov_gisaid_oceania_tip-frequencies.json" \ - "ncov_gisaid_south-america.json" "ncov_gisaid_south-america_tip-frequencies.json" \ - ############## TIMESTAMPED SARS-CoV-2 BUILDS USED IN NARRATIVES ############# - "ncov_2020-01-23.json" "ncov_2020-01-25.json" "ncov_2020-01-26.json" "ncov_2020-01-30.json" \ - "ncov_2020-03-04.json" "ncov_2020-03-05.json" "ncov_2020-03-11.json" "ncov_2020-03-13.json" \ - "ncov_2020-03-20.json" "ncov_2020-03-27.json" "ncov_2020-04-03.json" \ - "ncov_global_2020-04-09.json" "ncov_north-america_2020-04-17.json" \ + "monkeypox_mpxv.json" \ ) rm -rf data/ mkdir -p data/ for i in 
"${data_files[@]}" do - curl http://data.nextstrain.org/"${i}" --compressed -o data/"${i}" + curl http://staging.nextstrain.org/"${i}" --compressed -o data/"${i}" done echo "Copying the test datasets from test/data to data"