From 3b1685410fbb706b8d1025a76fe43f7eba51e3c0 Mon Sep 17 00:00:00 2001
From: James Hadfield
Date: Fri, 4 Aug 2023 15:49:48 +1200
Subject: [PATCH] DROP! test datasets

A Python script to post-process an nCoV dataset to create annotations
which more closely match the biology. There may be some off-by-one
errors, and it's incomplete, but it's very useful for testing. Of
course we would use Nextclade to actually generate the translations.

P.S. The get-data script is run for Heroku review-apps.
---
 post-process-ncov.py | 156 +++++++++++++++++++++++++++++++++++++++++++
 scripts/get-data.sh  |  74 ++------------------
 2 files changed, 160 insertions(+), 70 deletions(-)
 create mode 100644 post-process-ncov.py

diff --git a/post-process-ncov.py b/post-process-ncov.py
new file mode 100644
index 000000000..8434e8e2c
--- /dev/null
+++ b/post-process-ncov.py
@@ -0,0 +1,156 @@
+import json
+
+NEW_ANNOTATION = {
+    "nuc": {
+        "start": 1,
+        "end": 29903,
+        "strand": "+"
+    },
+    "ORF1ab": {
+        "gene": "ORF1ab",
+        "strand": "+",
+        "segments": [
+            {"start": 266, "end": 13468, "name": "ORF1a"},
+            {"start": 13468, "end": 21555, "name": "ORF1b"}
+        ],
+        "display_name": "AKA polyprotein PP1ab. -1 ribosomal frameshift. Cleaved to yield 15 nonstructural proteins (NSP1-10, 12-16)"
+    },
+    "PP1a": {
+        "gene": "ORF1ab",
+        "start": 266,
+        "end": 13483,
+        "display_name": "Polyprotein PP1a. Cleaved to yield 11 nonstructural proteins (NSP1-11)"
+    },
+    "NSP3": {
+        "gene": "ORF1ab",
+        "color": "#2c7fb8",
+        # PP1ab aa coords 819..2763, converted to 1-based nucleotides
+        "start": 266 + (819-1)*3,
+        "end": 266 + (2763-1)*3 - 1,
+        "display_name": "Cleaved from short + long polyproteins",
+        "strand": "+",
+    },
+    "RdRp": {
+        "gene": "ORF1ab",
+        "color": "#41b6c4",
+        # Length is 2796nt (932aa)
+        "segments": [
+            {  # first segment is before the slip
+                "start": 266 + (4393-1)*3,  # 13442
+                "end": 13468,
+            },
+            {
+                "start": 13468,
+                "end": 13468 + 2796 - 1
+            }
+        ],
+        "display_name": "NSP12; Cleaved from long polyprotein only; I'm not sure if the coordinates are correct, BTW!!!",
+        "strand": "+",
+    },
+    "S": {
+        "gene": "Spike",
+        "end": 25384,
+        "display_name": "structural protein; spike protein; surface glycoprotein",
+        "start": 21563,
+        "strand": "+",
+    },
+    "E": {
+        "end": 26472,
+        "display_name": "ORF4; structural protein; E protein",
+        "start": 26245,
+        "strand": "+",
+        "type": "CDS"
+    },
+    "M": {
+        "end": 27191,
+        "start": 26523,
+        "strand": "+",
+        "gene": "M",
+        "display_name": "ORF5; structural protein (membrane glycoprotein)"
+    },
+    "N": {
+        "end": 29533,
+        "display_name": "nucleocapsid phosphoprotein (ORF9)",
+        "start": 28274,
+        "strand": "+",
+    },
+    "ORF3a": {
+        "end": 26220,
+        "start": 25393,
+        "strand": "+",
+    },
+    "ORF6": {
+        "end": 27387,
+        "start": 27202,
+        "strand": "+",
+    },
+    "ORF7a": {
+        "end": 27759,
+        "start": 27394,
+        "strand": "+",
+    },
+    "ORF7b": {
+        "end": 27887,
+        "start": 27756,
+        "strand": "+",
+    },
+    "ORF8": {
+        "end": 28259,
+        "start": 27894,
+        "strand": "+",
+    },
+    "ORF9b": {
+        "end": 28577,
+        "start": 28284,
+        "strand": "+",
+    },
+}
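+
+# Rough sanity check: assuming Nextstrain-style 1-based, end-inclusive
+# coordinates, every CDS above (summed across its segments) should span a
+# whole number of codons. Worth asserting given the hand-computed
+# coordinates may harbour off-by-one errors.
+for _name, _annot in NEW_ANNOTATION.items():
+    if _name == "nuc":
+        continue
+    _length = sum(s["end"] - s["start"] + 1 for s in _annot.get("segments", [_annot]))
+    assert _length % 3 == 0, f"{_name}: {_length}nt is not a whole number of codons"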
+
+def a_pos_b(m):
+    """Split a mutation string such as 'S123T' into ('S', 123, 'T')."""
+    return (m[0], int(m[1:-1]), m[-1])
+
+def recurse(node):
+    """Rewrite this node's ORF1a/ORF1b mutations onto the annotations above, then recurse into its children."""
+    mutations = node.get('branch_attrs', {}).get('mutations', {})
+    if 'ORF1a' in mutations:
+        # ORF1a -> ORF1ab is no-change
+        mutations['ORF1ab'] = [*mutations['ORF1a']]
+        mutations['PP1a'] = [*mutations['ORF1a']]
+        del mutations['ORF1a']
+    if 'ORF1b' in mutations:
+        if 'ORF1ab' not in mutations:
+            mutations['ORF1ab'] = []
+        for m in mutations['ORF1b']:
+            # ORF1b is in phase with ORF1a; shift by ORF1a's 4401 aa to get PP1ab coordinates
+            a, pos, b = a_pos_b(m)
+            mutations['ORF1ab'].append(f"{a}{pos+4401}{b}")
+        del mutations['ORF1b']
+
+    # Extract mutations which fall in NSP3
+    if 'ORF1ab' in mutations:
+        mutations['NSP3'] = []
+        for m in mutations['ORF1ab']:
+            a, pos, b = a_pos_b(m)
+            # relative to PP1ab the coords are 819..2763 (in aa space)
+            if 819 <= pos <= 2763:
+                mutations['NSP3'].append(f"{a}{pos-819+1}{b}")
+
+    # Extract mutations which fall in RdRp
+    if 'ORF1ab' in mutations:
+        mutations['RdRp'] = []
+        for m in mutations['ORF1ab']:
+            a, pos, b = a_pos_b(m)
+            # relative to PP1ab the coords are 4393..5324 (in aa space, so don't need to worry about -1 slippage)
+            if 4393 <= pos <= 5324:
+                mutations['RdRp'].append(f"{a}{pos-4393+1}{b}")
+
+    for child in node.get("children", []):
+        recurse(child)
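+
+# Worked example of the remapping above: 'S123T' in ORF1b sits 123 aa into
+# ORF1b, i.e. 123 + 4401 = 4524 aa into PP1ab, which also falls inside RdRp
+# (4524 - 4393 + 1 = 132). These checks inherit any off-by-one errors in the
+# offsets used by recurse().
+_example = {'branch_attrs': {'mutations': {'ORF1b': ['S123T']}}}
+recurse(_example)
+assert _example['branch_attrs']['mutations']['ORF1ab'] == ['S4524T']
+assert _example['branch_attrs']['mutations']['RdRp'] == ['S132T']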
"flu_seasonal_h1n1pdm_na_2y_tip-frequencies.json" \ - "flu_seasonal_h1n1pdm_na_3y.json" "flu_seasonal_h1n1pdm_na_3y_tip-frequencies.json" \ - "flu_seasonal_h1n1pdm_na_6y.json" "flu_seasonal_h1n1pdm_na_6y_tip-frequencies.json" \ - "flu_seasonal_h1n1pdm_na_12y.json" "flu_seasonal_h1n1pdm_na_12y_tip-frequencies.json" \ - "flu_seasonal_h1n1pdm_na_pandemic_tree.json" "flu_seasonal_h1n1pdm_na_pandemic_meta.json" "flu_seasonal_h1n1pdm_na_pandemic_tip-frequencies.json" \ - "flu_seasonal_vic_ha_2y.json" "flu_seasonal_vic_ha_2y_tip-frequencies.json" "flu_seasonal_vic_ha_2y_root-sequence.json" \ - "flu_seasonal_vic_ha_3y.json" "flu_seasonal_vic_ha_3y_tip-frequencies.json" "flu_seasonal_vic_ha_3y_root-sequence.json" \ - "flu_seasonal_vic_ha_6y.json" "flu_seasonal_vic_ha_6y_tip-frequencies.json" "flu_seasonal_vic_ha_6y_root-sequence.json" \ - "flu_seasonal_vic_ha_12y.json" "flu_seasonal_vic_ha_12y_tip-frequencies.json" "flu_seasonal_vic_ha_12y_root-sequence.json" \ - "flu_seasonal_vic_na_2y.json" "flu_seasonal_vic_na_2y_tip-frequencies.json" "flu_seasonal_vic_na_2y_root-sequence.json" \ - "flu_seasonal_vic_na_3y.json" "flu_seasonal_vic_na_3y_tip-frequencies.json" "flu_seasonal_vic_na_3y_root-sequence.json" \ - "flu_seasonal_vic_na_6y.json" "flu_seasonal_vic_na_6y_tip-frequencies.json" "flu_seasonal_vic_na_6y_root-sequence.json" \ - "flu_seasonal_vic_na_12y.json" "flu_seasonal_vic_na_12y_tip-frequencies.json" "flu_seasonal_vic_na_12y_root-sequence.json" \ - "flu_seasonal_yam_ha_2y.json" "flu_seasonal_yam_ha_2y_tip-frequencies.json" "flu_seasonal_yam_ha_2y_root-sequence.json" \ - "flu_seasonal_yam_ha_3y.json" "flu_seasonal_yam_ha_3y_tip-frequencies.json" "flu_seasonal_yam_ha_3y_root-sequence.json" \ - "flu_seasonal_yam_ha_6y.json" "flu_seasonal_yam_ha_6y_tip-frequencies.json" "flu_seasonal_yam_ha_6y_root-sequence.json" \ - "flu_seasonal_yam_ha_12y.json" "flu_seasonal_yam_ha_12y_tip-frequencies.json" "flu_seasonal_yam_ha_12y_root-sequence.json" \ - "flu_seasonal_yam_na_2y.json" "flu_seasonal_yam_na_2y_tip-frequencies.json" "flu_seasonal_yam_na_2y_root-sequence.json" \ - "flu_seasonal_yam_na_3y.json" "flu_seasonal_yam_na_3y_tip-frequencies.json" "flu_seasonal_yam_na_3y_root-sequence.json" \ - "flu_seasonal_yam_na_6y.json" "flu_seasonal_yam_na_6y_tip-frequencies.json" "flu_seasonal_yam_na_6y_root-sequence.json" \ - "flu_seasonal_yam_na_12y.json" "flu_seasonal_yam_na_12y_tip-frequencies.json" "flu_seasonal_yam_na_12y_root-sequence.json" \ - ############## LATEST CORE SARS-CoV-2 (COVID-19) BUILDS ############## - "ncov_gisaid_global.json" "ncov_gisaid_global_tip-frequencies.json" \ - "ncov_gisaid_africa.json" "ncov_gisaid_africa_tip-frequencies.json" \ - "ncov_gisaid_asia.json" "ncov_gisaid_asia_tip-frequencies.json" \ - "ncov_gisaid_europe.json" "ncov_gisaid_europe_tip-frequencies.json" \ - "ncov_gisaid_north-america.json" "ncov_gisaid_north-america_tip-frequencies.json" \ - "ncov_gisaid_oceania.json" "ncov_gisaid_oceania_tip-frequencies.json" \ - "ncov_gisaid_south-america.json" "ncov_gisaid_south-america_tip-frequencies.json" \ - ############## TIMESTAMPED SARS-CoV-2 BUILDS USED IN NARRATIVES ############# - "ncov_2020-01-23.json" "ncov_2020-01-25.json" "ncov_2020-01-26.json" "ncov_2020-01-30.json" \ - "ncov_2020-03-04.json" "ncov_2020-03-05.json" "ncov_2020-03-11.json" "ncov_2020-03-13.json" \ - "ncov_2020-03-20.json" "ncov_2020-03-27.json" "ncov_2020-04-03.json" \ - "ncov_global_2020-04-09.json" "ncov_north-america_2020-04-17.json" \ + "monkeypox_mpxv.json" \ ) rm -rf data/ mkdir -p data/ for i in 
"${data_files[@]}" do - curl http://data.nextstrain.org/"${i}" --compressed -o data/"${i}" + curl http://staging.nextstrain.org/"${i}" --compressed -o data/"${i}" done echo "Copying the test datasets from test/data to data"