DROP! test datasets

A Python script to post-process an nCoV dataset to create annotations
which more closely match the biology. There may be some off-by-one
errors, and it's incomplete, but it's very useful for testing. Of course,
we would use Nextclade to actually generate the translations.

P.S. The get-data script is run for Heroku review apps.
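
For example (following the renumbering logic in the script below), a
branch mutation ORF1b P314L is copied to ORF1ab as P4715L (314 + 4401 =
4715) and also extracted into RdRp as P323L (4715 - 4393 + 1 = 323).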
jameshadfield committed Aug 17, 2023
1 parent 40f4ee4 commit 3b16854
Showing 2 changed files with 160 additions and 70 deletions.
156 changes: 156 additions & 0 deletions post-process-ncov.py
@@ -0,0 +1,156 @@
NEW_ANNOTATION = {
"nuc": {
"start": 1,
"end": 29903,
"strand": "+"
},
"ORF1ab": {
"gene": "ORF1ab",
"strand": "+",
"segments":[
{"start": 266, "end": 13468, "name": "ORF1a"},
{"start": 13468, "end": 21555, "name": "ORF1b"}
],
"display_name": "AKA polyprotein PP1ab. -1 ribisomal frameshift. Cleaved to yield 15 nonstructural proteins (NSP1-10, 12-16)"
},
"PP1a": {
"gene": "ORF1ab",
"start": 266,
"end": 13483,
"display_name": "Polyprotein PP1a. Cleaved to yield 11 nonstructural proteins (NSP1-11)"
},
"NSP3": {
"gene": "ORF1ab",
"color": "#2c7fb8",
"start": 266 + (819-1)*3,
"end": 266 + (2763-1)*3 -1,
"display_name": "Cleaved from short + long polyproteins",
"strand": "+",
},
"RdRp": {
"gene": "ORF1ab",
"color": "#41b6c4",
# Length is 2796nt (932aa)
"segments":[
{ # first segment is before the slip
"start": 266 + (4393-1)*3, # 13442
"end": 13468,
},
{
"start": 13468,
"end": 13468 + 2796 -1
}
],
"display_name": "NSP12; Cleaved from long polyprotein only; I'm not sure if the coordinates are correct, BTW!!!",
"strand": "+",
},
"S": {
"gene": "Spike",
"end": 25384,
"display_name": "structural protein; spike protein; surface glycoprotein",
"start": 21563,
"strand": "+",
},
"E": {
"end": 26472,
"dsiplay_name": "ORF4; structural protein; E protein",
"start": 26245,
"strand": "+",
"type": "CDS"
},
"M": {
"end": 27191,
"start": 26523,
"strand": "+",
"gene": "M",
"display_name": "ORF5; structural protein (membrane glycoprotein)"
},
"N": {
"end": 29533,
"display_name": "nucleocapsid phosphoprotein (ORF9)",
"start": 28274,
"strand": "+",
},
"ORF3a": {
"end": 26220,
"start": 25393,
"strand": "+",
},
"ORF6": {
"end": 27387,
"start": 27202,
"strand": "+",
},
"ORF7a": {
"end": 27759,
"start": 27394,
"strand": "+",
},
"ORF7b": {
"end": 27887,
"start": 27756,
"strand": "+",
},
"ORF8": {
"end": 28259,
"start": 27894,
"strand": "+",
},
"ORF9b": {
"end": 28577,
"start": 28284,
"strand": "+",
},
}
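
# Optional sanity check (illustrative addition, not needed for the conversion):
# every CDS length, summing segments, should be a multiple of 3. Note that the
# two ORF1ab segments deliberately share position 13468: that is how the
# -1 ribosomal frameshift is represented.
def _annotation_length(entry):
    segments = entry.get("segments")
    if segments is None:
        segments = [{"start": entry["start"], "end": entry["end"]}]
    return sum(s["end"] - s["start"] + 1 for s in segments)

for _name, _entry in NEW_ANNOTATION.items():
    if _name != "nuc":  # the genome itself (29903 nt) need not be a codon multiple
        assert _annotation_length(_entry) % 3 == 0, _name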

def a_pos_b(m):
    """Split a mutation string into (from, position, to), e.g. "S123T" -> ("S", 123, "T")."""
    return (m[0], int(m[1:-1]), m[-1])

def recurse(node):

mutations = node.get('branch_attrs', {}).get('mutations', {})
if 'ORF1a' in mutations:
# ORF1a -> ORF1ab is no-change
mutations['ORF1ab'] = [*mutations['ORF1a']]
mutations['PP1a'] = [*mutations['ORF1a']]
del mutations['ORF1a']
if 'ORF1b' in mutations:
if 'ORF1ab' not in mutations:
            mutations['ORF1ab'] = []
for m in mutations['ORF1b']:
# ORF1b is in phase with ORF1a
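            # (ORF1a is (13468 - 266 + 1) / 3 = 4401 codons, so e.g.
            # ORF1b P314L becomes ORF1ab P4715L)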
a, pos, b = a_pos_b(m)
mutations['ORF1ab'].append(f"{a}{pos+4401}{b}")
del mutations['ORF1b']

# Extract mutations which fall in NSP3
if 'ORF1ab' in mutations:
mutations['NSP3'] = []
for m in mutations['ORF1ab']:
a, pos, b = a_pos_b(m)
# relative to PP1ab the coords are 819..2763 (in aa space)
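            # (so ORF1ab 819 renumbers to NSP3 1, and 2763 to 1945, NSP3's aa length)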
if pos>=819 and pos<=2763:
mutations['NSP3'].append(f"{a}{pos-819+1}{b}")

# Extract mutations which fall in RdRp
if 'ORF1ab' in mutations:
mutations['RdRp'] = []
for m in mutations['ORF1ab']:
a, pos, b = a_pos_b(m)
# relative to PP1ab the coords are 4393..5324 (in aa space, so don't need to worry about -1 slippage)
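            # (e.g. ORF1ab P4715L renumbers to RdRp P323L: 4715 - 4393 + 1 = 323)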
if pos>=4393 and pos<=5324:
mutations['RdRp'].append(f"{a}{pos-4393+1}{b}")

if "children" in node:
[recurse(child) for child in node["children"]]



import json
with open("./data/nextclade_sars-cov-2.json", 'r') as fh:
dataset = json.load(fh)
recurse(dataset['tree'])
dataset['meta']['genome_annotations'] = NEW_ANNOTATION
dataset['meta']['title'] = 'nCoV with adjusted annotations (use with caution!)'
with open("./datasets/entropy2023/entropy-test-data_ncov.json", 'w') as fh:
json.dump(dataset, fh, indent=2)
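
# To regenerate the test dataset (paths are hard-coded above): place the
# Nextclade SARS-CoV-2 dataset at ./data/nextclade_sars-cov-2.json, then run
#     python post-process-ncov.py
# and the adjusted dataset is written to
# ./datasets/entropy2023/entropy-test-data_ncov.json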
74 changes: 4 additions & 70 deletions scripts/get-data.sh
@@ -1,83 +1,17 @@
#!/bin/bash

data_files=(
"dengue_all.json" "dengue_denv1.json" "dengue_denv2.json" "dengue_denv3.json" "dengue_denv4.json"\
"ebola.json" "ebola_root-sequence.json" \
"ebola_2019-09-14-no-epi-id_meta.json" "ebola_2019-09-14-no-epi-id_tree.json" \
"lassa_s_tree.json" "lassa_s_meta.json" \
"lassa_l_tree.json" "lassa_l_meta.json" \
"measles.json" \
"mers_tree.json" "mers_meta.json" \
"mumps_global.json" "mumps_na.json" \
"WNV_NA_tree.json" "WNV_NA_meta.json" \
"entropy-test-data_hepB.json" \
"entropy-test-data_ncov.json" \
"zika.json" \
"tb_global_meta.json" "tb_global_tree.json" \
"enterovirus_d68_genome_meta.json" "enterovirus_d68_genome_tree.json" \
"enterovirus_d68_vp1_meta.json" "enterovirus_d68_vp1_tree.json" \
############## AVIAN FLU ##############
"flu_avian_h7n9_ha.json" \
"flu_avian_h7n9_mp.json" \
"flu_avian_h7n9_na.json" \
"flu_avian_h7n9_np.json" \
"flu_avian_h7n9_ns.json" \
"flu_avian_h7n9_pa.json" \
"flu_avian_h7n9_pb1.json" \
"flu_avian_h7n9_pb2.json" \
############## SEASONAL FLU ##############
"flu_seasonal_h3n2_ha_2y.json" "flu_seasonal_h3n2_ha_2y_tip-frequencies.json" \
"flu_seasonal_h3n2_ha_3y.json" "flu_seasonal_h3n2_ha_3y_tip-frequencies.json" \
"flu_seasonal_h3n2_ha_6y.json" "flu_seasonal_h3n2_ha_6y_tip-frequencies.json" \
"flu_seasonal_h3n2_ha_12y.json" "flu_seasonal_h3n2_ha_12y_tip-frequencies.json" \
"flu_seasonal_h3n2_na_2y.json" "flu_seasonal_h3n2_na_2y_tip-frequencies.json" \
"flu_seasonal_h3n2_na_3y.json" "flu_seasonal_h3n2_na_3y_tip-frequencies.json" \
"flu_seasonal_h3n2_na_6y.json" "flu_seasonal_h3n2_na_6y_tip-frequencies.json" \
"flu_seasonal_h3n2_na_12y.json" "flu_seasonal_h3n2_na_12y_tip-frequencies.json" \
"flu_seasonal_h1n1pdm_ha_2y.json" "flu_seasonal_h1n1pdm_ha_2y_tip-frequencies.json" \
"flu_seasonal_h1n1pdm_ha_3y.json" "flu_seasonal_h1n1pdm_ha_3y_tip-frequencies.json" \
"flu_seasonal_h1n1pdm_ha_6y.json" "flu_seasonal_h1n1pdm_ha_6y_tip-frequencies.json" \
"flu_seasonal_h1n1pdm_ha_12y.json" "flu_seasonal_h1n1pdm_ha_12y_tip-frequencies.json" \
"flu_seasonal_h1n1pdm_ha_pandemic_meta.json" "flu_seasonal_h1n1pdm_ha_pandemic_tree.json" "flu_seasonal_h1n1pdm_ha_pandemic_tip-frequencies.json" \
"flu_seasonal_h1n1pdm_na_2y.json" "flu_seasonal_h1n1pdm_na_2y_tip-frequencies.json" \
"flu_seasonal_h1n1pdm_na_3y.json" "flu_seasonal_h1n1pdm_na_3y_tip-frequencies.json" \
"flu_seasonal_h1n1pdm_na_6y.json" "flu_seasonal_h1n1pdm_na_6y_tip-frequencies.json" \
"flu_seasonal_h1n1pdm_na_12y.json" "flu_seasonal_h1n1pdm_na_12y_tip-frequencies.json" \
"flu_seasonal_h1n1pdm_na_pandemic_tree.json" "flu_seasonal_h1n1pdm_na_pandemic_meta.json" "flu_seasonal_h1n1pdm_na_pandemic_tip-frequencies.json" \
"flu_seasonal_vic_ha_2y.json" "flu_seasonal_vic_ha_2y_tip-frequencies.json" "flu_seasonal_vic_ha_2y_root-sequence.json" \
"flu_seasonal_vic_ha_3y.json" "flu_seasonal_vic_ha_3y_tip-frequencies.json" "flu_seasonal_vic_ha_3y_root-sequence.json" \
"flu_seasonal_vic_ha_6y.json" "flu_seasonal_vic_ha_6y_tip-frequencies.json" "flu_seasonal_vic_ha_6y_root-sequence.json" \
"flu_seasonal_vic_ha_12y.json" "flu_seasonal_vic_ha_12y_tip-frequencies.json" "flu_seasonal_vic_ha_12y_root-sequence.json" \
"flu_seasonal_vic_na_2y.json" "flu_seasonal_vic_na_2y_tip-frequencies.json" "flu_seasonal_vic_na_2y_root-sequence.json" \
"flu_seasonal_vic_na_3y.json" "flu_seasonal_vic_na_3y_tip-frequencies.json" "flu_seasonal_vic_na_3y_root-sequence.json" \
"flu_seasonal_vic_na_6y.json" "flu_seasonal_vic_na_6y_tip-frequencies.json" "flu_seasonal_vic_na_6y_root-sequence.json" \
"flu_seasonal_vic_na_12y.json" "flu_seasonal_vic_na_12y_tip-frequencies.json" "flu_seasonal_vic_na_12y_root-sequence.json" \
"flu_seasonal_yam_ha_2y.json" "flu_seasonal_yam_ha_2y_tip-frequencies.json" "flu_seasonal_yam_ha_2y_root-sequence.json" \
"flu_seasonal_yam_ha_3y.json" "flu_seasonal_yam_ha_3y_tip-frequencies.json" "flu_seasonal_yam_ha_3y_root-sequence.json" \
"flu_seasonal_yam_ha_6y.json" "flu_seasonal_yam_ha_6y_tip-frequencies.json" "flu_seasonal_yam_ha_6y_root-sequence.json" \
"flu_seasonal_yam_ha_12y.json" "flu_seasonal_yam_ha_12y_tip-frequencies.json" "flu_seasonal_yam_ha_12y_root-sequence.json" \
"flu_seasonal_yam_na_2y.json" "flu_seasonal_yam_na_2y_tip-frequencies.json" "flu_seasonal_yam_na_2y_root-sequence.json" \
"flu_seasonal_yam_na_3y.json" "flu_seasonal_yam_na_3y_tip-frequencies.json" "flu_seasonal_yam_na_3y_root-sequence.json" \
"flu_seasonal_yam_na_6y.json" "flu_seasonal_yam_na_6y_tip-frequencies.json" "flu_seasonal_yam_na_6y_root-sequence.json" \
"flu_seasonal_yam_na_12y.json" "flu_seasonal_yam_na_12y_tip-frequencies.json" "flu_seasonal_yam_na_12y_root-sequence.json" \
############## LATEST CORE SARS-CoV-2 (COVID-19) BUILDS ##############
"ncov_gisaid_global.json" "ncov_gisaid_global_tip-frequencies.json" \
"ncov_gisaid_africa.json" "ncov_gisaid_africa_tip-frequencies.json" \
"ncov_gisaid_asia.json" "ncov_gisaid_asia_tip-frequencies.json" \
"ncov_gisaid_europe.json" "ncov_gisaid_europe_tip-frequencies.json" \
"ncov_gisaid_north-america.json" "ncov_gisaid_north-america_tip-frequencies.json" \
"ncov_gisaid_oceania.json" "ncov_gisaid_oceania_tip-frequencies.json" \
"ncov_gisaid_south-america.json" "ncov_gisaid_south-america_tip-frequencies.json" \
############## TIMESTAMPED SARS-CoV-2 BUILDS USED IN NARRATIVES #############
"ncov_2020-01-23.json" "ncov_2020-01-25.json" "ncov_2020-01-26.json" "ncov_2020-01-30.json" \
"ncov_2020-03-04.json" "ncov_2020-03-05.json" "ncov_2020-03-11.json" "ncov_2020-03-13.json" \
"ncov_2020-03-20.json" "ncov_2020-03-27.json" "ncov_2020-04-03.json" \
"ncov_global_2020-04-09.json" "ncov_north-america_2020-04-17.json" \
"monkeypox_mpxv.json" \
)

rm -rf data/
mkdir -p data/
for i in "${data_files[@]}"
do
curl http://data.nextstrain.org/"${i}" --compressed -o data/"${i}"
curl http://staging.nextstrain.org/"${i}" --compressed -o data/"${i}"
done

echo "Copying the test datasets from test/data to data"
