Skip to content

Commit

Permalink
Merge pull request #891 from nextstrain/fix-single-trait-inference
Browse files Browse the repository at this point in the history
Fix single trait inference
  • Loading branch information
huddlej authored Apr 19, 2022
2 parents c264580 + 53cf195 commit 002c44f
Show file tree
Hide file tree
Showing 6 changed files with 240 additions and 13 deletions.
32 changes: 19 additions & 13 deletions augur/traits.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,12 @@ def mugration_inference(tree=None, seq_meta=None, field='country', confidence=Tr
unique_states = list(set(traits.values()))

if len(unique_states)==0:
print("WARNING: no states found for discrete state reconstruction.")
print("WARNING: no states found for discrete state reconstruction.", file=sys.stderr)
for node in T.find_clades():
node.__setattr__(field, None)
return T, None, {}
elif len(unique_states)==1:
print("WARNING: only one state found for discrete state reconstruction:", unique_states)
print("WARNING: only one state found for discrete state reconstruction:", unique_states, file=sys.stderr)
for node in T.find_clades():
node.__setattr__(field, unique_states[0])
return T, None, {}
Expand All @@ -69,11 +69,11 @@ def mugration_inference(tree=None, seq_meta=None, field='country', confidence=Tr
reconstruct_discrete_traits(T, traits, missing_data=missing,
sampling_bias_correction=sampling_bias_correction, weights=weights)
else:
print("ERROR: 300 or more distinct discrete states found. TreeTime is currently not set up to handle that many states.")
print("ERROR: 300 or more distinct discrete states found. TreeTime is currently not set up to handle that many states.", file=sys.stderr)
sys.exit(1)

if tt is None:
print("ERROR in discrete state reconstruction in TreeTime. Please look for errors above.")
print("ERROR in discrete state reconstruction in TreeTime. Please look for errors above.", file=sys.stderr)
sys.exit(1)

# attach inferred states as e.g. node.region = 'africa'
Expand Down Expand Up @@ -135,12 +135,12 @@ def run(args):
T = Phylo.read(tree_fname, 'newick')
missing_internal_node_names = [n.name is None for n in T.get_nonterminals()]
if np.all(missing_internal_node_names):
print("\n*** WARNING: Tree has no internal node names!")
print("*** Without internal node names, ancestral traits can't be linked up to the correct node later.")
print("*** If you want to use 'augur export' later, re-run this command with the output of 'augur refine'.")
print("*** If you haven't run 'augur refine', you can add node names to your tree by running:")
print("*** augur refine --tree %s --output-tree <filename>.nwk"%(tree_fname) )
print("*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'")
print("\n*** WARNING: Tree has no internal node names!", file=sys.stderr)
print("*** Without internal node names, ancestral traits can't be linked up to the correct node later.", file=sys.stderr)
print("*** If you want to use 'augur export' later, re-run this command with the output of 'augur refine'.", file=sys.stderr)
print("*** If you haven't run 'augur refine', you can add node names to your tree by running:", file=sys.stderr)
print("*** augur refine --tree %s --output-tree <filename>.nwk" % (tree_fname), file=sys.stderr)
print("*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'", file=sys.stderr)

if args.weights:
weight_dict = {c:{} for c in args.columns}
Expand Down Expand Up @@ -174,10 +174,16 @@ def run(args):
continue

for node in T.find_clades():
mugration_states[node.name][column] = node.__getattribute__(column)
mugration_states[node.name][column] = getattr(node, column)

if args.confidence:
mugration_states[node.name][column+'_confidence'] = node.__getattribute__(column+'_confidence')
mugration_states[node.name][column+'_entropy'] = node.__getattribute__(column+'_entropy')
confidence = getattr(node, f"{column}_confidence", None)
if confidence is not None:
mugration_states[node.name][f"{column}_confidence"] = confidence

entropy = getattr(node, f"{column}_entropy", None)
if entropy is not None:
mugration_states[node.name][f"{column}_entropy"] = entropy

if gtr:
# add gtr models to json structure for export
Expand Down
49 changes: 49 additions & 0 deletions tests/functional/traits.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
Integration tests for augur traits.

$ pushd "$TESTDIR" > /dev/null
$ export AUGUR="../../bin/augur"

Infer the ancestral region for a given tree and metadata.

$ ${AUGUR} traits \
> --metadata "traits/metadata.tsv" \
> --tree "traits/tree.nwk" \
> --columns region \
> --output-node-data "$TMP/traits.json" > /dev/null

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" "traits/traits_region.json" "$TMP/traits.json" --significant-digits 5
{}
$ rm -f "$TMP/traits.json"

Infer the ancestral "virus" value from the same metadata.
Since there is only a single virus in the data, Augur warns the user through stderr.

$ ${AUGUR} traits \
> --metadata "traits/metadata.tsv" \
> --tree "traits/tree.nwk" \
> --columns virus \
> --output-node-data "$TMP/traits.json" > /dev/null
WARNING: only one state found for discrete state reconstruction: ['zika']

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" "traits/traits_virus.json" "$TMP/traits.json" --significant-digits 5
{}
$ rm -f "$TMP/traits.json"

Repeat inference of a trait with a single value, but request confidence intervals.
This should similarly warn the user through stderr, but it should produce an error.

$ ${AUGUR} traits \
> --metadata "traits/metadata.tsv" \
> --tree "traits/tree.nwk" \
> --columns virus \
> --confidence \
> --output-node-data "$TMP/traits.json" > /dev/null
WARNING: only one state found for discrete state reconstruction: ['zika']

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" "traits/traits_virus.json" "$TMP/traits.json" --significant-digits 5
{}
$ rm -f "$TMP/traits.json"

Switch back to the original directory where testing started.

$ popd > /dev/null
13 changes: 13 additions & 0 deletions tests/functional/traits/metadata.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
strain virus accession date region country division city db segment authors url title journal paper_url
PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama Panama Panama genbank genome Shabman et al https://www.ncbi.nlm.nih.gov/nuccore/KX156774 Direct Submission Submitted (29-APR-2016) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501215 Phylogeny of Zika Virus in Western Hemisphere, 2015 Emerging Infect. Dis. 22 (5), 933-935 (2016) https://www.ncbi.nlm.nih.gov/pubmed/27088323
COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574562 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/
Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al https://www.ncbi.nlm.nih.gov/nuccore/KY317939 Multiplex PCR method for MinION and Illumina sequencing of Zika and other virus genomes directly from clinical samples Nat Protoc 12 (6), 1261-1276 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538739
ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American Samoa American Samoa genbank genome Wu et al https://www.ncbi.nlm.nih.gov/nuccore/KX253996 Direct Submission Submitted (18-MAY-2016) Center for Diseases Control and Prevention of Guangdong Province; National Institute of Viral Disease Control and Prevention, China https://www.ncbi.nlm.nih.gov/pubmed/
VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al https://www.ncbi.nlm.nih.gov/nuccore/KX702400 Complete Genome Sequences of Identical Zika virus Isolates in a Nursing Mother and Her Infant Genome Announc 5 (17), e00231-17 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28450510
DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785425 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785433 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785420 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al https://www.ncbi.nlm.nih.gov/nuccore/KX879603 First Complete Genome Sequences of Zika Virus Isolated from Febrile Patient Sera in Ecuador Genome Announc 5 (8), e01673-16 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28232448
HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785418 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734
95 changes: 95 additions & 0 deletions tests/functional/traits/traits_region.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
{
"generated_by": {
"program": "augur",
"version": "14.1.0"
},
"models": {
"region": {
"alphabet": [
"North America",
"Oceania",
"South America",
"?"
],
"equilibrium_probabilities": [
0.31005064170488,
0.28662652026143665,
0.40332283803368346
],
"rate": 218.0279017803387,
"transition_matrix": [
[
0.0,
1.0971786118353992,
2.204495361871908
],
[
1.0971786118353992,
0.0,
1.0970462842057542
],
[
2.204495361871908,
1.0970462842057542,
0.0
]
]
}
},
"nodes": {
"BRA/2016/FC_6706": {
"region": "South America"
},
"COL/FLR_00008/2015": {
"region": "South America"
},
"Colombia/2016/ZC204Se": {
"region": "South America"
},
"DOM/2016/BB_0183": {
"region": "North America"
},
"EcEs062_16": {
"region": "South America"
},
"HND/2016/HU_ME59": {
"region": "North America"
},
"NODE_0000001": {
"region": "South America"
},
"NODE_0000002": {
"region": "North America"
},
"NODE_0000003": {
"region": "North America"
},
"NODE_0000004": {
"region": "North America"
},
"NODE_0000005": {
"region": "South America"
},
"NODE_0000006": {
"region": "South America"
},
"NODE_0000007": {
"region": "South America"
},
"NODE_0000008": {
"region": "South America"
},
"PAN/CDC_259359_V1_V3/2015": {
"region": "North America"
},
"PRVABC59": {
"region": "North America"
},
"VEN/UF_1/2016": {
"region": "South America"
},
"ZKC2/2016": {
"region": "Oceania"
}
}
}
63 changes: 63 additions & 0 deletions tests/functional/traits/traits_virus.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
{
"generated_by": {
"program": "augur",
"version": "14.1.0"
},
"models": {},
"nodes": {
"BRA/2016/FC_6706": {
"virus": "zika"
},
"COL/FLR_00008/2015": {
"virus": "zika"
},
"Colombia/2016/ZC204Se": {
"virus": "zika"
},
"DOM/2016/BB_0183": {
"virus": "zika"
},
"EcEs062_16": {
"virus": "zika"
},
"HND/2016/HU_ME59": {
"virus": "zika"
},
"NODE_0000001": {
"virus": "zika"
},
"NODE_0000002": {
"virus": "zika"
},
"NODE_0000003": {
"virus": "zika"
},
"NODE_0000004": {
"virus": "zika"
},
"NODE_0000005": {
"virus": "zika"
},
"NODE_0000006": {
"virus": "zika"
},
"NODE_0000007": {
"virus": "zika"
},
"NODE_0000008": {
"virus": "zika"
},
"PAN/CDC_259359_V1_V3/2015": {
"virus": "zika"
},
"PRVABC59": {
"virus": "zika"
},
"VEN/UF_1/2016": {
"virus": "zika"
},
"ZKC2/2016": {
"virus": "zika"
}
}
}
1 change: 1 addition & 0 deletions tests/functional/traits/tree.nwk
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
((Colombia/2016/ZC204Se:0.00105368,(PAN/CDC_259359_V1_V3/2015:0.00076051,(COL/FLR_00008/2015:0.00044440,VEN/UF_1/2016:0.00089377)NODE_0000008:0.00038502)NODE_0000007:0.00019253)NODE_0000001:0.00080159,(BRA/2016/FC_6706:0.00214920,(ZKC2/2016:0.00173693,(HND/2016/HU_ME59:0.00206150,PRVABC59:0.00135309)NODE_0000004:0.00013537,(EcEs062_16:0.00175918,DOM/2016/BB_0183:0.00184905)NODE_0000002:0.00021565)NODE_0000003:0.00013737)NODE_0000005:0.00019772)NODE_0000006:0.00100000;

0 comments on commit 002c44f

Please sign in to comment.