Skip to content

Commit 8386079

Browse files
Phylo export methods cleanup
- Remove TreeSequence.to_nexus and add TreeSequence.as_nexus - Add Tree.as_newick and refactor - Update C newick generation to cover new defaults - Refactor newick testing code paths Closes #1803 Closes #1815 Closes #1671
1 parent d47f97c commit 8386079

File tree

12 files changed

+1249
-573
lines changed

12 files changed

+1249
-573
lines changed

c/tests/test_convert.c

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,20 +45,31 @@ test_single_tree_newick(void)
4545
ret = tsk_tree_first(&t);
4646
CU_ASSERT_EQUAL_FATAL(ret, 1)
4747

48-
ret = tsk_convert_newick(&t, 0, 0, 0, buffer_size, newick);
48+
ret = tsk_convert_newick(&t, 0, 0, TSK_NEWICK_LEGACY_MS_LABELS, buffer_size, newick);
4949
CU_ASSERT_EQUAL_FATAL(ret, 0);
5050
/* Seems odd, but this is what a single node newick tree looks like.
5151
* Newick parsers seems to accept it in any case */
5252
CU_ASSERT_STRING_EQUAL(newick, "1;");
5353

54-
ret = tsk_convert_newick(&t, 4, 0, 0, buffer_size, newick);
54+
ret = tsk_convert_newick(&t, 0, 0, 0, buffer_size, newick);
55+
CU_ASSERT_EQUAL_FATAL(ret, 0);
56+
CU_ASSERT_STRING_EQUAL(newick, "n0;");
57+
58+
ret = tsk_convert_newick(&t, 4, 0, TSK_NEWICK_LEGACY_MS_LABELS, buffer_size, newick);
5559
CU_ASSERT_EQUAL_FATAL(ret, 0);
5660
CU_ASSERT_STRING_EQUAL(newick, "(1:1,2:1);");
61+
ret = tsk_convert_newick(&t, 4, 0, 0, buffer_size, newick);
62+
CU_ASSERT_EQUAL_FATAL(ret, 0);
63+
CU_ASSERT_STRING_EQUAL(newick, "(n0:1,n1:1);");
5764

58-
ret = tsk_convert_newick(&t, 6, 0, 0, buffer_size, newick);
65+
ret = tsk_convert_newick(&t, 6, 0, TSK_NEWICK_LEGACY_MS_LABELS, buffer_size, newick);
5966
CU_ASSERT_EQUAL_FATAL(ret, 0);
6067
CU_ASSERT_STRING_EQUAL(newick, "((1:1,2:1):2,(3:2,4:2):1);");
6168

69+
ret = tsk_convert_newick(&t, 6, 0, 0, buffer_size, newick);
70+
CU_ASSERT_EQUAL_FATAL(ret, 0);
71+
CU_ASSERT_STRING_EQUAL(newick, "((n0:1,n1:1):2,(n2:2,n3:2):1);");
72+
6273
tsk_tree_free(&t);
6374
tsk_treeseq_free(&ts);
6475
}
@@ -95,7 +106,8 @@ test_single_tree_newick_errors(void)
95106
ret = tsk_convert_newick(&t, 6, 0, 0, j, newick);
96107
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BUFFER_OVERFLOW);
97108
}
98-
ret = tsk_convert_newick(&t, 6, 0, 0, len, newick);
109+
ret = tsk_convert_newick(&t, 6, 0, TSK_NEWICK_LEGACY_MS_LABELS, len, newick);
110+
99111
CU_ASSERT_EQUAL_FATAL(ret, 0);
100112
CU_ASSERT_STRING_EQUAL(newick, "((1:1,2:1):2,(3:2,4:2):1);");
101113

c/tskit/convert.c

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,15 @@ tsk_newick_converter_run(
5757
const tsk_tree_t *tree = self->tree;
5858
tsk_id_t *stack = self->traversal_stack;
5959
const double *time = self->tree->tree_sequence->tables->nodes.time;
60+
const tsk_flags_t *flags = self->tree->tree_sequence->tables->nodes.flags;
6061
int stack_top = 0;
6162
int label;
6263
size_t s = 0;
6364
int r;
6465
tsk_id_t u, v, w, root_parent;
6566
double branch_length;
67+
bool ms_labels = self->options & TSK_NEWICK_LEGACY_MS_LABELS;
68+
const char *label_format = ms_labels ? "%d" : "n%d";
6669

6770
if (root < 0 || root >= (tsk_id_t) self->tree->num_nodes) {
6871
ret = TSK_ERR_NODE_OUT_OF_BOUNDS;
@@ -91,15 +94,20 @@ tsk_newick_converter_run(
9194
} else {
9295
u = tree->parent[v];
9396
stack_top--;
94-
if (tree->left_child[v] == TSK_NULL) {
97+
label = -1;
98+
if (ms_labels) {
99+
if (tree->left_child[v] == TSK_NULL) {
100+
label = (int) v + 1;
101+
}
102+
} else if (flags[v] & TSK_NODE_IS_SAMPLE) {
103+
label = (int) v;
104+
}
105+
if (label != -1) {
95106
if (s >= buffer_size) {
96107
ret = TSK_ERR_BUFFER_OVERFLOW;
97108
goto out;
98109
}
99-
/* We do this for ms-compatability. This should be a configurable option
100-
* via the flags attribute */
101-
label = (int) v + 1;
102-
r = snprintf(buffer + s, buffer_size - s, "%d", label);
110+
r = snprintf(buffer + s, buffer_size - s, label_format, label);
103111
if (r < 0) {
104112
ret = TSK_ERR_IO;
105113
goto out;

c/tskit/convert.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ extern "C" {
3232

3333
#include <tskit/trees.h>
3434

35+
#define TSK_NEWICK_LEGACY_MS_LABELS (1 << 0)
36+
3537
int tsk_convert_newick(const tsk_tree_t *tree, tsk_id_t root, unsigned int precision,
3638
tsk_flags_t options, size_t buffer_size, char *buffer);
3739

python/CHANGELOG.rst

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
--------------------
2-
[0.3.8] - 2021-0X-XX
2+
[0.4.0] - 2021-1X-XX
33
--------------------
44

55
**Breaking changes**
@@ -37,10 +37,12 @@
3737
- For ``TreeSequence.samples`` all arguments after ``population`` are now keyword only
3838
(:user:`benjeffery`, :issue:`1715`, :pr:`1831`).
3939

40-
- Fix bugs in the format produced by ``TreeSequence.to_nexus`` to make
41-
it standards-compliant. (:user:`jeetsukumaran`, :user:`jeromekelleher`,
42-
:issue:`1785`, :pr:`1835`, :pr:`1836`)
43-
[FIXME MORE UPDATES HERE AS WE CHANGE THE LABELS ETC]
40+
- Remove the method ``TreeSequence.to_nexus`` and replace with ``TreeSequence.as_nexus``.
41+
As the old method was not generating standards-compliant output, it seems unlikely
42+
that it was used by anyone. Calls to ``to_nexus`` will result in a
43+
NotImplementedError, informing users of the change. See below for details on
44+
``as_nexus``.
45+
4446

4547
**Features**
4648

@@ -84,6 +86,17 @@
8486

8587
- `dump_tables` omitted individual parents. (:user:`benjeffery`, :issue:`1828`, :pr:`1884`)
8688

89+
- Add the ``Tree.as_newick`` method and deprecate ``Tree.newick``. The
90+
``as_newick`` method by default labels samples with the pattern ``"n{node_id}"``
91+
which is much more useful that the behaviour of ``Tree.newick`` (which mimics
92+
``ms`` output). (:user:`jeromekelleher`, :issue:`1671`, :pr:`1838`.)
93+
94+
- Add the ``as_nexus`` and ``write_nexus`` methods to the TreeSequence class,
95+
replacing the broken ``to_nexus`` method (see above). This uses the same
96+
sample labelling pattern as ``as_newick``.
97+
(:user:`jeetsukumaran`, :user:`jeromekelleher`, :issue:`1785`, :pr:`1835`,
98+
:pr:`1836`, :pr:`1838`)
99+
87100
--------------------
88101
[0.3.7] - 2021-07-08
89102
--------------------

python/_tskitmodule.c

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9954,24 +9954,27 @@ static PyObject *
99549954
Tree_get_newick(Tree *self, PyObject *args, PyObject *kwds)
99559955
{
99569956
PyObject *ret = NULL;
9957-
static char *kwlist[] = { "root", "precision", "buffer_size", NULL };
9957+
static char *kwlist[]
9958+
= { "root", "precision", "buffer_size", "legacy_ms_labels", NULL };
99589959
int precision = 14;
99599960
/* We have a default bufsize for convenience, but the high-level code
99609961
* should set this by computing an upper bound. */
99619962
Py_ssize_t buffer_size = 1024;
99629963
int root, err;
99639964
char *buffer = NULL;
9965+
int legacy_ms_labels = false;
9966+
tsk_flags_t options = 0;
99649967

99659968
if (Tree_check_state(self) != 0) {
99669969
goto out;
99679970
}
9968-
if (!PyArg_ParseTupleAndKeywords(
9969-
args, kwds, "i|in", kwlist, &root, &precision, &buffer_size)) {
9971+
if (!PyArg_ParseTupleAndKeywords(args, kwds, "i|ini", kwlist, &root, &precision,
9972+
&buffer_size, &legacy_ms_labels)) {
99709973
goto out;
99719974
}
9972-
if (precision < 0 || precision > 16) {
9975+
if (precision < 0 || precision > 17) {
99739976
PyErr_SetString(
9974-
PyExc_ValueError, "Precision must be between 0 and 16, inclusive");
9977+
PyExc_ValueError, "Precision must be between 0 and 17, inclusive");
99759978
goto out;
99769979
}
99779980
if (buffer_size <= 0) {
@@ -9982,13 +9985,16 @@ Tree_get_newick(Tree *self, PyObject *args, PyObject *kwds)
99829985
if (buffer == NULL) {
99839986
PyErr_NoMemory();
99849987
}
9988+
if (legacy_ms_labels) {
9989+
options |= TSK_NEWICK_LEGACY_MS_LABELS;
9990+
}
99859991
err = tsk_convert_newick(
9986-
self->tree, (tsk_id_t) root, precision, 0, (size_t) buffer_size, buffer);
9992+
self->tree, (tsk_id_t) root, precision, options, (size_t) buffer_size, buffer);
99879993
if (err != 0) {
99889994
handle_library_error(err);
99899995
goto out;
99909996
}
9991-
ret = PyBytes_FromString(buffer);
9997+
ret = PyUnicode_FromString(buffer);
99929998
out:
99939999
if (buffer != NULL) {
999410000
PyMem_Free(buffer);

python/tests/__init__.py

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -118,26 +118,6 @@ def __eq__(self, other):
118118
def __ne__(self, other):
119119
return not self.__eq__(other)
120120

121-
def newick(self, root=None, precision=16, node_labels=None):
122-
if node_labels is None:
123-
node_labels = {u: str(u + 1) for u in self.tree.leaves()}
124-
if root is None:
125-
root = self.left_root
126-
return self._build_newick(root, precision, node_labels) + ";"
127-
128-
def _build_newick(self, node, precision, node_labels):
129-
label = node_labels.get(node, "")
130-
if self.left_child[node] == tskit.NULL:
131-
s = label
132-
else:
133-
s = "("
134-
for child in self.children(node):
135-
branch_length = self.tree.branch_length(child)
136-
subtree = self._build_newick(child, precision, node_labels)
137-
s += subtree + ":{0:.{1}f},".format(branch_length, precision)
138-
s = s[:-1] + label + ")"
139-
return s
140-
141121

142122
class PythonTreeSequence:
143123
"""

python/tests/test_highlevel.py

Lines changed: 2 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -1951,6 +1951,8 @@ def test_removed_methods(self):
19511951
ts.diffs()
19521952
with pytest.raises(NotImplementedError):
19531953
ts.newick_trees()
1954+
with pytest.raises(NotImplementedError):
1955+
ts.to_nexus()
19541956

19551957
def test_dump_pathlib(self, ts_fixture, tmp_path):
19561958
path = tmp_path / "tmp.trees"
@@ -2955,121 +2957,6 @@ def test_root_properties(self):
29552957
assert tree.root == tskit.NULL
29562958
assert len(tested) == 3
29572959

2958-
def verify_newick(self, tree):
2959-
"""
2960-
Verifies that we output the newick tree as expected.
2961-
"""
2962-
# TODO to make this work we may need to clamp the precision of node
2963-
# times because Python and C float printing algorithms work slightly
2964-
# differently. Seems to work OK now, so leaving alone.
2965-
if tree.has_single_root:
2966-
py_tree = tests.PythonTree.from_tree(tree)
2967-
newick1 = tree.newick(precision=16)
2968-
newick2 = py_tree.newick()
2969-
assert newick1 == newick2
2970-
2971-
# Make sure we get the same results for a leaf root.
2972-
newick1 = tree.newick(root=0, precision=16)
2973-
newick2 = py_tree.newick(root=0)
2974-
assert newick1 == newick2
2975-
2976-
# When we specify the node_labels we should get precisely the
2977-
# same result as we are using Python code now.
2978-
for precision in [0, 3, 19]:
2979-
newick1 = tree.newick(precision=precision, node_labels={})
2980-
newick2 = py_tree.newick(precision=precision, node_labels={})
2981-
assert newick1 == newick2
2982-
else:
2983-
with pytest.raises(ValueError):
2984-
tree.newick()
2985-
for root in tree.roots:
2986-
py_tree = tests.PythonTree.from_tree(tree)
2987-
newick1 = tree.newick(precision=16, root=root)
2988-
newick2 = py_tree.newick(root=root)
2989-
assert newick1 == newick2
2990-
2991-
def test_newick(self):
2992-
for ts in get_example_tree_sequences():
2993-
for tree in ts.trees():
2994-
self.verify_newick(tree)
2995-
2996-
def test_newick_large_times(self):
2997-
for n in [2, 10, 20, 100]:
2998-
ts = msprime.simulate(n, Ne=100e6, random_seed=1)
2999-
tree = ts.first()
3000-
for precision in [0, 1, 16]:
3001-
newick_py = tree.newick(
3002-
node_labels={u: str(u + 1) for u in ts.samples()},
3003-
precision=precision,
3004-
)
3005-
newick_c = tree.newick(precision=precision)
3006-
assert newick_c == newick_py
3007-
3008-
def test_bifurcating_newick(self):
3009-
for n_tips in range(2, 6):
3010-
ts = msprime.simulate(n_tips, random_seed=1) # msprime trees are binary
3011-
for tree in ts.trees():
3012-
base_newick = tree.newick(include_branch_lengths=False).strip(";")
3013-
for i in range(n_tips):
3014-
# Each tip number (i+1) mentioned once
3015-
assert base_newick.count(str(i + 1)) == 1
3016-
# Binary newick trees have 3 chars per extra tip: "(,)"
3017-
assert len(base_newick) == n_tips + 3 * (n_tips - 1)
3018-
3019-
def test_newick_topology_equiv(self):
3020-
replace_numeric = {ord(x): None for x in "1234567890:."}
3021-
for ts in get_example_tree_sequences():
3022-
for tree in ts.trees():
3023-
if not tree.has_single_root:
3024-
with pytest.raises(ValueError) as plain_newick_err:
3025-
tree.newick(node_labels={}, include_branch_lengths=False)
3026-
with pytest.raises(ValueError) as newick1_err:
3027-
tree.newick()
3028-
with pytest.raises(ValueError) as newick2_err:
3029-
tree.newick(node_labels={})
3030-
assert str(newick1_err) == str(newick2_err)
3031-
assert str(newick1_err) == str(plain_newick_err)
3032-
else:
3033-
plain_newick = tree.newick(
3034-
node_labels={}, include_branch_lengths=False
3035-
)
3036-
newick1 = tree.newick().translate(replace_numeric)
3037-
newick2 = tree.newick(node_labels={}).translate(replace_numeric)
3038-
assert newick1 == newick2
3039-
assert newick2 == plain_newick
3040-
3041-
def test_newick_buffer_too_small_bug(self):
3042-
nodes = io.StringIO(
3043-
"""\
3044-
id is_sample population individual time
3045-
0 1 0 -1 0.00000000000000
3046-
1 1 0 -1 0.00000000000000
3047-
2 1 0 -1 0.00000000000000
3048-
3 1 0 -1 0.00000000000000
3049-
4 0 0 -1 0.21204940078588
3050-
5 0 0 -1 0.38445004304611
3051-
6 0 0 -1 0.83130278081275
3052-
"""
3053-
)
3054-
edges = io.StringIO(
3055-
"""\
3056-
id left right parent child
3057-
0 0.00000000 1.00000000 4 0
3058-
1 0.00000000 1.00000000 4 2
3059-
2 0.00000000 1.00000000 5 1
3060-
3 0.00000000 1.00000000 5 3
3061-
4 0.00000000 1.00000000 6 4
3062-
5 0.00000000 1.00000000 6 5
3063-
"""
3064-
)
3065-
ts = tskit.load_text(nodes, edges, sequence_length=1, strict=False)
3066-
tree = ts.first()
3067-
for precision in range(17):
3068-
newick_c = tree.newick(precision=precision)
3069-
node_labels = {u: str(u + 1) for u in ts.samples()}
3070-
newick_py = tree.newick(precision=precision, node_labels=node_labels)
3071-
assert newick_c == newick_py
3072-
30732960
def test_as_dict_of_dicts(self):
30742961
for ts in get_example_tree_sequences():
30752962
tree = next(ts.trees())

0 commit comments

Comments
 (0)