Skip to content

Commit 4ab22b4

Browse files
authored
Merge pull request #146 from kayjan/enhance-newick
Enhance newick
2 parents 8b15c40 + 2635e49 commit 4ab22b4

File tree

3 files changed

+161
-12
lines changed

3 files changed

+161
-12
lines changed

bigtree/tree/export.py

Lines changed: 71 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1159,15 +1159,35 @@ def get_attr(
11591159
)
11601160

11611161

1162-
def tree_to_newick(tree: T) -> str:
1163-
"""Export tree to Newick notation.
1162+
def tree_to_newick(
1163+
tree: T,
1164+
intermediate_node_name: bool = True,
1165+
length_attr: str = "",
1166+
length_sep: str = ":",
1167+
attr_list: Iterable[str] = [],
1168+
attr_prefix: str = "&&NHX:",
1169+
attr_sep: str = ":",
1170+
) -> str:
1171+
"""Export tree to Newick notation. Useful for describing phylogenetic tree.
1172+
1173+
In the Newick Notation (or New Hampshire Notation)
1174+
- Tree is represented in round brackets i.e., `(child1,child2,child3)parent`
1175+
- If there are nested trees, they will be in nested round brackets, i.e., `((grandchild1)child1,(grandchild2,grandchild3)child2)parent`
1176+
- If there is a length attribute, it will be beside the name, i.e., `(child1:0.5,child2:0.1)parent`
1177+
- If there are other attributes, they are represented as `key=value` pairs in square brackets after the attribute prefix, i.e., `(child1:0.5[&&NHX:S=human],child2:0.1[&&NHX:S=human])parent[&&NHX:S=parent]`
1178+
1179+
Customizations include
1180+
- Omitting names of root and intermediate nodes; by default, all node names are shown
1181+
- Changing length separator to other symbol, default is `:`
1182+
- Adding an attribute prefix, default is `&&NHX:`
1183+
- Changing the attribute separator to other symbol, default is `:`
11641184
11651185
>>> from bigtree import Node, tree_to_newick
1166-
>>> root = Node("a", age=90)
1167-
>>> b = Node("b", age=65, parent=root)
1168-
>>> c = Node("c", age=60, parent=root)
1169-
>>> d = Node("d", age=40, parent=b)
1170-
>>> e = Node("e", age=35, parent=b)
1186+
>>> root = Node("a", species="human")
1187+
>>> b = Node("b", age=65, species="human", parent=root)
1188+
>>> c = Node("c", age=60, species="human", parent=root)
1189+
>>> d = Node("d", age=40, species="human", parent=b)
1190+
>>> e = Node("e", age=35, species="human", parent=b)
11711191
>>> root.show()
11721192
a
11731193
├── b
@@ -1178,15 +1198,56 @@ def tree_to_newick(tree: T) -> str:
11781198
>>> tree_to_newick(root)
11791199
'((d,e)b,c)a'
11801200
1201+
>>> tree_to_newick(root, length_attr="age")
1202+
'((d:40,e:35)b:65,c:60)a'
1203+
1204+
>>> tree_to_newick(root, length_attr="age", attr_list=["species"])
1205+
'((d:40[&&NHX:species=human],e:35[&&NHX:species=human])b:65[&&NHX:species=human],c:60[&&NHX:species=human])a[&&NHX:species=human]'
1206+
11811207
Args:
11821208
tree (Node): tree to be exported
1209+
intermediate_node_name (bool): indicator if root and intermediate nodes have node names; if False, only leaf node names are shown, defaults to True
1210+
length_attr (str): node attribute to use as length, placed after the node name and outside the square brackets for every non-root node, optional
1211+
length_sep (str): separator between node name and length, used if length_attr is non-empty, defaults to ":"
1212+
attr_list (Iterable[str]): list of node attributes to extract into square bracket, optional
1213+
attr_prefix (str): prefix before all attributes, within square bracket, used if attr_list is non-empty, defaults to "&&NHX:"
1214+
attr_sep (str): separator between attributes, within square brackets, used if attr_list is non-empty, defaults to ":"
11831215
11841216
Returns:
11851217
(str)
11861218
"""
11871219
if not tree:
11881220
return ""
1221+
1222+
node_name_str = ""
1223+
if (intermediate_node_name) or (not intermediate_node_name and tree.is_leaf):
1224+
node_name_str = tree.node_name
1225+
if length_attr and not tree.is_root:
1226+
if not tree.get_attr(length_attr):
1227+
raise ValueError(f"Length attribute does not exist for node {tree}")
1228+
node_name_str += f"{length_sep}{tree.get_attr(length_attr)}"
1229+
1230+
attr_str = ""
1231+
if attr_list:
1232+
attr_str = attr_sep.join(
1233+
[f"{k}={tree.get_attr(k)}" for k in attr_list if tree.get_attr(k)]
1234+
)
1235+
if attr_str:
1236+
attr_str = f"[{attr_prefix}{attr_str}]"
1237+
11891238
if tree.is_leaf:
1190-
return tree.node_name
1191-
children_newick = ",".join(tree_to_newick(child) for child in tree.children)
1192-
return f"({children_newick}){tree.node_name}"
1239+
return f"{node_name_str}{attr_str}"
1240+
1241+
children_newick = ",".join(
1242+
tree_to_newick(
1243+
child,
1244+
intermediate_node_name=intermediate_node_name,
1245+
length_attr=length_attr,
1246+
length_sep=length_sep,
1247+
attr_list=attr_list,
1248+
attr_prefix=attr_prefix,
1249+
attr_sep=attr_sep,
1250+
)
1251+
for child in tree.children
1252+
)
1253+
return f"({children_newick}){node_name_str}{attr_str}"

docs/source/bigtree/tree/export.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,11 @@ While exporting to another data type, methods can take in arguments to determine
4949
- No
5050
- Tree style
5151
* - `tree_to_newick`
52+
- Yes with `attr_list`
5253
- No
5354
- No
5455
- No
55-
- No
56-
- N/A
56+
- Length separator and attribute prefix and separator
5757
* - `tree_to_dict`
5858
- Yes with `attr_dict` or `all_attrs`
5959
- Yes with `max_depth`

tests/tree/test_export.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import pandas as pd
22
import pytest
33

4+
from bigtree.node.node import Node
45
from bigtree.tree.construct import dataframe_to_tree, dict_to_tree, nested_dict_to_tree
56
from bigtree.tree.export import (
67
print_tree,
@@ -1385,3 +1386,90 @@ def test_tree_to_newick(tree_node):
13851386
newick_str = tree_to_newick(tree_node)
13861387
expected_str = """((d,(g,h)e)b,(f)c)a"""
13871388
assert newick_str == expected_str
1389+
1390+
@staticmethod
1391+
def test_tree_to_newick_length(tree_node):
1392+
newick_str = tree_to_newick(tree_node, length_attr="age")
1393+
expected_str = """((d:40,(g:10,h:6)e:35)b:65,(f:38)c:60)a"""
1394+
assert newick_str == expected_str
1395+
1396+
@staticmethod
1397+
def test_tree_to_newick_length_invalid_error(tree_node):
1398+
with pytest.raises(ValueError) as exc_info:
1399+
tree_to_newick(tree_node, length_attr="age2")
1400+
assert str(exc_info.value).startswith(
1401+
"Length attribute does not exist for node "
1402+
)
1403+
1404+
@staticmethod
1405+
def test_tree_to_newick_length_sep(tree_node):
1406+
newick_str = tree_to_newick(tree_node, length_attr="age", length_sep=";")
1407+
expected_str = """((d;40,(g;10,h;6)e;35)b;65,(f;38)c;60)a"""
1408+
assert newick_str == expected_str
1409+
1410+
@staticmethod
1411+
def test_tree_to_newick_attr_list(tree_node):
1412+
newick_str = tree_to_newick(tree_node, attr_list=["age"])
1413+
expected_str = """((d[&&NHX:age=40],(g[&&NHX:age=10],h[&&NHX:age=6])e[&&NHX:age=35])b[&&NHX:age=65],(f[&&NHX:age=38])c[&&NHX:age=60])a[&&NHX:age=90]"""
1414+
assert newick_str == expected_str
1415+
1416+
@staticmethod
1417+
def test_tree_to_newick_attr_list_invalid(tree_node):
1418+
newick_str = tree_to_newick(tree_node, attr_list=["age2"])
1419+
expected_str = """((d,(g,h)e)b,(f)c)a"""
1420+
assert newick_str == expected_str
1421+
1422+
@staticmethod
1423+
def test_tree_to_newick_attr_prefix(tree_node):
1424+
newick_str = tree_to_newick(tree_node, attr_list=["age"], attr_prefix="")
1425+
expected_str = """((d[age=40],(g[age=10],h[age=6])e[age=35])b[age=65],(f[age=38])c[age=60])a[age=90]"""
1426+
assert newick_str == expected_str
1427+
1428+
@staticmethod
1429+
def test_tree_to_newick_intermediate_node_name(tree_node):
1430+
newick_str = tree_to_newick(
1431+
tree_node, intermediate_node_name=False, attr_list=["age"]
1432+
)
1433+
expected_str = """((d[&&NHX:age=40],(g[&&NHX:age=10],h[&&NHX:age=6])[&&NHX:age=35])[&&NHX:age=65],(f[&&NHX:age=38])[&&NHX:age=60])[&&NHX:age=90]"""
1434+
assert newick_str == expected_str
1435+
1436+
@staticmethod
1437+
def test_tree_to_newick_phylogenetic():
1438+
"""
1439+
Example taken from: https://www.cs.mcgill.ca/~birch/doc/forester/NHX.pdf
1440+
"""
1441+
root = Node("placeholder_root", E="1.1.1.1", D="N")
1442+
metazoa = Node(
1443+
"placeholder_metazoa",
1444+
length=0.1,
1445+
S="Metazoa",
1446+
E="1.1.1.1",
1447+
D="N",
1448+
parent=root,
1449+
)
1450+
primates = Node(
1451+
"placeholder_primates",
1452+
length=0.05,
1453+
S="Primates",
1454+
E="1.1.1.1",
1455+
D="Y",
1456+
B="100",
1457+
parent=metazoa,
1458+
)
1459+
_ = Node("ADH2", length=0.1, S="human", E="1.1.1.1", parent=primates)
1460+
_ = Node("ADH1", length=0.11, S="human", E="1.1.1.1", parent=primates)
1461+
_ = Node("ADHY", length=0.1, S="nematode", E="1.1.1.1", parent=metazoa)
1462+
_ = Node("ADHX", length=0.12, S="insect", E="1.1.1.1", parent=metazoa)
1463+
fungi = Node("placeholder_fungi", length=0.1, S="Fungi", parent=root)
1464+
_ = Node("ADH4", length=0.09, S="yeast", E="1.1.1.1", parent=fungi)
1465+
_ = Node("ADH3", length=0.13, S="yeast", E="1.1.1.1", parent=fungi)
1466+
_ = Node("ADH2", length=0.12, S="yeast", E="1.1.1.1", parent=fungi)
1467+
_ = Node("ADH1", length=0.11, S="yeast", E="1.1.1.1", parent=fungi)
1468+
newick_str = tree_to_newick(
1469+
root,
1470+
intermediate_node_name=False,
1471+
length_attr="length",
1472+
attr_list=["S", "E", "D", "B"],
1473+
)
1474+
expected_str = """(((ADH2:0.1[&&NHX:S=human:E=1.1.1.1],ADH1:0.11[&&NHX:S=human:E=1.1.1.1]):0.05[&&NHX:S=Primates:E=1.1.1.1:D=Y:B=100],ADHY:0.1[&&NHX:S=nematode:E=1.1.1.1],ADHX:0.12[&&NHX:S=insect:E=1.1.1.1]):0.1[&&NHX:S=Metazoa:E=1.1.1.1:D=N],(ADH4:0.09[&&NHX:S=yeast:E=1.1.1.1],ADH3:0.13[&&NHX:S=yeast:E=1.1.1.1],ADH2:0.12[&&NHX:S=yeast:E=1.1.1.1],ADH1:0.11[&&NHX:S=yeast:E=1.1.1.1]):0.1[&&NHX:S=Fungi])[&&NHX:E=1.1.1.1:D=N]"""
1475+
assert newick_str == expected_str

0 commit comments

Comments
 (0)