Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 71 additions & 10 deletions bigtree/tree/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -1159,15 +1159,35 @@ def get_attr(
)


def tree_to_newick(
    tree: T,
    intermediate_node_name: bool = True,
    length_attr: str = "",
    length_sep: str = ":",
    attr_list: Iterable[str] = (),
    attr_prefix: str = "&&NHX:",
    attr_sep: str = ":",
) -> str:
    """Export tree to Newick notation. Useful for describing phylogenetic trees.

    In the Newick Notation (or New Hampshire Notation)
        - Tree is represented in round brackets i.e., `(child1,child2,child3)parent`
        - If there are nested trees, they will be in nested round brackets i.e., `((grandchild1)child1,(grandchild2,grandchild3)child2)parent`
        - If there is a length attribute, it will be beside the name i.e., `(child1:0.5,child2:0.1)parent`
        - If there are other attributes, they are represented in square brackets i.e., `(child1:0.5[&&NHX:S=human],child2:0.1[&&NHX:S=human])parent[&&NHX:S=parent]`

    Customizations include
        - Omitting names of root and intermediate nodes, default all node names are shown
        - Changing length separator to other symbol, default is `:`
        - Adding an attribute prefix, default is `&&NHX:`
        - Changing the attribute separator to other symbol, default is `:`

    >>> from bigtree import Node, tree_to_newick
    >>> root = Node("a", species="human")
    >>> b = Node("b", age=65, species="human", parent=root)
    >>> c = Node("c", age=60, species="human", parent=root)
    >>> d = Node("d", age=40, species="human", parent=b)
    >>> e = Node("e", age=35, species="human", parent=b)
    >>> root.show()
    a
    ├── b
    │   ├── d
    │   └── e
    └── c

    >>> tree_to_newick(root)
    '((d,e)b,c)a'

    >>> tree_to_newick(root, length_attr="age")
    '((d:40,e:35)b:65,c:60)a'

    >>> tree_to_newick(root, length_attr="age", attr_list=["species"])
    '((d:40[&&NHX:species=human],e:35[&&NHX:species=human])b:65[&&NHX:species=human],c:60[&&NHX:species=human])a[&&NHX:species=human]'

    Args:
        tree (Node): tree to be exported
        intermediate_node_name (bool): indicator if intermediate nodes have node names, defaults to True
        length_attr (str): node attribute to extract into outside of bracket, optional
        length_sep (str): separator between node name and length, used if length_attr is non-empty, defaults to ":"
        attr_list (Iterable[str]): list of node attributes to extract into square bracket, optional
        attr_prefix (str): prefix before all attributes, within square bracket, used if attr_list is non-empty, defaults to "&&NHX:"
        attr_sep (str): separator between attributes, within square brackets, used if attr_list is non-empty, defaults to ":"

    Raises:
        ValueError: if `length_attr` is given and a non-root node does not have that attribute

    Returns:
        (str)
    """
    if not tree:
        return ""

    # Materialise once: a one-shot iterable (e.g. a generator) would otherwise be
    # exhausted by the first node and every descendant would lose its attributes.
    attr_names = tuple(attr_list)

    # Leaves always carry their name; root/intermediate names are optional.
    node_name_str = ""
    if intermediate_node_name or tree.is_leaf:
        node_name_str = tree.node_name

    # The root has no incoming branch, so it never gets a length.
    if length_attr and not tree.is_root:
        length_value = tree.get_attr(length_attr)
        # Compare against None rather than truthiness so that a legitimate
        # zero-length branch is exported instead of being reported as missing.
        # NOTE(review): assumes get_attr returns None for a missing attribute - confirm.
        if length_value is None:
            raise ValueError(f"Length attribute does not exist for node {tree}")
        node_name_str += f"{length_sep}{length_value}"

    # Only attributes that exist on this node are emitted; falsy values such as 0
    # are kept because they are real data.
    attr_items = []
    for attr_name in attr_names:
        attr_value = tree.get_attr(attr_name)
        if attr_value is not None:
            attr_items.append(f"{attr_name}={attr_value}")
    attr_str = f"[{attr_prefix}{attr_sep.join(attr_items)}]" if attr_items else ""

    if tree.is_leaf:
        return f"{node_name_str}{attr_str}"

    children_newick = ",".join(
        tree_to_newick(
            child,
            intermediate_node_name=intermediate_node_name,
            length_attr=length_attr,
            length_sep=length_sep,
            attr_list=attr_names,
            attr_prefix=attr_prefix,
            attr_sep=attr_sep,
        )
        for child in tree.children
    )
    return f"({children_newick}){node_name_str}{attr_str}"
4 changes: 2 additions & 2 deletions docs/source/bigtree/tree/export.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,11 @@ While exporting to another data type, methods can take in arguments to determine
- No
- Tree style
* - `tree_to_newick`
- Yes with `attr_list`
- No
- No
- No
- No
- N/A
- Length separator and attribute prefix and separator
* - `tree_to_dict`
- Yes with `attr_dict` or `all_attrs`
- Yes with `max_depth`
Expand Down
88 changes: 88 additions & 0 deletions tests/tree/test_export.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd
import pytest

from bigtree.node.node import Node
from bigtree.tree.construct import dataframe_to_tree, dict_to_tree, nested_dict_to_tree
from bigtree.tree.export import (
print_tree,
Expand Down Expand Up @@ -1385,3 +1386,90 @@ def test_tree_to_newick(tree_node):
newick_str = tree_to_newick(tree_node)
expected_str = """((d,(g,h)e)b,(f)c)a"""
assert newick_str == expected_str

@staticmethod
def test_tree_to_newick_length(tree_node):
    """With ``length_attr="age"`` every non-root node is suffixed with ``:<age>``."""
    expected = "((d:40,(g:10,h:6)e:35)b:65,(f:38)c:60)a"
    actual = tree_to_newick(tree_node, length_attr="age")
    assert actual == expected

@staticmethod
def test_tree_to_newick_length_invalid_error(tree_node):
    """A ``length_attr`` that the nodes do not carry raises ``ValueError``."""
    expected_prefix = "Length attribute does not exist for node "
    with pytest.raises(ValueError) as exc_info:
        tree_to_newick(tree_node, length_attr="age2")
    message = str(exc_info.value)
    assert message.startswith(expected_prefix)

@staticmethod
def test_tree_to_newick_length_sep(tree_node):
    """A custom ``length_sep`` replaces the default ``:`` before each length."""
    expected = "((d;40,(g;10,h;6)e;35)b;65,(f;38)c;60)a"
    actual = tree_to_newick(tree_node, length_attr="age", length_sep=";")
    assert actual == expected

@staticmethod
def test_tree_to_newick_attr_list(tree_node):
    """Attributes named in ``attr_list`` are rendered in ``[&&NHX:...]`` per node."""
    expected = (
        "((d[&&NHX:age=40],(g[&&NHX:age=10],h[&&NHX:age=6])e[&&NHX:age=35])"
        "b[&&NHX:age=65],(f[&&NHX:age=38])c[&&NHX:age=60])a[&&NHX:age=90]"
    )
    actual = tree_to_newick(tree_node, attr_list=["age"])
    assert actual == expected

@staticmethod
def test_tree_to_newick_attr_list_invalid(tree_node):
    """Attributes absent from the nodes are skipped, leaving the plain Newick string."""
    expected = "((d,(g,h)e)b,(f)c)a"
    actual = tree_to_newick(tree_node, attr_list=["age2"])
    assert actual == expected

@staticmethod
def test_tree_to_newick_attr_prefix(tree_node):
    """An empty ``attr_prefix`` drops the ``&&NHX:`` marker inside the brackets."""
    expected = (
        "((d[age=40],(g[age=10],h[age=6])e[age=35])"
        "b[age=65],(f[age=38])c[age=60])a[age=90]"
    )
    actual = tree_to_newick(tree_node, attr_list=["age"], attr_prefix="")
    assert actual == expected

@staticmethod
def test_tree_to_newick_intermediate_node_name(tree_node):
    """With ``intermediate_node_name=False`` only leaves keep their names."""
    expected = (
        "((d[&&NHX:age=40],(g[&&NHX:age=10],h[&&NHX:age=6])[&&NHX:age=35])"
        "[&&NHX:age=65],(f[&&NHX:age=38])[&&NHX:age=60])[&&NHX:age=90]"
    )
    options = {"intermediate_node_name": False, "attr_list": ["age"]}
    actual = tree_to_newick(tree_node, **options)
    assert actual == expected

@staticmethod
def test_tree_to_newick_phylogenetic():
    """
    Build a small phylogenetic tree and export it with lengths and NHX attributes.

    Example taken from: https://www.cs.mcgill.ca/~birch/doc/forester/NHX.pdf
    """
    enzyme = "1.1.1.1"

    # Nodes are attached in the same order as the expected string lists them,
    # since child order determines the exported order.
    root = Node("placeholder_root", E=enzyme, D="N")
    metazoa = Node(
        "placeholder_metazoa", length=0.1, S="Metazoa", E=enzyme, D="N", parent=root
    )
    primates = Node(
        "placeholder_primates",
        length=0.05,
        S="Primates",
        E=enzyme,
        D="Y",
        B="100",
        parent=metazoa,
    )
    leaf_specs = (
        ("ADH2", 0.1, "human", primates),
        ("ADH1", 0.11, "human", primates),
        ("ADHY", 0.1, "nematode", metazoa),
        ("ADHX", 0.12, "insect", metazoa),
    )
    for leaf_name, branch_length, species, parent_node in leaf_specs:
        Node(leaf_name, length=branch_length, S=species, E=enzyme, parent=parent_node)

    fungi = Node("placeholder_fungi", length=0.1, S="Fungi", parent=root)
    yeast_specs = (("ADH4", 0.09), ("ADH3", 0.13), ("ADH2", 0.12), ("ADH1", 0.11))
    for leaf_name, branch_length in yeast_specs:
        Node(leaf_name, length=branch_length, S="yeast", E=enzyme, parent=fungi)

    expected = (
        "(((ADH2:0.1[&&NHX:S=human:E=1.1.1.1],ADH1:0.11[&&NHX:S=human:E=1.1.1.1])"
        ":0.05[&&NHX:S=Primates:E=1.1.1.1:D=Y:B=100],"
        "ADHY:0.1[&&NHX:S=nematode:E=1.1.1.1],"
        "ADHX:0.12[&&NHX:S=insect:E=1.1.1.1])"
        ":0.1[&&NHX:S=Metazoa:E=1.1.1.1:D=N],"
        "(ADH4:0.09[&&NHX:S=yeast:E=1.1.1.1],"
        "ADH3:0.13[&&NHX:S=yeast:E=1.1.1.1],"
        "ADH2:0.12[&&NHX:S=yeast:E=1.1.1.1],"
        "ADH1:0.11[&&NHX:S=yeast:E=1.1.1.1])"
        ":0.1[&&NHX:S=Fungi])[&&NHX:E=1.1.1.1:D=N]"
    )
    actual = tree_to_newick(
        root,
        intermediate_node_name=False,
        length_attr="length",
        attr_list=["S", "E", "D", "B"],
    )
    assert actual == expected