Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified assets/sphinx/tree_construct.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions bigtree/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
list_to_tree,
list_to_tree_by_relation,
nested_dict_to_tree,
newick_to_tree,
str_to_tree,
)
from bigtree.tree.export import (
Expand Down
270 changes: 269 additions & 1 deletion bigtree/tree/construct.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from __future__ import annotations

import re
from collections import OrderedDict
from collections import OrderedDict, defaultdict
from typing import Any, Dict, Iterable, List, Optional, Tuple, Type

from bigtree.node.node import Node
from bigtree.tree.export import tree_to_dataframe
from bigtree.tree.search import find_child_by_name, find_name
from bigtree.utils.constants import NewickCharacter, NewickState
from bigtree.utils.exceptions import (
DuplicatedNodeError,
TreeError,
Expand All @@ -31,6 +32,7 @@
"nested_dict_to_tree",
"dataframe_to_tree",
"dataframe_to_tree_by_relation",
"newick_to_tree",
]


Expand Down Expand Up @@ -977,3 +979,269 @@ def recursive_create_child(parent_node: Node) -> None:
root_node.set_attrs(retrieve_attr(row))
recursive_create_child(root_node)
return root_node


def newick_to_tree(
    tree_string: str,
    length_attr: str = "length",
    attr_prefix: str = "&&NHX:",
    node_type: Type[Node] = Node,
) -> Node:
    """Construct tree from Newick notation, return root of tree.

    In the Newick Notation (or New Hampshire Notation)
      - Tree is represented in round brackets i.e., `(child1,child2,child3)parent`
      - If there are nested tree, they will be in nested round brackets i.e., `((grandchild1)child1,(grandchild2,grandchild3)child2)parent`
      - If there is length attribute, they will be beside the name i.e., `(child1:0.5,child2:0.1)parent`
      - If there are other attributes, attributes are represented in square brackets as `name=value` pairs
        separated by `:`, optionally preceded by `attr_prefix`,
        i.e., `(child1:0.5[&&NHX:S=human],child2:0.1[&&NHX:S=human])parent[&&NHX:S=parent]`

    Variations supported
      - Support special characters ([, ], (, ), :, ,) in node name, attribute name, and attribute values if
        they are enclosed in single quotes i.e., '(name:!)'

    Nodes without a name are labelled `node0`, `node1`, ... in the order they are created.
    Length values made up only of digits are stored as `int`, anything else is converted with `float`.

    >>> from bigtree import newick_to_tree
    >>> root = newick_to_tree("((d,e)b,c)a")
    >>> root.show()
    a
    ├── b
    │   ├── d
    │   └── e
    └── c

    >>> root = newick_to_tree("((d:40,e:35)b:65,c:60)a", length_attr="age")
    >>> root.show(attr_list=["age"])
    a
    ├── b [age=65]
    │   ├── d [age=40]
    │   └── e [age=35]
    └── c [age=60]

    >>> root = newick_to_tree(
    ...     "((d:40[&&NHX:species=human],e:35[&&NHX:species=human])b:65[&&NHX:species=human],c:60[&&NHX:species=human])a[&&NHX:species=human]",
    ...     length_attr="age",
    ... )
    >>> root.show(all_attrs=True)
    a [species=human]
    ├── b [age=65, species=human]
    │   ├── d [age=40, species=human]
    │   └── e [age=35, species=human]
    └── c [age=60, species=human]

    Args:
        tree_string (str): String in Newick notation to construct tree
        length_attr (str): attribute name to store node length, optional, defaults to 'length'
        attr_prefix (str): prefix before all attributes, within square bracket, used to detect attributes, defaults to "&&NHX:"
        node_type (Type[Node]): node type of tree to be created, defaults to ``Node``

    Returns:
        (Node)

    Raises:
        ValueError: If `tree_string` is empty, if brackets / quotes / attribute sections are not
            properly opened or closed, or if a length value cannot be converted to a number
    """
    if not len(tree_string):
        raise ValueError("Tree string does not contain any data, check `tree_string`")

    # Store results (for tracking)
    # depth_nodes maps a depth to the nodes created at that depth that are still waiting for a parent
    depth_nodes: Dict[int, List[Node]] = defaultdict(list)
    unlabelled_node_counter: int = 0
    current_depth: int = 1
    tree_string_idx: int = 0

    # Store states (for assertions and checks)
    current_state: NewickState = NewickState.PARSE_STRING
    current_node: Optional[Node] = None
    cumulative_string: str = ""  # node name, node length, or attribute name being read
    cumulative_string_value: str = ""  # attribute value being read

    def _create_node(
        _new_node: Optional[Node],
        _cumulative_string: str,
        _unlabelled_node_counter: int,
        _depth_nodes: Dict[int, List[Node]],
        _current_depth: int,
    ) -> Tuple[Node, int]:
        """Create the node for the text read so far, or set the length of an existing node.

        If no node exists yet, ``_cumulative_string`` is its name (a generated ``node<N>`` label
        if empty). If a node already exists, a non-empty ``_cumulative_string`` is its length.
        Nodes pending at the next depth are attached as children of the node.

        Returns:
            (Tuple[Node, int]) the node and the (possibly incremented) unlabelled node counter
        """
        if not _new_node:
            if not _cumulative_string:
                _cumulative_string = f"node{_unlabelled_node_counter}"
                _unlabelled_node_counter += 1
            _new_node = node_type(_cumulative_string)
            _depth_nodes[_current_depth].append(_new_node)
        elif _cumulative_string:
            _new_node.set_attrs(
                {
                    length_attr: int(_cumulative_string)
                    if _cumulative_string.isdigit()
                    else float(_cumulative_string)
                }
            )

        # Nodes collected one level deeper belong to this node; consume them
        if len(_depth_nodes[_current_depth + 1]):
            _new_node.children = _depth_nodes[_current_depth + 1]  # type: ignore
            del _depth_nodes[_current_depth + 1]
        return _new_node, _unlabelled_node_counter

    def _raise_value_error(tree_idx: int) -> None:
        """Raise ValueError pointing at the offending index of ``tree_string``."""
        raise ValueError(
            f"String not properly closed, check `tree_string` at index {tree_idx}"
        )

    while tree_string_idx < len(tree_string):
        character = tree_string[tree_string_idx]
        if character == NewickCharacter.OPEN_BRACKET:
            # Check and/or change state
            state_title = "Node creation start"
            if current_state not in [NewickState.PARSE_STRING]:
                _raise_value_error(tree_string_idx)
            # Logic
            current_depth += 1
            if current_node:
                _raise_value_error(tree_string_idx)
            if cumulative_string:
                _raise_value_error(tree_string_idx)
            assert (
                not cumulative_string_value
            ), f"{state_title}, should not have cumulative_string_value"
            tree_string_idx += 1
            continue

        if character in [
            NewickCharacter.CLOSE_BRACKET,
            NewickCharacter.ATTR_START,
            NewickCharacter.NODE_SEP,
        ]:
            # Check and/or change state
            state_title = "Node creation end / Node attribute start"
            if current_state not in [
                NewickState.PARSE_STRING,
                NewickState.PARSE_ATTRIBUTE_NAME,
            ]:
                _raise_value_error(tree_string_idx)
            # Logic
            if character == NewickCharacter.ATTR_START:
                current_state = NewickState.PARSE_ATTRIBUTE_NAME
                # Skip over the attribute prefix (if present) so it is not read as attribute name
                if tree_string[tree_string_idx + 1 :].startswith(  # noqa: E203
                    attr_prefix
                ):
                    tree_string_idx += len(attr_prefix)
            current_node, unlabelled_node_counter = _create_node(
                current_node,
                cumulative_string,
                unlabelled_node_counter,
                depth_nodes,
                current_depth,
            )
            if character == NewickCharacter.CLOSE_BRACKET:
                current_depth -= 1
                current_node = None
            if character == NewickCharacter.NODE_SEP:
                current_node = None
            cumulative_string = ""
            assert (
                not cumulative_string_value
            ), f"{state_title}, should not have cumulative_string_value"
            tree_string_idx += 1
            continue

        if character == NewickCharacter.ATTR_END:
            # Check and/or change state
            state_title = "Node attribute end"
            if current_state not in [NewickState.PARSE_ATTRIBUTE_VALUE]:
                _raise_value_error(tree_string_idx)
            current_state = NewickState.PARSE_STRING
            # Logic
            assert current_node, f"{state_title}, should have current_node"
            current_node.set_attrs({cumulative_string: cumulative_string_value})
            cumulative_string = ""
            cumulative_string_value = ""
            tree_string_idx += 1
            continue

        if character == NewickCharacter.ATTR_KEY_VALUE:
            # Check and/or change state
            state_title = "Node attribute creation"
            if current_state not in [NewickState.PARSE_ATTRIBUTE_NAME]:
                _raise_value_error(tree_string_idx)
            current_state = NewickState.PARSE_ATTRIBUTE_VALUE
            # Logic
            # A node separator / close bracket inside an attribute section resets current_node,
            # so malformed input (e.g. "(a[&&NHX:k=v:,b=1])c") can reach here without a node.
            # Raise ValueError explicitly rather than rely on `assert`, which is removed under -O.
            if not current_node or not cumulative_string:
                _raise_value_error(tree_string_idx)
            assert (
                not cumulative_string_value
            ), f"{state_title}, should not have cumulative_string_value"
            tree_string_idx += 1
            continue

        if character == NewickCharacter.ATTR_QUOTE:
            # Logic
            # Everything up to the matching quote is taken verbatim, special characters included
            quote_end_idx = tree_string.find(
                NewickCharacter.ATTR_QUOTE, tree_string_idx + 1
            )
            if quote_end_idx == -1:
                _raise_value_error(tree_string_idx)
            if current_state in [
                NewickState.PARSE_STRING,
                NewickState.PARSE_ATTRIBUTE_NAME,
            ]:
                cumulative_string = tree_string[
                    tree_string_idx + 1 : quote_end_idx  # noqa: E203
                ]
            else:
                cumulative_string_value = tree_string[
                    tree_string_idx + 1 : quote_end_idx  # noqa: E203
                ]
            tree_string_idx = quote_end_idx + 1
            continue

        if character == NewickCharacter.SEP:
            # Check and/or change state
            state_title = "Node length creation / Node attribute creation"
            if current_state not in [
                NewickState.PARSE_STRING,
                NewickState.PARSE_ATTRIBUTE_VALUE,
            ]:
                _raise_value_error(tree_string_idx)
            # Logic
            if current_state == NewickState.PARSE_STRING:
                # Separator between node name and node length
                if current_node:
                    _raise_value_error(tree_string_idx)
                current_node, unlabelled_node_counter = _create_node(
                    current_node,
                    cumulative_string,
                    unlabelled_node_counter,
                    depth_nodes,
                    current_depth,
                )
                cumulative_string = ""
                assert (
                    not cumulative_string_value
                ), f"{state_title}, should not have cumulative_string_value"
                tree_string_idx += 1
                continue
            else:
                # Separator between two attributes, store the attribute just read
                current_state = NewickState.PARSE_ATTRIBUTE_NAME
                assert current_node, f"{state_title}, should have current_node"
                current_node.set_attrs({cumulative_string: cumulative_string_value})
                cumulative_string = ""
                cumulative_string_value = ""
                tree_string_idx += 1
                continue

        # Any other character is accumulated into the name / length / attribute being read
        if current_state == NewickState.PARSE_ATTRIBUTE_VALUE:
            cumulative_string_value += character
        else:
            cumulative_string += character
        tree_string_idx += 1

    # Every opened bracket must have been closed
    if current_depth != 1:
        _raise_value_error(tree_string_idx)

    # Final root node
    if len(depth_nodes[current_depth]):
        current_node = depth_nodes[current_depth][0]
    current_node, unlabelled_node_counter = _create_node(
        current_node,
        cumulative_string,
        unlabelled_node_counter,
        depth_nodes,
        current_depth,
    )
    return current_node
10 changes: 7 additions & 3 deletions bigtree/tree/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from bigtree.node.node import Node
from bigtree.tree.search import find_path
from bigtree.utils.assertions import assert_key_in_dict, assert_str_in_list
from bigtree.utils.constants import ExportConstants, MermaidConstants
from bigtree.utils.constants import ExportConstants, MermaidConstants, NewickCharacter
from bigtree.utils.exceptions import (
optional_dependencies_image,
optional_dependencies_pandas,
Expand Down Expand Up @@ -1163,10 +1163,10 @@ def tree_to_newick(
tree: T,
intermediate_node_name: bool = True,
length_attr: str = "",
length_sep: str = ":",
length_sep: Union[str, NewickCharacter] = NewickCharacter.SEP,
attr_list: Iterable[str] = [],
attr_prefix: str = "&&NHX:",
attr_sep: str = ":",
attr_sep: Union[str, NewickCharacter] = NewickCharacter.SEP,
) -> str:
"""Export tree to Newick notation. Useful for describing phylogenetic tree.

Expand Down Expand Up @@ -1218,6 +1218,10 @@ def tree_to_newick(
"""
if not tree:
return ""
if isinstance(length_sep, NewickCharacter):
length_sep = length_sep.value
if isinstance(attr_sep, NewickCharacter):
attr_sep = attr_sep.value

node_name_str = ""
if (intermediate_node_name) or (not intermediate_node_name and tree.is_leaf):
Expand Down
18 changes: 18 additions & 0 deletions bigtree/utils/constants.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from enum import Enum, auto
from typing import Dict, List, Tuple


Expand Down Expand Up @@ -58,3 +59,20 @@ class MermaidConstants:
"double_circle": "o--o",
"double_cross": "x--x",
}


class NewickState(Enum):
    """States of the parser that reads a Newick string (see ``newick_to_tree``)."""

    # Reading a node name or node length (outside of square brackets); the initial state
    PARSE_STRING = auto()
    # Inside square brackets, reading an attribute name (before "=")
    PARSE_ATTRIBUTE_NAME = auto()
    # Inside square brackets, reading an attribute value (after "=")
    PARSE_ATTRIBUTE_VALUE = auto()


class NewickCharacter(str, Enum):
    """Characters with special meaning in Newick notation.

    Members subclass ``str`` so they compare equal to the plain character they represent.
    """

    OPEN_BRACKET = "("  # starts a group of child nodes
    CLOSE_BRACKET = ")"  # ends a group of child nodes
    ATTR_START = "["  # starts the attribute section of a node
    ATTR_END = "]"  # ends the attribute section of a node
    ATTR_KEY_VALUE = "="  # separates an attribute name from its value
    ATTR_QUOTE = "'"  # encloses text that may contain special characters
    SEP = ":"  # separates node name from length, and one attribute from the next
    NODE_SEP = ","  # separates sibling nodes
10 changes: 8 additions & 2 deletions docs/source/bigtree/tree/construct.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,34 @@ Construct Tree from list, dictionary, and pandas DataFrame.
To decide which method to use, consider your data type and data values.

.. list-table:: Tree Construct Methods
:widths: 35 30 50 40
:widths: 35 30 50 30 40
:header-rows: 1

* - Construct Tree from
- Using full path
- Using parent-child relation
- Using notation
- Add node attributes
* - String
- `str_to_tree`
- NA
- No
- `newick_to_tree`
- | No (for `str_to_tree`)
| Yes (for `newick_to_tree`)
* - List
- `list_to_tree`
- `list_to_tree_by_relation`
- NA
- No
* - Dictionary
- `dict_to_tree`
- `nested_dict_to_tree`
- NA
- Yes
* - DataFrame
- `dataframe_to_tree`
- `dataframe_to_tree_by_relation`
- NA
- Yes


Expand Down
Loading