Skip to content

Commit c19d83c

Browse files
committed
refactor: DAG constructor to not rely on pandas + handle reserved keywords cases + add test cases
1 parent e40628a commit c19d83c

File tree

5 files changed

+156
-21
lines changed

5 files changed

+156
-21
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

77
## [Unreleased]
8+
### Changed
9+
- DAG Constructor: `list_to_dag` and `dict_to_dag` no longer rely on `dataframe_to_dag`, as pandas DataFrame operations
10+
are being phased out.
11+
### Fixed
12+
- DAG Constructor: Handle cases where reserved keywords are used as attribute names upon creation, and raise an error accordingly.
813

914
## [0.17.0] - 2024-04-04
1015
### Added

bigtree/dag/construct.py

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from bigtree.utils.assertions import (
77
assert_dataframe_no_duplicate_attribute,
88
assert_dataframe_not_empty,
9+
assert_key_not_in_dict_or_df,
910
assert_length_not_empty,
1011
filter_attributes,
1112
isnull,
@@ -20,7 +21,6 @@
2021
__all__ = ["list_to_dag", "dict_to_dag", "dataframe_to_dag"]
2122

2223

23-
@optional_dependencies_pandas
2424
def list_to_dag(
2525
relations: List[Tuple[str, str]],
2626
node_type: Type[DAGNode] = DAGNode,
@@ -44,13 +44,26 @@ def list_to_dag(
4444
"""
4545
assert_length_not_empty(relations, "Input list", "relations")
4646

47-
relation_data = pd.DataFrame(relations, columns=["parent", "child"])
48-
return dataframe_to_dag(
49-
relation_data, child_col="child", parent_col="parent", node_type=node_type
50-
)
47+
node_dict: Dict[str, DAGNode] = dict()
48+
parent_node = DAGNode()
49+
50+
for parent_name, child_name in relations:
51+
if parent_name not in node_dict:
52+
parent_node = node_type(parent_name)
53+
node_dict[parent_name] = parent_node
54+
else:
55+
parent_node = node_dict[parent_name]
56+
if child_name not in node_dict:
57+
child_node = node_type(child_name)
58+
node_dict[child_name] = child_node
59+
else:
60+
child_node = node_dict[child_name]
61+
62+
child_node.parents = [parent_node]
63+
64+
return parent_node
5165

5266

53-
@optional_dependencies_pandas
5467
def dict_to_dag(
5568
relation_attrs: Dict[str, Any],
5669
parent_key: str = "parents",
@@ -84,20 +97,34 @@ def dict_to_dag(
8497
"""
8598
assert_length_not_empty(relation_attrs, "Dictionary", "relation_attrs")
8699

87-
# Convert dictionary to dataframe
88-
data = pd.DataFrame(relation_attrs).T.rename_axis("_tmp_child").reset_index()
89-
if parent_key not in data:
100+
node_dict: Dict[str, DAGNode] = dict()
101+
parent_node: DAGNode | None = None
102+
103+
for child_name, node_attrs in relation_attrs.items():
104+
node_attrs = node_attrs.copy()
105+
parent_names: List[str] = []
106+
if parent_key in node_attrs:
107+
parent_names = node_attrs.pop(parent_key)
108+
assert_key_not_in_dict_or_df(node_attrs, ["parent", "parents", "children"])
109+
110+
if child_name in node_dict:
111+
child_node = node_dict[child_name]
112+
child_node.set_attrs(node_attrs)
113+
else:
114+
child_node = node_type(child_name, **node_attrs)
115+
node_dict[child_name] = child_node
116+
117+
for parent_name in parent_names:
118+
parent_node = node_dict.get(parent_name, node_type(parent_name))
119+
node_dict[parent_name] = parent_node
120+
child_node.parents = [parent_node]
121+
122+
if parent_node is None:
90123
raise ValueError(
91124
f"Parent key {parent_key} not in dictionary, check `relation_attrs` and `parent_key`"
92125
)
93126

94-
data = data.explode(parent_key)
95-
return dataframe_to_dag(
96-
data,
97-
child_col="_tmp_child",
98-
parent_col=parent_key,
99-
node_type=node_type,
100-
)
127+
return parent_node
101128

102129

103130
@optional_dependencies_pandas
@@ -163,6 +190,7 @@ def dataframe_to_dag(
163190
attribute_cols = list(data.columns)
164191
attribute_cols.remove(child_col)
165192
attribute_cols.remove(parent_col)
193+
assert_key_not_in_dict_or_df(attribute_cols, ["parent", "parents", "children"])
166194

167195
data = data[[child_col, parent_col] + attribute_cols].copy()
168196

bigtree/utils/assertions.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,23 @@ def assert_str_in_list(
5858
)
5959

6060

61+
def assert_key_not_in_dict_or_df(
62+
parameter_dict: Union[Dict[str, Any], pd.DataFrame],
63+
not_accepted_parameters: List[str],
64+
) -> None:
65+
"""Raise ValueError if parameter is in key of dictionary
66+
67+
Args:
68+
parameter_dict (Dict[str, Any]/pd.DataFrame): argument input for parameter
69+
not_accepted_parameters (List[str]): list of not accepted parameters
70+
"""
71+
for parameter in parameter_dict:
72+
if parameter in not_accepted_parameters:
73+
raise ValueError(
74+
f"Invalid input, check `{parameter}` is not a valid key as it is a reserved keyword"
75+
)
76+
77+
6178
def assert_key_in_dict(
6279
parameter_name: str,
6380
parameter: Any,

tests/dag/test_construct.py

Lines changed: 88 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,48 @@ def test_dict_to_dag_empty_error(self):
8080
parameter="relation_attrs"
8181
)
8282

83-
def test_dict_to_dag_parent_key_error(self):
83+
@staticmethod
84+
def test_dict_to_dag_parent_key_error():
85+
relation_dict = {
86+
"a": {"age": 90},
87+
"b": {"age": 65},
88+
"c": {"parent1": ["a", "b"], "age": 60},
89+
"d": {"parent1": ["a", "c"], "age": 40},
90+
"e": {"parent1": ["d"], "age": 35},
91+
"f": {"parent1": ["c", "d"], "age": 38},
92+
"g": {"parent1": ["c"], "age": 10},
93+
"h": {"parent1": ["g"], "age": 6},
94+
}
95+
with pytest.raises(ValueError) as exc_info:
96+
dict_to_dag(relation_dict)
97+
assert str(exc_info.value) == Constants.ERROR_DAG_DICT_PARENT_KEY.format(
98+
parent_key="parents"
99+
)
100+
101+
def test_dict_to_dag_parent_key_reserved_keyword_parents_error(self):
84102
with pytest.raises(ValueError) as exc_info:
85103
dict_to_dag(self.relation_dict, parent_key="parent")
86-
assert str(exc_info.value) == Constants.ERROR_DAG_DICT_PARENT_KEY
104+
assert str(exc_info.value) == Constants.ERROR_DAG_DICT_INVALID_KEY.format(
105+
parameter="parents"
106+
)
107+
108+
@staticmethod
109+
def test_dict_to_dag_parent_key_reserved_keyword_parent_error():
110+
relation_dict = {
111+
"a": {"age": 90},
112+
"b": {"age": 65},
113+
"c": {"parent": ["a", "b"], "age": 60},
114+
"d": {"parent": ["a", "c"], "age": 40},
115+
"e": {"parent": ["d"], "age": 35},
116+
"f": {"parent": ["c", "d"], "age": 38},
117+
"g": {"parent": ["c"], "age": 10},
118+
"h": {"parent": ["g"], "age": 6},
119+
}
120+
with pytest.raises(ValueError) as exc_info:
121+
dict_to_dag(relation_dict)
122+
assert str(exc_info.value) == Constants.ERROR_DAG_DICT_INVALID_KEY.format(
123+
parameter="parent"
124+
)
87125

88126
def test_dict_to_dag_node_type(self):
89127
dag = dict_to_dag(self.relation_dict, node_type=DAGNodeA)
@@ -178,6 +216,54 @@ def test_dataframe_to_dag_parent_col_error(self):
178216
parent_col=parent_col
179217
)
180218

219+
@staticmethod
220+
def test_dataframe_to_dag_parent_col_reserved_keyword_parents_error():
221+
data = pd.DataFrame(
222+
[
223+
["h", "g", "a", 6],
224+
["g", "c", "a", 10],
225+
["f", "d", "a", 38],
226+
["f", "c", "a", 38],
227+
["e", "d", "a", 35],
228+
["d", "c", "a", 40],
229+
["d", "a", "a", 40],
230+
["c", "b", "a", 60],
231+
["c", "a", "a", 60],
232+
["a", None, None, 90],
233+
["b", None, None, 65],
234+
],
235+
columns=["child", "parent", "parents", "age"],
236+
)
237+
with pytest.raises(ValueError) as exc_info:
238+
dataframe_to_dag(data, parent_col="parent")
239+
assert str(exc_info.value) == Constants.ERROR_DAG_DICT_INVALID_KEY.format(
240+
parameter="parents"
241+
)
242+
243+
@staticmethod
244+
def test_dataframe_to_dag_parent_col_reserved_keyword_parent_error():
245+
data = pd.DataFrame(
246+
[
247+
["h", "g", "a", 6],
248+
["g", "c", "a", 10],
249+
["f", "d", "a", 38],
250+
["f", "c", "a", 38],
251+
["e", "d", "a", 35],
252+
["d", "c", "a", 40],
253+
["d", "a", "a", 40],
254+
["c", "b", "a", 60],
255+
["c", "a", "a", 60],
256+
["a", None, None, 90],
257+
["b", None, None, 65],
258+
],
259+
columns=["child", "parent", "parents", "age"],
260+
)
261+
with pytest.raises(ValueError) as exc_info:
262+
dataframe_to_dag(data, parent_col="parents")
263+
assert str(exc_info.value) == Constants.ERROR_DAG_DICT_INVALID_KEY.format(
264+
parameter="parent"
265+
)
266+
181267
def test_dataframe_to_dag_attribute_cols(self):
182268
dag = dataframe_to_dag(self.data, attribute_cols=["age"])
183269
assert_dag_structure_root(dag)

tests/test_constants.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,8 @@ class Constants:
2020
ERROR_CUSTOM_TYPE = "Node type is not `{type}`"
2121

2222
# dag/construct
23-
ERROR_DAG_DICT_PARENT_KEY = (
24-
"Parent key parent not in dictionary, check `relation_attrs` and `parent_key`"
25-
)
23+
ERROR_DAG_DICT_INVALID_KEY = "Invalid input, check `{parameter}` is not a valid key as it is a reserved keyword"
24+
ERROR_DAG_DICT_PARENT_KEY = "Parent key {parent_key} not in dictionary, check `relation_attrs` and `parent_key`"
2625
ERROR_DAG_DATAFRAME_EMPTY_CHILD = (
2726
"Child name cannot be empty, check column: {child_col}"
2827
)

0 commit comments

Comments
 (0)