Skip to content

Commit f3d0141

Browse files
authored
Merge pull request #323 from kayjan/feature/tree-diff-agg
Add aggregate param to tree_diff
2 parents 9b7aec8 + adae847 commit f3d0141

File tree

7 files changed

+491
-269
lines changed

7 files changed

+491
-269
lines changed

CHANGELOG.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
### Added:
1111
- Tree Export: Print tree to allow alias.
1212
- Tree Export: Mermaid diagram to include theme.
13-
### Fixed:
14-
- Misc: Doctest for docstrings, docstring to indicate usage prefers `node_name` to `name`.
13+
- Tree Helper: Get tree diff to take in `aggregate` parameter to indicate differences at the top-level node.
1514
- Misc: Documentation to include tips and tricks on working with custom classes.
15+
### Changed:
16+
- Misc: Docstring to indicate usage prefers `node_name` to `name`.
17+
- Misc: Standardise testing fixtures.
18+
### Fixed:
1619
- Misc: Polars set up to work on laptop with M1 chip.
1720
- Tree Export: Mermaid diagram title to add newline.
18-
- Tree Helper: Get tree diff string replacement bug when the path change is substring of another path.
1921
- Tree Export: Polars unit test to work with old (<=1.9.0) and new polars version.
22+
- Tree Helper: Get tree diff string replacement bug when the path change is substring of another path.
2023

2124
## [0.22.1] - 2024-11-03
2225
### Added:

bigtree/tree/helper.py

Lines changed: 105 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@ def get_tree_diff(
250250
other_tree: node.Node,
251251
only_diff: bool = True,
252252
detail: bool = False,
253+
aggregate: bool = False,
253254
attr_list: List[str] = [],
254255
fallback_sep: str = "/",
255256
) -> node.Node:
@@ -267,6 +268,9 @@ def get_tree_diff(
267268
If `detail=True`, (added) and (moved to) will be used instead of (+), (removed) and (moved from)
268269
will be used instead of (-).
269270
271+
If `aggregate=True`, differences (+)/(added)/(moved to) and (-)/(removed)/(moved from) will only be indicated at
272+
the parent-level. This is useful when a subtree is shifted and we want the differences to be shown only at the top node.
273+
270274
!!! note
271275
272276
- tree and other_tree must have the same `sep` symbol, otherwise this will raise ValueError
@@ -276,50 +280,79 @@ def get_tree_diff(
276280
Examples:
277281
>>> # Create original tree
278282
>>> from bigtree import Node, get_tree_diff, list_to_tree
279-
>>> root = list_to_tree(["Downloads/Pictures/photo1.jpg", "Downloads/file1.doc", "Downloads/photo2.jpg"])
283+
>>> root = list_to_tree(["Downloads/Pictures/photo1.jpg", "Downloads/file1.doc", "Downloads/Trip/photo2.jpg"])
280284
>>> root.show()
281285
Downloads
282286
├── Pictures
283287
│ └── photo1.jpg
284288
├── file1.doc
285-
└── photo2.jpg
289+
└── Trip
290+
└── photo2.jpg
286291
287292
>>> # Create other tree
288-
>>> root_other = list_to_tree(["Downloads/Pictures/photo1.jpg", "Downloads/Pictures/photo2.jpg", "Downloads/file1.doc"])
293+
>>> root_other = list_to_tree(
294+
... ["Downloads/Pictures/photo1.jpg", "Downloads/Pictures/Trip/photo2.jpg", "Downloads/file1.doc", "Downloads/file2.doc"]
295+
... )
289296
>>> root_other.show()
290297
Downloads
291298
├── Pictures
292299
│ ├── photo1.jpg
293-
│ └── photo2.jpg
294-
└── file1.doc
300+
│ └── Trip
301+
│ └── photo2.jpg
302+
├── file1.doc
303+
└── file2.doc
295304
296-
>>> # Get tree differences
305+
# Get tree differences
297306
>>> tree_diff = get_tree_diff(root, root_other)
298307
>>> tree_diff.show()
299308
Downloads
300309
├── Pictures
301-
│ └── photo2.jpg (+)
302-
└── photo2.jpg (-)
310+
│ └── Trip (+)
311+
│ └── photo2.jpg (+)
312+
├── Trip (-)
313+
│ └── photo2.jpg (-)
314+
└── file2.doc (+)
303315
316+
>>> # Get tree differences - all differences
304317
>>> tree_diff = get_tree_diff(root, root_other, only_diff=False)
305318
>>> tree_diff.show()
306319
Downloads
307320
├── Pictures
308-
│ ├── photo1.jpg
309-
│ └── photo2.jpg (+)
321+
│ ├── Trip (+)
322+
│ │ └── photo2.jpg (+)
323+
│ └── photo1.jpg
324+
├── Trip (-)
325+
│ └── photo2.jpg (-)
310326
├── file1.doc
311-
└── photo2.jpg (-)
327+
└── file2.doc (+)
312328
329+
>>> # Get tree differences - all differences with details
313330
>>> tree_diff = get_tree_diff(root, root_other, only_diff=False, detail=True)
314331
>>> tree_diff.show()
315332
Downloads
316333
├── Pictures
317-
│ ├── photo1.jpg
318-
│ └── photo2.jpg (moved to)
334+
│ ├── Trip (moved to)
335+
│ │ └── photo2.jpg (moved to)
336+
│ └── photo1.jpg
337+
├── Trip (moved from)
338+
│ └── photo2.jpg (moved from)
319339
├── file1.doc
320-
└── photo2.jpg (moved from)
340+
└── file2.doc (added)
321341
322-
Comparing tree attributes
342+
>>> # Get tree differences - all differences with details on aggregated level
343+
>>> tree_diff = get_tree_diff(root, root_other, only_diff=False, detail=True, aggregate=True)
344+
>>> tree_diff.show()
345+
Downloads
346+
├── Pictures
347+
│ ├── Trip (moved to)
348+
│ │ └── photo2.jpg
349+
│ └── photo1.jpg
350+
├── Trip (moved from)
351+
│ └── photo2.jpg
352+
├── file1.doc
353+
└── file2.doc (added)
354+
355+
# Comparing tree attributes
323356
324357
- (~) will be added to node name if there are differences in tree attributes defined in `attr_list`.
325358
- The node's attributes will be a list of [value in `tree`, value in `other_tree`]
@@ -361,6 +394,7 @@ def get_tree_diff(
361394
other_tree (Node): tree to be compared with
362395
only_diff (bool): indicator to show all nodes or only nodes that are different (+/-), defaults to True
363396
detail (bool): indicator to differentiate between different types of diff e.g., added or removed or moved
397+
aggregate (bool): indicator to only add difference indicator to parent-level e.g., when shifting subtrees
364398
attr_list (List[str]): tree attributes to check for difference, defaults to empty list
365399
fallback_sep (str): sep to fall back to if tree and other_tree have a sep that clashes with symbols "+" / "-" / "~".
366400
All node names in tree and other_tree should not contain this fallback_sep, defaults to "/"
@@ -383,6 +417,7 @@ def get_tree_diff(
383417

384418
name_col = "name"
385419
path_col = "PATH"
420+
parent_col = "PARENT"
386421
indicator_col = "Exists"
387422
tree_sep = tree.sep
388423

@@ -391,26 +426,34 @@ def get_tree_diff(
391426
_tree,
392427
name_col=name_col,
393428
path_col=path_col,
429+
parent_col=parent_col,
394430
attr_dict={k: k for k in attr_list},
395431
)
396432
for _tree in (tree, other_tree)
397433
)
398434

399435
# Check tree structure difference
400-
data_both = data[[path_col, name_col] + attr_list].merge(
401-
data_other[[path_col, name_col] + attr_list],
436+
data_both = data[[path_col, name_col, parent_col] + attr_list].merge(
437+
data_other[[path_col, name_col, parent_col] + attr_list],
402438
how="outer",
403-
on=[path_col, name_col],
439+
on=[path_col, name_col, parent_col],
404440
indicator=indicator_col,
405441
)
442+
if aggregate:
443+
data_both_agg = data_both[
444+
(data_both[indicator_col] == "left_only")
445+
| (data_both[indicator_col] == "right_only")
446+
].drop_duplicates(subset=[name_col, parent_col], keep=False)
447+
else:
448+
data_both_agg = data_both
406449

407450
# Handle tree structure difference
408-
nodes_removed = list(data_both[data_both[indicator_col] == "left_only"][path_col])[
409-
::-1
410-
]
411-
nodes_added = list(data_both[data_both[indicator_col] == "right_only"][path_col])[
412-
::-1
413-
]
451+
nodes_removed = list(
452+
data_both_agg[data_both_agg[indicator_col] == "left_only"][path_col]
453+
)[::-1]
454+
nodes_added = list(
455+
data_both_agg[data_both_agg[indicator_col] == "right_only"][path_col]
456+
)[::-1]
414457

415458
moved_from_indicator: List[bool] = [True for _ in range(len(nodes_removed))]
416459
moved_to_indicator: List[bool] = [True for _ in range(len(nodes_added))]
@@ -432,8 +475,8 @@ def get_tree_diff(
432475

433476
def add_suffix_to_path(
434477
_data: pd.DataFrame, _condition: pd.Series, _original_name: str, _suffix: str
435-
) -> pd.DataFrame:
436-
"""Add suffix to path string
478+
) -> None:
479+
"""Add suffix to path string, in-place
437480
438481
Args:
439482
_data (pd.DataFrame): original data with path column
@@ -446,35 +489,42 @@ def add_suffix_to_path(
446489
"""
447490
_data.iloc[_condition.values, _data.columns.get_loc(path_col)] = _data.iloc[
448491
_condition.values, _data.columns.get_loc(path_col)
449-
].str.replace(_original_name, f"{_original_name} ({suffix})", regex=True)
450-
return _data
451-
452-
for node_removed, move_indicator in zip(nodes_removed, moved_from_indicator):
453-
if not detail:
454-
suffix = "-"
455-
elif move_indicator:
456-
suffix = "moved from"
457-
else:
458-
suffix = "removed"
459-
condition_node_removed = data_both[path_col].str.endswith(
460-
node_removed
461-
) | data_both[path_col].str.contains(node_removed + tree_sep)
462-
data_both = add_suffix_to_path(
463-
data_both, condition_node_removed, node_removed, suffix
464-
)
465-
for node_added, move_indicator in zip(nodes_added, moved_to_indicator):
466-
if not detail:
467-
suffix = "+"
468-
elif move_indicator:
469-
suffix = "moved to"
470-
else:
471-
suffix = "added"
472-
condition_node_added = data_both[path_col].str.endswith(node_added) | data_both[
473-
path_col
474-
].str.contains(node_added + tree_sep)
475-
data_both = add_suffix_to_path(
476-
data_both, condition_node_added, node_added, suffix
477-
)
492+
].str.replace(_original_name, f"{_original_name} ({_suffix})", regex=True)
493+
494+
def add_suffix_to_data(
495+
_data: pd.DataFrame,
496+
nodes_diff: List[str],
497+
move_indicator: List[bool],
498+
suffix_general: str,
499+
suffix_move: str,
500+
suffix_not_moved: str,
501+
) -> None:
502+
"""Add suffix to data, in-place
503+
504+
Args:
505+
_data (pd.DataFrame): original data with path column
506+
nodes_diff (List[str]): list of paths that were modified (e.g., added/removed)
507+
move_indicator (List[bool]): move indicator to indicate path was moved instead of added/removed
508+
suffix_general (str): path suffix for general case
509+
suffix_move (str): path suffix if path was moved
510+
suffix_not_moved (str): path suffix if path is not moved (e.g., added/removed)
511+
"""
512+
for _node_diff, _move_indicator in zip(nodes_diff, move_indicator):
513+
if not detail:
514+
suffix = suffix_general
515+
else:
516+
suffix = suffix_move if _move_indicator else suffix_not_moved
517+
condition_node_modified = data_both[path_col].str.endswith(
518+
_node_diff
519+
) | data_both[path_col].str.contains(_node_diff + tree_sep)
520+
add_suffix_to_path(data_both, condition_node_modified, _node_diff, suffix)
521+
522+
add_suffix_to_data(
523+
data_both, nodes_removed, moved_from_indicator, "-", "moved from", "removed"
524+
)
525+
add_suffix_to_data(
526+
data_both, nodes_added, moved_to_indicator, "+", "moved to", "added"
527+
)
478528

479529
# Check tree attribute difference
480530
path_changes_list_of_dict: List[Dict[str, Dict[str, Any]]] = []

docs/gettingstarted/demo/tree.md

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -965,7 +965,11 @@ To compare tree attributes:
965965
- `(~)`: Node has different attributes, only available when comparing attributes
966966

967967
For more details, `(moved from)`, `(moved to)`, `(added)`, and `(removed)` can
968-
be indicated instead if `(+)` and `(-)`.
968+
be indicated instead of `(+)` and `(-)` by passing `detail=True`.
969+
970+
For aggregating the differences at the parent-level instead of having `(+)` and
971+
`(-)` at every child node, pass in `aggregate=True`. This is useful if
972+
subtrees are shifted, and if you want to view the shifting at the parent-level.
969973

970974
=== "Only differences"
971975
```python hl_lines="20"
@@ -1029,13 +1033,14 @@ be indicated instead if `(+)` and `(-)`.
10291033
# └── g (+)
10301034
```
10311035
=== "With details"
1032-
```python hl_lines="21"
1036+
```python hl_lines="23"
10331037
from bigtree import str_to_tree, get_tree_diff
10341038

10351039
root = str_to_tree("""
10361040
a
10371041
├── b
10381042
│ ├── d
1043+
│ │ └── g
10391044
│ └── e
10401045
└── c
10411046
└── f
@@ -1044,9 +1049,10 @@ be indicated instead if `(+)` and `(-)`.
10441049
root_other = str_to_tree("""
10451050
a
10461051
├── b
1047-
│ └── g
1052+
│ └── h
10481053
└── c
10491054
├── d
1055+
│ └── g
10501056
└── f
10511057
""")
10521058

@@ -1055,10 +1061,48 @@ be indicated instead if `(+)` and `(-)`.
10551061
# a
10561062
# ├── b
10571063
# │ ├── d (moved from)
1064+
# │ │ └── g (moved from)
1065+
# │ ├── e (removed)
1066+
# │ └── h (added)
1067+
# └── c
1068+
# └── d (moved to)
1069+
# └── g (moved to)
1070+
```
1071+
=== "With aggregated differences"
1072+
```python hl_lines="23"
1073+
from bigtree import str_to_tree, get_tree_diff
1074+
1075+
root = str_to_tree("""
1076+
a
1077+
├── b
1078+
│ ├── d
1079+
│ │ └── g
1080+
│ └── e
1081+
└── c
1082+
└── f
1083+
""")
1084+
1085+
root_other = str_to_tree("""
1086+
a
1087+
├── b
1088+
│ └── h
1089+
└── c
1090+
├── d
1091+
│ └── g
1092+
└── f
1093+
""")
1094+
1095+
tree_diff = get_tree_diff(root, root_other, detail=True, aggregate=True)
1096+
tree_diff.show()
1097+
# a
1098+
# ├── b
1099+
# │ ├── d (moved from)
1100+
# │ │ └── g
10581101
# │ ├── e (removed)
1059-
# │ └── g (added)
1102+
# │ └── h (added)
10601103
# └── c
10611104
# └── d (moved to)
1105+
# └── g
10621106
```
10631107
=== "Attribute difference"
10641108
```python hl_lines="25"

0 commit comments

Comments
 (0)