@@ -250,6 +250,7 @@ def get_tree_diff(
250250 other_tree : node .Node ,
251251 only_diff : bool = True ,
252252 detail : bool = False ,
253+ aggregate : bool = False ,
253254 attr_list : List [str ] = [],
254255 fallback_sep : str = "/" ,
255256) -> node .Node :
@@ -267,6 +268,9 @@ def get_tree_diff(
267268 If `detail=True`, (added) and (moved to) will be used instead of (+), (removed) and (moved from)
268269 will be used instead of (-).
269270
271+ If `aggregate=True`, differences (+)/(added)/(moved to) and (-)/(removed)/(moved from) will only be indicated at
272+ the parent-level. This is useful when a subtree is shifted and we want the differences to be shown only at the top node.
273+
270274 !!! note
271275
272276 - tree and other_tree must have the same `sep` symbol, otherwise this will raise ValueError
@@ -276,50 +280,79 @@ def get_tree_diff(
276280 Examples:
277281 >>> # Create original tree
278282 >>> from bigtree import Node, get_tree_diff, list_to_tree
279- >>> root = list_to_tree(["Downloads/Pictures/photo1.jpg", "Downloads/file1.doc", "Downloads/photo2.jpg"])
283+ >>> root = list_to_tree(["Downloads/Pictures/photo1.jpg", "Downloads/file1.doc", "Downloads/Trip/photo2.jpg"])
280284 >>> root.show()
281285 Downloads
282286 ├── Pictures
283287 │ └── photo1.jpg
284288 ├── file1.doc
285- └── photo2.jpg
289+ └── Trip
290+ └── photo2.jpg
286291
287292 >>> # Create other tree
288- >>> root_other = list_to_tree(["Downloads/Pictures/photo1.jpg", "Downloads/Pictures/photo2.jpg", "Downloads/file1.doc"])
293+ >>> root_other = list_to_tree(
294+ ... ["Downloads/Pictures/photo1.jpg", "Downloads/Pictures/Trip/photo2.jpg", "Downloads/file1.doc", "Downloads/file2.doc"]
295+ ... )
289296 >>> root_other.show()
290297 Downloads
291298 ├── Pictures
292299 │ ├── photo1.jpg
293- │ └── photo2.jpg
294- └── file1.doc
300+ │ └── Trip
301+ │ └── photo2.jpg
302+ ├── file1.doc
303+ └── file2.doc
295304
296- >>> # Get tree differences
305+ # Get tree differences
297306 >>> tree_diff = get_tree_diff(root, root_other)
298307 >>> tree_diff.show()
299308 Downloads
300309 ├── Pictures
301- │ └── photo2.jpg (+)
302- └── photo2.jpg (-)
310+ │ └── Trip (+)
311+ │ └── photo2.jpg (+)
312+ ├── Trip (-)
313+ │ └── photo2.jpg (-)
314+ └── file2.doc (+)
303315
316+ >>> # Get tree differences - all differences
304317 >>> tree_diff = get_tree_diff(root, root_other, only_diff=False)
305318 >>> tree_diff.show()
306319 Downloads
307320 ├── Pictures
308- │ ├── photo1.jpg
309- │ └── photo2.jpg (+)
321+ │ ├── Trip (+)
322+ │ │ └── photo2.jpg (+)
323+ │ └── photo1.jpg
324+ ├── Trip (-)
325+ │ └── photo2.jpg (-)
310326 ├── file1.doc
311- └── photo2.jpg (- )
327+ └── file2.doc (+)
312328
329+ >>> # Get tree differences - all differences with details
313330 >>> tree_diff = get_tree_diff(root, root_other, only_diff=False, detail=True)
314331 >>> tree_diff.show()
315332 Downloads
316333 ├── Pictures
317- │ ├── photo1.jpg
318- │ └── photo2.jpg (moved to)
334+ │ ├── Trip (moved to)
335+ │ │ └── photo2.jpg (moved to)
336+ │ └── photo1.jpg
337+ ├── Trip (moved from)
338+ │ └── photo2.jpg (moved from)
319339 ├── file1.doc
320- └── photo2.jpg (moved from )
340+ └── file2.doc (added)
321341
322- Comparing tree attributes
342+ >>> # Get tree differences - all differences with details on aggregated level
343+ >>> tree_diff = get_tree_diff(root, root_other, only_diff=False, detail=True, aggregate=True)
344+ >>> tree_diff.show()
345+ Downloads
346+ ├── Pictures
347+ │ ├── Trip (moved to)
348+ │ │ └── photo2.jpg
349+ │ └── photo1.jpg
350+ ├── Trip (moved from)
351+ │ └── photo2.jpg
352+ ├── file1.doc
353+ └── file2.doc (added)
354+
355+ # Comparing tree attributes
323356
324357 - (~) will be added to node name if there are differences in tree attributes defined in `attr_list`.
325358 - The node's attributes will be a list of [value in `tree`, value in `other_tree`]
@@ -361,6 +394,7 @@ def get_tree_diff(
361394 other_tree (Node): tree to be compared with
362395 only_diff (bool): indicator to show all nodes or only nodes that are different (+/-), defaults to True
363396 detail (bool): indicator to differentiate between different types of diff e.g., added or removed or moved, defaults to False
397+ aggregate (bool): indicator to only add difference indicator to parent-level e.g., when shifting subtrees, defaults to False
364398 attr_list (List[str]): tree attributes to check for difference, defaults to empty list
365399 fallback_sep (str): sep to fall back to if tree and other_tree have sep that clashes with symbols "+" / "-" / "~".
366400 All node names in tree and other_tree should not contain this fallback_sep, defaults to "/"
@@ -383,6 +417,7 @@ def get_tree_diff(
383417
384418 name_col = "name"
385419 path_col = "PATH"
420+ parent_col = "PARENT"
386421 indicator_col = "Exists"
387422 tree_sep = tree .sep
388423
@@ -391,26 +426,34 @@ def get_tree_diff(
391426 _tree ,
392427 name_col = name_col ,
393428 path_col = path_col ,
429+ parent_col = parent_col ,
394430 attr_dict = {k : k for k in attr_list },
395431 )
396432 for _tree in (tree , other_tree )
397433 )
398434
399435 # Check tree structure difference
400- data_both = data [[path_col , name_col ] + attr_list ].merge (
401- data_other [[path_col , name_col ] + attr_list ],
436+ data_both = data [[path_col , name_col , parent_col ] + attr_list ].merge (
437+ data_other [[path_col , name_col , parent_col ] + attr_list ],
402438 how = "outer" ,
403- on = [path_col , name_col ],
439+ on = [path_col , name_col , parent_col ],
404440 indicator = indicator_col ,
405441 )
442+ if aggregate :
443+ data_both_agg = data_both [
444+ (data_both [indicator_col ] == "left_only" )
445+ | (data_both [indicator_col ] == "right_only" )
446+ ].drop_duplicates (subset = [name_col , parent_col ], keep = False )
447+ else :
448+ data_both_agg = data_both
406449
407450 # Handle tree structure difference
408- nodes_removed = list (data_both [ data_both [ indicator_col ] == "left_only" ][ path_col ])[
409- :: - 1
410- ]
411- nodes_added = list (data_both [ data_both [ indicator_col ] == "right_only" ][ path_col ])[
412- :: - 1
413- ]
451+ nodes_removed = list (
452+ data_both_agg [ data_both_agg [ indicator_col ] == "left_only" ][ path_col ]
453+ )[:: - 1 ]
454+ nodes_added = list (
455+ data_both_agg [ data_both_agg [ indicator_col ] == "right_only" ][ path_col ]
456+ )[:: - 1 ]
414457
415458 moved_from_indicator : List [bool ] = [True for _ in range (len (nodes_removed ))]
416459 moved_to_indicator : List [bool ] = [True for _ in range (len (nodes_added ))]
@@ -432,8 +475,8 @@ def get_tree_diff(
432475
433476 def add_suffix_to_path (
434477 _data : pd .DataFrame , _condition : pd .Series , _original_name : str , _suffix : str
435- ) -> pd . DataFrame :
436- """Add suffix to path string
478+ ) -> None :
479+ """Add suffix to path string, in-place
437480
438481 Args:
439482 _data (pd.DataFrame): original data with path column
@@ -446,35 +489,42 @@ def add_suffix_to_path(
446489 """
447490 _data .iloc [_condition .values , _data .columns .get_loc (path_col )] = _data .iloc [
448491 _condition .values , _data .columns .get_loc (path_col )
449- ].str .replace (_original_name , f"{ _original_name } ({ suffix } )" , regex = True )
450- return _data
451-
452- for node_removed , move_indicator in zip (nodes_removed , moved_from_indicator ):
453- if not detail :
454- suffix = "-"
455- elif move_indicator :
456- suffix = "moved from"
457- else :
458- suffix = "removed"
459- condition_node_removed = data_both [path_col ].str .endswith (
460- node_removed
461- ) | data_both [path_col ].str .contains (node_removed + tree_sep )
462- data_both = add_suffix_to_path (
463- data_both , condition_node_removed , node_removed , suffix
464- )
465- for node_added , move_indicator in zip (nodes_added , moved_to_indicator ):
466- if not detail :
467- suffix = "+"
468- elif move_indicator :
469- suffix = "moved to"
470- else :
471- suffix = "added"
472- condition_node_added = data_both [path_col ].str .endswith (node_added ) | data_both [
473- path_col
474- ].str .contains (node_added + tree_sep )
475- data_both = add_suffix_to_path (
476- data_both , condition_node_added , node_added , suffix
477- )
492+ ].str .replace (_original_name , f"{ _original_name } ({ _suffix } )" , regex = True )
493+
def add_suffix_to_data(
    _data: pd.DataFrame,
    nodes_diff: List[str],
    move_indicator: List[bool],
    suffix_general: str,
    suffix_move: str,
    suffix_not_moved: str,
) -> None:
    """Add suffix to data, in-place

    For every path in `nodes_diff`, picks a suffix and applies it (via `add_suffix_to_path`) to every row of
    `_data` whose path either ends with that path or contains that path followed by the tree separator.

    The suffix is `suffix_general` when the enclosing `detail` flag is False; otherwise it is `suffix_move`
    when the matching entry of `move_indicator` is True, else `suffix_not_moved`.

    Args:
        _data (pd.DataFrame): original data with path column, modified in-place
        nodes_diff (List[str]): list of paths that were modified (e.g., added/removed)
        move_indicator (List[bool]): move indicator to indicate path was moved instead of added/removed,
            paired element-wise with `nodes_diff`
        suffix_general (str): path suffix for general case
        suffix_move (str): path suffix if path was moved
        suffix_not_moved (str): path suffix if path is not moved (e.g., added/removed)
    """
    for _node_diff, _move_indicator in zip(nodes_diff, move_indicator):
        if not detail:
            suffix = suffix_general
        else:
            suffix = suffix_move if _move_indicator else suffix_not_moved
        # Operate on the `_data` argument rather than reaching for the enclosing `data_both`,
        # so the helper honours its own signature.
        # NOTE(review): `str.contains` interprets its pattern as a regular expression by default;
        # paths containing regex metacharacters may match unexpectedly - confirm whether that is intended.
        _paths = _data[path_col]
        condition_node_modified = _paths.str.endswith(_node_diff) | _paths.str.contains(
            _node_diff + tree_sep
        )
        add_suffix_to_path(_data, condition_node_modified, _node_diff, suffix)
521+
522+ add_suffix_to_data (
523+ data_both , nodes_removed , moved_from_indicator , "-" , "moved from" , "removed"
524+ )
525+ add_suffix_to_data (
526+ data_both , nodes_added , moved_to_indicator , "+" , "moved to" , "added"
527+ )
478528
479529 # Check tree attribute difference
480530 path_changes_list_of_dict : List [Dict [str , Dict [str , Any ]]] = []
0 commit comments