Skip to content

Commit

Permalink
Merge pull request #329 from kayjan/feature/tree-diff-attr
Browse files Browse the repository at this point in the history
Check for moved indicator via dataframe operations
  • Loading branch information
kayjan authored Nov 13, 2024
2 parents 813ebca + 6d777ab commit da6793a
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 33 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed:
- Tree Helper: Speed up get tree diff logic by comparing the full attribute list and data at once (for attr diff).
- Tree Helper: Speed up get tree diff logic by adding suffixes at the end (for path diff).
- Tree Helper: Speed up get tree diff logic by detecting the moved indicator using dataframe operations (for detail).

## [0.22.2] - 2024-11-11
### Added:
Expand Down
80 changes: 47 additions & 33 deletions bigtree/tree/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ def get_tree_diff(
indicator_col = "Exists"
old_suffix = "_old"
new_suffix = "_new"
tree_sep = tree.sep
moved_ind = "moved_ind"

data, data_other = (
export.tree_to_dataframe(
Expand Down Expand Up @@ -475,32 +475,46 @@ def get_tree_diff(
data_path_diff = data_compare

# Handle tree structure difference
paths_removed = list(
data_path_diff[data_path_diff[indicator_col] == "left_only"][path_col]
)[::-1]
paths_added = list(
data_path_diff[data_path_diff[indicator_col] == "right_only"][path_col]
)[::-1]

moved_from_ind: List[bool] = [True for _ in range(len(paths_removed))]
moved_to_ind: List[bool] = [True for _ in range(len(paths_added))]
data_tree = data_path_diff[data_path_diff[indicator_col] == "left_only"]
data_tree_other = data_path_diff[data_path_diff[indicator_col] == "right_only"]

if detail:
names_removed = [path.split(tree_sep)[-1] for path in paths_removed]
names_added = [path.split(tree_sep)[-1] for path in paths_added]
moved_from_ind = [name in names_added for name in names_removed]
moved_to_ind = [name in names_removed for name in names_added]

path_removed_to_suffix = {
path: "-" if not detail else ("moved from" if move_ind else "removed")
for path, move_ind in zip(paths_removed, moved_from_ind)
}
path_added_to_suffix = {
path: "+" if not detail else ("moved to" if move_ind else "added")
for path, move_ind in zip(paths_added, moved_to_ind)
}
data_tree[moved_ind] = False
data_tree_other[moved_ind] = False

if len(data_tree) and len(data_tree_other):
# Check for moved from and moved to
move_from_condition = data_tree[
data_tree[name_col].isin(set(data_tree_other[name_col]))
]
data_tree.loc[move_from_condition.index, moved_ind] = True
move_to_condition = data_tree_other[
data_tree_other[name_col].isin(set(data_tree[name_col]))
]
data_tree_other.loc[move_to_condition.index, moved_ind] = True

path_move_from = data_tree.set_index(path_col)[[moved_ind]].to_dict(
orient="index"
)
path_move_to = data_tree_other.set_index(path_col)[[moved_ind]].to_dict(
orient="index"
)
path_move_from_suffix = {
path: "moved from" if v[moved_ind] else "removed"
for path, v in path_move_from.items()
}
path_move_to_suffix = {
path: "moved to" if v[moved_ind] else "added"
for path, v in path_move_to.items()
}
else:
path_move_from_suffix = dict(zip(data_tree[path_col], "-" * len(data_tree)))
path_move_to_suffix = dict(
zip(data_tree_other[path_col], "+" * len(data_tree_other))
)

# Check tree attribute difference
dict_attr_diff: Dict[str, Dict[str, Any]] = {}
path_attr_diff: Dict[str, Dict[str, Any]] = {}
if attr_list:
data_both = data_compare[data_compare[indicator_col] == "both"]
condition_attr_diff = (
Expand All @@ -517,7 +531,7 @@ def get_tree_diff(
data_attr_diff = data_both[eval(condition_attr_diff)]
dict_attr_all = data_attr_diff.set_index(path_col).to_dict(orient="index")
for path, node_attr in dict_attr_all.items():
dict_attr_diff[path] = {
path_attr_diff[path] = {
attr: (
node_attr[f"{attr}{old_suffix}"],
node_attr[f"{attr}{new_suffix}"],
Expand All @@ -531,24 +545,24 @@ def get_tree_diff(
if only_diff:
data_compare = data_compare[
(data_compare[indicator_col] != "both")
| (data_compare[path_col].isin(dict_attr_diff.keys()))
| (data_compare[path_col].isin(path_attr_diff.keys()))
]
data_compare = data_compare[[path_col]].sort_values(path_col)
if len(data_compare):
tree_diff = construct.dataframe_to_tree(
data_compare, node_type=tree.__class__, sep=tree.sep
)
for path in sorted(path_removed_to_suffix, reverse=True):
for path in sorted(path_move_from_suffix, reverse=True):
_node = search.find_full_path(tree_diff, path)
_node.name += f""" ({path_removed_to_suffix[path]})"""
for path in sorted(path_added_to_suffix, reverse=True):
_node.name += f""" ({path_move_from_suffix[path]})"""
for path in sorted(path_move_to_suffix, reverse=True):
_node = search.find_full_path(tree_diff, path)
_node.name += f""" ({path_added_to_suffix[path]})"""
_node.name += f""" ({path_move_to_suffix[path]})"""

# Handle tree attribute difference
if dict_attr_diff:
tree_diff = construct.add_dict_to_tree_by_path(tree_diff, dict_attr_diff)
for path in sorted(dict_attr_diff, reverse=True):
if path_attr_diff:
tree_diff = construct.add_dict_to_tree_by_path(tree_diff, path_attr_diff)
for path in sorted(path_attr_diff, reverse=True):
_node = search.find_full_path(tree_diff, path)
_node.name += " (~)"
return tree_diff

0 comments on commit da6793a

Please sign in to comment.