diff --git a/CHANGELOG.md b/CHANGELOG.md index b5f5ad4c..a990feb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,35 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.17.0] - 2024-04-04 +### Added +- Misc: Group tests for benchmark timings to compare the timings by multiplier more effectively. +### Changed +- Tree Constructor: `add_dict_to_tree_by_name` and `add_dataframe_to_tree_by_name` modify tree in-place instead +of returning new tree, and do not accept `join_type` as argument as pandas dataframe operation is phased out. +If there are clashing attributes, only those that have values will be replaced. +**This might not be backwards-compatible!** +- Tree Constructor: `dataframe_to_tree` no longer relies on `add_dataframe_to_tree_by_path` as it performs +assertion checks twice. This leads to 5% improvement in timings for a tree with 10000 nodes, averaged across 10 runs. +- Misc: Abstract out assertion checks for empty dataframe and duplicate attribute. +- Misc: Abstract out logic for checking null and filtering attributes. +- Misc: Optimization in dictionary and dataframe operations. +### Fixed +- Tree Constructor: `dict_to_tree` no longer uses dataframe operations, leading to 33% improvement in timings for +a tree with 10000 nodes, averaged across 10 runs. The resulting data type of node follows the dictionary exactly, +compared to the previous dataframe operations that may change the dtypes for certain columns. +**This might not be backwards-compatible!** +- Tree Constructor: `dataframe_to_tree_by_relation` fix root node detection logic, ignore existing name column, +ignore non-attribute columns, ignore null attribute columns. +- Tree Constructor: `add_dataframe_to_tree_by_path` ignore existing name column, ignore non-attribute columns, +ignore null attribute columns. 
+- Tree Constructor: `add_dataframe_to_tree_by_name` ignore existing name column, ignore non-attribute columns, +ignore null attribute columns. +- Tree Constructor: `dataframe_to_tree` ignore existing name column, ignore non-attribute columns, +ignore null attribute columns. +- DAG Constructor: `dataframe_to_dag` ignore existing name column, ignore non-attribute columns, +ignore null attribute columns. + ## [0.16.4] - 2024-03-14 ### Fixed - [#216] Tree Exporter: Fix nan checker when printing trees. @@ -511,7 +540,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Utility Iterator: Tree traversal methods. - Workflow To Do App: Tree use case with to-do list implementation. -[Unreleased]: https://github.com/kayjan/bigtree/compare/0.16.4...HEAD +[Unreleased]: https://github.com/kayjan/bigtree/compare/0.17.0...HEAD +[0.17.0]: https://github.com/kayjan/bigtree/compare/0.16.4...0.17.0 [0.16.4]: https://github.com/kayjan/bigtree/compare/0.16.3...0.16.4 [0.16.3]: https://github.com/kayjan/bigtree/compare/0.16.2...0.16.3 [0.16.2]: https://github.com/kayjan/bigtree/compare/0.16.1...0.16.2 diff --git a/assets/docs/tree_construct.png b/assets/docs/tree_construct.png index 7be70d0d..6194fece 100644 Binary files a/assets/docs/tree_construct.png and b/assets/docs/tree_construct.png differ diff --git a/bigtree/__init__.py b/bigtree/__init__.py index 9960d6bf..e0d3b4fb 100644 --- a/bigtree/__init__.py +++ b/bigtree/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.16.4" +__version__ = "0.17.0" from bigtree.binarytree.construct import list_to_binarytree from bigtree.dag.construct import dataframe_to_dag, dict_to_dag, list_to_dag diff --git a/bigtree/binarytree/construct.py b/bigtree/binarytree/construct.py index 8ba4cb11..211d48ee 100644 --- a/bigtree/binarytree/construct.py +++ b/bigtree/binarytree/construct.py @@ -4,6 +4,8 @@ __all__ = ["list_to_binarytree"] +from bigtree.utils.assertions import assert_length_not_empty + def list_to_binarytree( 
heapq_list: List[int], node_type: Type[BinaryNode] = BinaryNode @@ -37,8 +39,7 @@ def list_to_binarytree( Returns: (BinaryNode) """ - if not len(heapq_list): - raise ValueError("Input list does not contain any data, check `heapq_list`") + assert_length_not_empty(heapq_list, "Input list", "heapq_list") root_node = node_type(heapq_list[0]) node_list = [root_node] diff --git a/bigtree/dag/construct.py b/bigtree/dag/construct.py index 7054610e..88c68b68 100644 --- a/bigtree/dag/construct.py +++ b/bigtree/dag/construct.py @@ -3,6 +3,14 @@ from typing import Any, Dict, List, Tuple, Type from bigtree.node.dagnode import DAGNode +from bigtree.utils.assertions import ( + assert_dataframe_no_duplicate_attribute, + assert_dataframe_not_empty, + assert_dictionary_not_empty, + assert_length_not_empty, + filter_attributes, + isnull, +) from bigtree.utils.exceptions import optional_dependencies_pandas try: @@ -35,8 +43,7 @@ def list_to_dag( Returns: (DAGNode) """ - if not len(relations): - raise ValueError("Input list does not contain any data, check `relations`") + assert_length_not_empty(relations, "Input list", "relations") relation_data = pd.DataFrame(relations, columns=["parent", "child"]) return dataframe_to_dag( @@ -44,6 +51,7 @@ def list_to_dag( ) +@optional_dependencies_pandas def dict_to_dag( relation_attrs: Dict[str, Any], parent_key: str = "parents", @@ -75,8 +83,7 @@ def dict_to_dag( Returns: (DAGNode) """ - if not len(relation_attrs): - raise ValueError("Dictionary does not contain any data, check `relation_attrs`") + assert_dictionary_not_empty(relation_attrs, "relation_attrs") # Convert dictionary to dataframe data = pd.DataFrame(relation_attrs).T.rename_axis("_tmp_child").reset_index() @@ -110,6 +117,8 @@ def dataframe_to_dag( - If columns are not specified, `child_col` takes first column, `parent_col` takes second column, and all other columns are `attribute_cols`. + Only attributes in `attribute_cols` with non-null values will be added to the tree. 
+ Examples: >>> import pandas as pd >>> from bigtree import dataframe_to_dag, dag_iterator @@ -141,12 +150,7 @@ def dataframe_to_dag( Returns: (DAGNode) """ - data = data.copy() - - if not len(data.columns): - raise ValueError("Data does not contain any columns, check `data`") - if not len(data): - raise ValueError("Data does not contain any rows, check `data`") + assert_dataframe_not_empty(data) if not child_col: child_col = data.columns[0] @@ -160,27 +164,12 @@ def dataframe_to_dag( attribute_cols = list(data.columns) attribute_cols.remove(child_col) attribute_cols.remove(parent_col) - elif any([col not in data.columns for col in attribute_cols]): - raise ValueError( - f"One or more attribute column(s) not in data, check `attribute_cols`: {attribute_cols}" - ) - data_check = data.copy()[[child_col, parent_col] + attribute_cols].drop_duplicates( - subset=[child_col] + attribute_cols - ) - _duplicate_check = ( - data_check[child_col] - .value_counts() - .to_frame("counts") - .rename_axis(child_col) - .reset_index() + data = data[[child_col, parent_col] + attribute_cols].copy() + + assert_dataframe_no_duplicate_attribute( + data, "child name", child_col, attribute_cols ) - _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1] - if len(_duplicate_check): - raise ValueError( - f"There exists duplicate child name with different attributes\n" - f"Check {_duplicate_check}" - ) if sum(data[child_col].isnull()): raise ValueError(f"Child name cannot be empty, check column: {child_col}") @@ -190,15 +179,14 @@ def dataframe_to_dag( for row in data.reset_index(drop=True).to_dict(orient="index").values(): child_name = row[child_col] parent_name = row[parent_col] - node_attrs = row.copy() - del node_attrs[child_col] - del node_attrs[parent_col] - node_attrs = {k: v for k, v in node_attrs.items() if not pd.isnull(v)} - child_node = node_dict.get(child_name, node_type(child_name)) + node_attrs = filter_attributes( + row, omit_keys=["name", child_col, parent_col], 
omit_null_values=True + ) + child_node = node_dict.get(child_name, node_type(child_name, **node_attrs)) child_node.set_attrs(node_attrs) node_dict[child_name] = child_node - if not pd.isnull(parent_name): + if not isnull(parent_name): parent_node = node_dict.get(parent_name, node_type(parent_name)) node_dict[parent_name] = parent_node child_node.parents = [parent_node] diff --git a/bigtree/tree/construct.py b/bigtree/tree/construct.py index bd657289..9c032244 100644 --- a/bigtree/tree/construct.py +++ b/bigtree/tree/construct.py @@ -2,11 +2,19 @@ import re from collections import OrderedDict, defaultdict -from typing import Any, Dict, Iterable, List, Optional, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type from bigtree.node.node import Node -from bigtree.tree.export import tree_to_dataframe from bigtree.tree.search import find_child_by_name, find_name +from bigtree.utils.assertions import ( + assert_dataframe_no_duplicate_attribute, + assert_dataframe_no_duplicate_children, + assert_dataframe_not_empty, + assert_dictionary_not_empty, + assert_length_not_empty, + filter_attributes, + isnull, +) from bigtree.utils.constants import NewickCharacter, NewickState from bigtree.utils.exceptions import ( DuplicatedNodeError, @@ -59,6 +67,8 @@ def add_path_to_tree( - For example: Path strings should be "a/b", "a/c", "a/b/d" etc., and should not start with another root node. + All attributes in `node_attrs` will be added to the tree, including attributes with null values. 
+ Examples: >>> from bigtree import add_path_to_tree, Node >>> root = Node("a") @@ -79,27 +89,26 @@ def add_path_to_tree( Returns: (Node) """ - if not len(path): - raise ValueError("Path is empty, check `path`") + assert_length_not_empty(path, "Path", "path") - tree_root = tree.root - tree_sep = tree_root.sep - node_type = tree_root.__class__ + root_node = tree.root + tree_sep = root_node.sep + node_type = root_node.__class__ branch = path.lstrip(sep).rstrip(sep).split(sep) - if branch[0] != tree_root.node_name: + if branch[0] != root_node.node_name: raise TreeError( - f"Path does not have same root node, expected {tree_root.node_name}, received {branch[0]}\n" + f"Path does not have same root node, expected {root_node.node_name}, received {branch[0]}\n" f"Check your input paths or verify that path separator `sep` is set correctly" ) # Grow tree - node = tree_root - parent_node = tree_root + node = root_node + parent_node = root_node for idx in range(1, len(branch)): node_name = branch[idx] node_path = tree_sep.join(branch[: idx + 1]) if not duplicate_name_allowed: - node = find_name(tree_root, node_name) + node = find_name(root_node, node_name) if node and not node.path_name.endswith(node_path): raise DuplicatedNodeError( f"Node {node_name} already exists, try setting `duplicate_name_allowed` to True " @@ -109,10 +118,9 @@ def add_path_to_tree( node = find_child_by_name(parent_node, node_name) if not node: if idx == len(branch) - 1: - node_name = node_attrs.pop("name", branch[idx]) node = node_type(node_name, **node_attrs) else: - node = node_type(branch[idx]) + node = node_type(node_name) node.parent = parent_node parent_node = node node.set_attrs(node_attrs) @@ -128,6 +136,8 @@ def add_dict_to_tree_by_path( """Add nodes and attributes to tree *in-place*, return root of tree. Adds to existing tree from nested dictionary, ``key``: path, ``value``: dict of attribute name and attribute value. 
+ All attributes in `path_attrs` will be added to the tree, including attributes with null values. + Path should contain ``Node`` name, separated by `sep`. - For example: Path string "a/b" refers to Node("b") with parent Node("a"). @@ -175,30 +185,27 @@ def add_dict_to_tree_by_path( Returns: (Node) """ - if not len(path_attrs): - raise ValueError("Dictionary does not contain any data, check `path_attrs`") + assert_dictionary_not_empty(path_attrs, "path_attrs") - tree_root = tree.root + root_node = tree.root - for k, v in path_attrs.items(): + for path, node_attrs in path_attrs.items(): add_path_to_tree( - tree_root, - k, + root_node, + path, sep=sep, duplicate_name_allowed=duplicate_name_allowed, - node_attrs=v, + node_attrs=node_attrs, ) - return tree_root + return root_node -@optional_dependencies_pandas -def add_dict_to_tree_by_name( - tree: Node, name_attrs: Dict[str, Dict[str, Any]], join_type: str = "left" -) -> Node: - """Add attributes to tree, return *new* root of tree. +def add_dict_to_tree_by_name(tree: Node, name_attrs: Dict[str, Dict[str, Any]]) -> Node: + """Add attributes to existing tree *in-place*. Adds to existing tree from nested dictionary, ``key``: name, ``value``: dict of attribute name and attribute value. - Function can return all existing tree nodes or only tree nodes that are in the input dictionary keys depending on join type. + All attributes in `name_attrs` will be added to the tree, including attributes with null values. + Input dictionary keys that are not existing node names will be ignored. Note that if multiple nodes have the same name, attributes will be added to all nodes sharing the same name. 
@@ -219,22 +226,23 @@ def add_dict_to_tree_by_name( tree (Node): existing tree name_attrs (Dict[str, Dict[str, Any]]): dictionary containing node name and attribute information, key: node name, value: dict of node attribute name and attribute value - join_type (str): join type with attribute, default of 'left' takes existing tree nodes, - if join_type is set to 'inner' it will only take tree nodes that are in `name_attrs` key and drop others Returns: (Node) """ - if join_type not in ["inner", "left"]: - raise ValueError("`join_type` must be one of 'inner' or 'left'") + from bigtree.tree.search import findall - if not len(name_attrs): - raise ValueError("Dictionary does not contain any data, check `name_attrs`") + assert_dictionary_not_empty(name_attrs, "name_attrs") - # Convert dictionary to dataframe - data = pd.DataFrame(name_attrs).T.rename_axis("NAME").reset_index() - data = data.replace({float("nan"): None}) - return add_dataframe_to_tree_by_name(tree, data=data, join_type=join_type) + attr_dict_names = set(name_attrs.keys()) + + for node in findall(tree, lambda _node: _node.node_name in attr_dict_names): + node_attrs = filter_attributes( + name_attrs[node.node_name], omit_keys=["name"], omit_null_values=False + ) + node.set_attrs(node_attrs) + + return tree def add_dataframe_to_tree_by_path( @@ -246,6 +254,9 @@ def add_dataframe_to_tree_by_path( duplicate_name_allowed: bool = True, ) -> Node: """Add nodes and attributes to tree *in-place*, return root of tree. + Adds to existing tree from pandas DataFrame. + + Only attributes in `attribute_cols` with non-null values will be added to the tree. `path_col` and `attribute_cols` specify columns for node path and attributes to add to existing tree. 
If columns are not specified, `path_col` takes first column and all other columns are `attribute_cols` @@ -303,12 +314,7 @@ def add_dataframe_to_tree_by_path( Returns: (Node) """ - data = data.copy() - - if not len(data.columns): - raise ValueError("Data does not contain any columns, check `data`") - if not len(data): - raise ValueError("Data does not contain any rows, check `data`") + assert_dataframe_not_empty(data) if not path_col: path_col = data.columns[0] @@ -316,50 +322,39 @@ def add_dataframe_to_tree_by_path( attribute_cols = list(data.columns) attribute_cols.remove(path_col) - tree_root = tree.root + data = data[[path_col] + attribute_cols].copy() data[path_col] = data[path_col].str.lstrip(sep).str.rstrip(sep) - data2 = data.copy()[[path_col] + attribute_cols].astype(str).drop_duplicates() - _duplicate_check = ( - data2[path_col] - .value_counts() - .to_frame("counts") - .rename_axis(path_col) - .reset_index() - ) - _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1] - if len(_duplicate_check): - raise ValueError( - f"There exists duplicate path with different attributes\nCheck {_duplicate_check}" - ) + assert_dataframe_no_duplicate_attribute(data, "path", path_col, attribute_cols) + root_node = tree.root for row in data.to_dict(orient="index").values(): - node_attrs = row.copy() - del node_attrs[path_col] - node_attrs = {k: v for k, v in node_attrs.items() if v is not None} + node_attrs = filter_attributes( + row, omit_keys=["name", path_col], omit_null_values=True + ) add_path_to_tree( - tree_root, + root_node, row[path_col], sep=sep, duplicate_name_allowed=duplicate_name_allowed, node_attrs=node_attrs, ) - return tree_root + return root_node -@optional_dependencies_pandas def add_dataframe_to_tree_by_name( tree: Node, data: pd.DataFrame, name_col: str = "", attribute_cols: List[str] = [], - join_type: str = "left", ) -> Node: - """Add attributes to tree, return *new* root of tree. + """Add attributes to existing tree *in-place*. 
+ Adds to existing tree from pandas DataFrame. + + Only attributes in `attribute_cols` with non-null values will be added to the tree. `name_col` and `attribute_cols` specify columns for node name and attributes to add to existing tree. If columns are not specified, the first column will be taken as name column and all other columns as attributes. - Function can return all existing tree nodes or only tree nodes that are in the input data node names. Input data node names that are not existing node names will be ignored. Note that if multiple nodes have the same name, attributes will be added to all nodes sharing same name. @@ -386,21 +381,11 @@ def add_dataframe_to_tree_by_name( if not set, it will take the first column of data attribute_cols (List[str]): column(s) of data containing node attribute information, if not set, it will take all columns of data except `path_col` - join_type (str): join type with attribute, default of 'left' takes existing tree nodes, - if join_type is set to 'inner' it will only take tree nodes with attributes and drop the other nodes Returns: (Node) """ - data = data.copy() - - if join_type not in ["inner", "left"]: - raise ValueError("`join_type` must be one of 'inner' or 'left'") - - if not len(data.columns): - raise ValueError("Data does not contain any columns, check `data`") - if not len(data): - raise ValueError("Data does not contain any rows, check `data`") + assert_dataframe_not_empty(data) if not name_col: name_col = data.columns[0] @@ -408,39 +393,20 @@ def add_dataframe_to_tree_by_name( attribute_cols = list(data.columns) attribute_cols.remove(name_col) - # Attribute data - path_col = "PATH" - data2 = data.copy()[[name_col] + attribute_cols].astype(str).drop_duplicates() - _duplicate_check = ( - data2[name_col] - .value_counts() - .to_frame("counts") - .rename_axis(name_col) - .reset_index() - ) - _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1] - if len(_duplicate_check): - raise ValueError( - f"There 
exists duplicate name with different attributes\nCheck {_duplicate_check}" - ) + assert_dataframe_no_duplicate_attribute(data, "name", name_col, attribute_cols) - # Tree data - tree_root = tree.root - sep = tree_root.sep - node_type = tree_root.__class__ - data_tree = tree_to_dataframe( - tree_root, name_col=name_col, path_col=path_col, all_attrs=True + # Get attribute dict, remove null attributes + name_attrs = ( + data.drop_duplicates(name_col) + .set_index(name_col)[attribute_cols] + .to_dict(orient="index") ) - common_cols = list(set(data_tree.columns).intersection(attribute_cols)) - data_tree = data_tree.drop(columns=common_cols) - - # Attribute data - data_tree_attrs = pd.merge(data_tree, data, on=name_col, how=join_type) - data_tree_attrs = data_tree_attrs.drop(columns=name_col) + name_attrs = { + k1: {k2: v2 for k2, v2 in v1.items() if not isnull(v2)} + for k1, v1 in name_attrs.items() + } - return dataframe_to_tree( - data_tree_attrs, path_col=path_col, sep=sep, node_type=node_type - ) + return add_dict_to_tree_by_name(tree, name_attrs) def str_to_tree( @@ -474,14 +440,13 @@ def str_to_tree( (Node) """ tree_string = tree_string.strip("\n") - if not len(tree_string): - raise ValueError("Tree string does not contain any data, check `tree_string`") + assert_length_not_empty(tree_string, "Tree string", "tree_string") tree_list = tree_string.split("\n") - tree_root = node_type(tree_list[0]) + root_node = node_type(tree_list[0]) # Infer prefix length prefix_length = None - cur_parent = tree_root + cur_parent = root_node for node_str in tree_list[1:]: if len(tree_prefix_list): node_name = re.split("|".join(tree_prefix_list), node_str)[-1].lstrip() @@ -509,11 +474,11 @@ def str_to_tree( child_node.parent = cur_parent cur_parent = child_node - return tree_root + return root_node def list_to_tree( - paths: Iterable[str], + paths: List[str], sep: str = "/", duplicate_name_allowed: bool = True, node_type: Type[Node] = Node, @@ -547,7 +512,7 @@ def list_to_tree( └── f 
Args: - paths (Iterable[str]): list containing path strings + paths (List[str]): list containing path strings sep (str): path separator for input `paths` and created tree, defaults to `/` duplicate_name_allowed (bool): indicator if nodes with duplicate ``Node`` name is allowed, defaults to True node_type (Type[Node]): node type of tree to be created, defaults to ``Node`` @@ -555,8 +520,7 @@ def list_to_tree( Returns: (Node) """ - if not paths: - raise ValueError("Path list does not contain any data, check `paths`") + assert_length_not_empty(paths, "Path list", "paths") # Remove duplicates paths = list(OrderedDict.fromkeys(paths)) @@ -570,18 +534,19 @@ def list_to_tree( add_path_to_tree( root_node, path, sep=sep, duplicate_name_allowed=duplicate_name_allowed ) - root_node.sep = sep return root_node @optional_dependencies_pandas def list_to_tree_by_relation( - relations: Iterable[Tuple[str, str]], + relations: List[Tuple[str, str]], allow_duplicates: bool = False, node_type: Type[Node] = Node, ) -> Node: """Construct tree from list of tuple containing parent-child names. + Root node is inferred when parent is empty, or when name appears as parent but not as child. + Since tree is created from parent-child names, only names of leaf nodes may be repeated. Error will be thrown if names of intermediate nodes are repeated as there will be confusion. This error can be ignored by setting `allow_duplicates` to be True. 
@@ -601,7 +566,7 @@ def list_to_tree_by_relation( └── f Args: - relations (Iterable[Tuple[str, str]]): list containing tuple containing parent-child names + relations (List[Tuple[str, str]]): list containing tuple containing parent-child names allow_duplicates (bool): allow duplicate intermediate nodes such that child node will be tagged to multiple parent nodes, defaults to False node_type (Type[Node]): node type of tree to be created, defaults to ``Node`` @@ -609,8 +574,7 @@ def list_to_tree_by_relation( Returns: (Node) """ - if not relations: - raise ValueError("Path list does not contain any data, check `relations`") + assert_length_not_empty(relations, "Path list", "relations") relation_data = pd.DataFrame(relations, columns=["parent", "child"]) return dataframe_to_tree_by_relation( @@ -622,7 +586,6 @@ def list_to_tree_by_relation( ) -@optional_dependencies_pandas def dict_to_tree( path_attrs: Dict[str, Any], sep: str = "/", @@ -644,6 +607,8 @@ def dict_to_tree( - For example: Path strings should be "a/b", "a/c", "a/b/d" etc. and should not start with another root node. + All attributes in `path_attrs` will be added to the tree, including attributes with null values. 
+ Examples: >>> from bigtree import dict_to_tree >>> path_dict = { @@ -677,19 +642,39 @@ def dict_to_tree( Returns: (Node) """ - if not len(path_attrs): - raise ValueError("Dictionary does not contain any data, check `path_attrs`") - - # Convert dictionary to dataframe - data = pd.DataFrame(path_attrs).T.rename_axis("PATH").reset_index() - data = data.replace({float("nan"): None}) - return dataframe_to_tree( - data, + assert_dictionary_not_empty(path_attrs, "path_attrs") + + # Initial tree + root_name = list(path_attrs.keys())[0].lstrip(sep).rstrip(sep).split(sep)[0] + root_node_attrs = dict( + path_attrs.get(root_name, {}) + or path_attrs.get(sep + root_name, {}) + or path_attrs.get(root_name + sep, {}) + or path_attrs.get(sep + root_name + sep, {}) + ) + root_node_attrs = filter_attributes( + root_node_attrs, omit_keys=["name"], omit_null_values=False + ) + root_node = node_type( + name=root_name, sep=sep, - duplicate_name_allowed=duplicate_name_allowed, - node_type=node_type, + **root_node_attrs, ) + # Add each path and its attributes to the tree + for node_path, node_attrs in path_attrs.items(): + node_attrs = filter_attributes( + node_attrs, omit_keys=["name"], omit_null_values=False + ) + add_path_to_tree( + root_node, + node_path, + sep=sep, + duplicate_name_allowed=duplicate_name_allowed, + node_attrs=node_attrs, + ) + return root_node + def nested_dict_to_tree( node_attrs: Dict[str, Any], @@ -739,8 +724,7 @@ def nested_dict_to_tree( Returns: (Node) """ - if not node_attrs: - raise ValueError("Dictionary does not contain any data, check `node_attrs`") + assert_dictionary_not_empty(node_attrs, "node_attrs") def _recursive_add_child( child_dict: Dict[str, Any], parent_node: Optional[Node] = None @@ -783,6 +767,8 @@ def dataframe_to_tree( `path_col` and `attribute_cols` specify columns for node path and attributes to construct tree. If columns are not specified, `path_col` takes first column and all other columns are `attribute_cols`. 
+ Only attributes in `attribute_cols` with non-null values will be added to the tree. + Path in path column can start from root node `name`, or start with `sep`. - For example: Path string can be "/a/b" or "a/b", if sep is "/". @@ -834,12 +820,7 @@ def dataframe_to_tree( Returns: (Node) """ - data = data.copy() - - if not len(data.columns): - raise ValueError("Data does not contain any columns, check `data`") - if not len(data): - raise ValueError("Data does not contain any rows, check `data`") + assert_dataframe_not_empty(data) if not path_col: path_col = data.columns[0] @@ -847,20 +828,9 @@ def dataframe_to_tree( attribute_cols = list(data.columns) attribute_cols.remove(path_col) + data = data[[path_col] + attribute_cols].copy() data[path_col] = data[path_col].str.lstrip(sep).str.rstrip(sep) - data2 = data.copy()[[path_col] + attribute_cols].astype(str).drop_duplicates() - _duplicate_check = ( - data2[path_col] - .value_counts() - .to_frame("counts") - .rename_axis(path_col) - .reset_index() - ) - _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1] - if len(_duplicate_check): - raise ValueError( - f"There exists duplicate path with different attributes\nCheck {_duplicate_check}" - ) + assert_dataframe_no_duplicate_attribute(data, "path", path_col, attribute_cols) root_name = data[path_col].values[0].split(sep)[0] root_node_data = data[data[path_col] == root_name] @@ -868,18 +838,24 @@ def dataframe_to_tree( root_node_kwargs = list( root_node_data[attribute_cols].to_dict(orient="index").values() )[0] - root_name = root_node_kwargs.pop("name", root_name) + root_node_kwargs = filter_attributes( + root_node_kwargs, omit_keys=["name", path_col], omit_null_values=True + ) root_node = node_type(root_name, **root_node_kwargs) else: root_node = node_type(root_name) - add_dataframe_to_tree_by_path( - root_node, - data, - path_col=path_col, - attribute_cols=attribute_cols, - sep=sep, - duplicate_name_allowed=duplicate_name_allowed, - ) + + for row in 
data.to_dict(orient="index").values(): + node_attrs = filter_attributes( + row, omit_keys=["name", path_col], omit_null_values=True + ) + add_path_to_tree( + root_node, + row[path_col], + sep=sep, + duplicate_name_allowed=duplicate_name_allowed, + node_attrs=node_attrs, + ) root_node.sep = sep return root_node @@ -894,6 +870,8 @@ def dataframe_to_tree_by_relation( ) -> Node: """Construct tree from pandas DataFrame using parent and child names, return root of tree. + Root node is inferred when parent name is empty, or when name appears in parent column but not in child column. + Since tree is created from parent-child names, only names of leaf nodes may be repeated. Error will be thrown if names of intermediate nodes are repeated as there will be confusion. This error can be ignored by setting `allow_duplicates` to be True. @@ -903,6 +881,8 @@ def dataframe_to_tree_by_relation( If columns are not specified, `child_col` takes first column, `parent_col` takes second column, and all other columns are `attribute_cols`. + Only attributes in `attribute_cols` with non-null values will be added to the tree. 
+ Examples: >>> import pandas as pd >>> from bigtree import dataframe_to_tree_by_relation @@ -944,12 +924,7 @@ def dataframe_to_tree_by_relation( Returns: (Node) """ - data = data.copy() - - if not len(data.columns): - raise ValueError("Data does not contain any columns, check `data`") - if not len(data): - raise ValueError("Data does not contain any rows, check `data`") + assert_dataframe_not_empty(data) if not child_col: child_col = data.columns[0] @@ -960,44 +935,18 @@ def dataframe_to_tree_by_relation( attribute_cols.remove(child_col) attribute_cols.remove(parent_col) - data_check = data.copy()[[child_col, parent_col]].drop_duplicates() - # Filter for child nodes that are parent of other nodes + data = data[[child_col, parent_col] + attribute_cols].copy() if not allow_duplicates: - data_check = data_check[data_check[child_col].isin(data_check[parent_col])] - _duplicate_check = ( - data_check[child_col] - .value_counts() - .to_frame("counts") - .rename_axis(child_col) - .reset_index() - ) - _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1] - if len(_duplicate_check): - raise ValueError( - f"There exists duplicate child with different parent where the child is also a parent node.\n" - f"Duplicated node names should not happen, but can only exist in leaf nodes to avoid confusion.\n" - f"Check {_duplicate_check}" - ) + assert_dataframe_no_duplicate_children(data, child_col, parent_col) - # If parent-child contains None -> root - root_row = data[data[parent_col].isnull()] - root_names = list(root_row[child_col]) - if not len(root_names): - root_names = list(set(data[parent_col]) - set(data[child_col])) + # Infer root node + root_names = set(data[data[parent_col].isnull()][child_col]) + root_names.update(set(data[parent_col]) - set(data[child_col]) - {None}) if len(root_names) != 1: raise ValueError( - f"Unable to determine root node\nPossible root nodes: {root_names}" + f"Unable to determine root node\nPossible root nodes: {sorted(root_names)}" ) - 
root_name = root_names[0] - root_node_data = data[data[child_col] == root_name] - if len(root_node_data): - root_node_kwargs = list( - root_node_data[attribute_cols].to_dict(orient="index").values() - )[0] - root_name = root_node_kwargs.pop("name", root_name) - root_node = node_type(root_name, **root_node_kwargs) - else: - root_node = node_type(root_name) + root_name = list(root_names)[0] def _retrieve_attr(_row: Dict[str, Any]) -> Dict[str, Any]: """Retrieve node attributes from dictionary, remove parent and child column from dictionary. @@ -1008,12 +957,11 @@ def _retrieve_attr(_row: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]) """ - node_attrs = _row.copy() - node_attrs["name"] = node_attrs[child_col] - del node_attrs[child_col] - del node_attrs[parent_col] - _node_attrs = {k: v for k, v in node_attrs.items() if v is not None} - return _node_attrs + node_attrs = filter_attributes( + _row, omit_keys=[child_col, parent_col], omit_null_values=True + ) + node_attrs["name"] = _row[child_col] + return node_attrs def _recursive_add_child(parent_node: Node) -> None: """Recursive add child to tree, given current node. 
@@ -1029,9 +977,12 @@ def _recursive_add_child(parent_node: Node) -> None: _recursive_add_child(child_node) # Create root node attributes + root_row = data[data[child_col] == root_name] if len(root_row): row = list(root_row.to_dict(orient="index").values())[0] - root_node.set_attrs(_retrieve_attr(row)) + root_node = node_type(**_retrieve_attr(row)) + else: + root_node = node_type(root_name) _recursive_add_child(root_node) return root_node @@ -1095,8 +1046,7 @@ def newick_to_tree( Returns: (Node) """ - if not len(tree_string): - raise ValueError("Tree string does not contain any data, check `tree_string`") + assert_length_not_empty(tree_string, "Tree string", "tree_string") # Store results (for tracking) depth_nodes: Dict[int, List[Node]] = defaultdict(list) diff --git a/bigtree/tree/export.py b/bigtree/tree/export.py index 21f78c7a..c883bd4d 100644 --- a/bigtree/tree/export.py +++ b/bigtree/tree/export.py @@ -1,15 +1,14 @@ from __future__ import annotations import collections -import math from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar, Union -from urllib.request import urlopen from bigtree.node.node import Node from bigtree.utils.assertions import ( assert_key_in_dict, assert_str_in_list, assert_style_in_dict, + isnull, ) from bigtree.utils.constants import ExportConstants, MermaidConstants, NewickCharacter from bigtree.utils.exceptions import ( @@ -51,20 +50,6 @@ T = TypeVar("T", bound=Node) -def _isnull(value: Any) -> bool: - """Check if value is null - - Args: - value (Any): value to check - - Returns: - (bool) - """ - if not value or (isinstance(value, float) and math.isnan(value)): - return True - return False - - def print_tree( tree: T, node_name_or_path: str = "", @@ -210,7 +195,7 @@ def print_tree( attr_str_list = [ f"{attr_name}={_node.get_attr(attr_name)}" for attr_name in attr_list - if not _isnull(_node.get_attr(attr_name)) + if not isnull(_node.get_attr(attr_name)) ] else: attr_str_list = [ @@ -1326,6 +1311,8 @@ def 
tree_to_pillow( """ # Initialize font if not font_family: + from urllib.request import urlopen + dejavusans_url = "https://github.com/kayjan/bigtree/raw/master/assets/DejaVuSans.ttf?raw=true" font_family = urlopen(dejavusans_url) try: diff --git a/bigtree/utils/assertions.py b/bigtree/utils/assertions.py index 8b0a6518..8a430d45 100644 --- a/bigtree/utils/assertions.py +++ b/bigtree/utils/assertions.py @@ -1,4 +1,12 @@ -from typing import Any, Dict, List +from __future__ import annotations + +import math +from typing import Any, Dict, List, Union + +try: + import pandas as pd +except ImportError: # pragma: no cover + pd = None def assert_style_in_dict( @@ -51,3 +59,138 @@ def assert_key_in_dict( raise ValueError( f"Invalid input, check `{parameter_name}` should be one of {accepted_parameters.keys()}" ) + + +def assert_length_not_empty( + data: Union[str, List[Any]], argument_name: str, argument: str +) -> None: + """Raise ValueError if data (str, list, or iterable) does not have length + + Args: + data (str/List[Any]): data to check + argument_name: argument name for data, for error message + argument (str): argument for data, for error message + """ + if not len(data): + raise ValueError( + f"{argument_name} does not contain any data, check `{argument}`" + ) + + +def assert_dictionary_not_empty(data_dict: Dict[Any, Any], argument: str) -> None: + """Raise ValueError is dictionary is empty + + Args: + data_dict (Dict[Any, Any]): dictionary to check + argument (str): argument for dictionary, for error message + """ + if not len(data_dict): + raise ValueError(f"Dictionary does not contain any data, check `{argument}`") + + +def assert_dataframe_not_empty(data: pd.DataFrame) -> None: + """Raise ValueError is dataframe is empty + + Args: + data (pd.DataFrame): dataframe to check + """ + if not len(data.columns): + raise ValueError("Data does not contain any columns, check `data`") + if not len(data): + raise ValueError("Data does not contain any rows, check `data`") + 
+ +def assert_dataframe_no_duplicate_attribute( + data: pd.DataFrame, id_type: str, id_col: str, attribute_cols: List[str] +) -> None: + """Raise ValueError is dataframe contains different attributes for same path + + Args: + data (pd.DataFrame): dataframe to check + id_type (str): type of uniqueness to check for, for error message + id_col (str): column of data that is unique, can be name or path + attribute_cols (List[str]): columns of data containing node attribute information, + """ + data_check = data[[id_col] + attribute_cols].astype(str).drop_duplicates() + duplicate_check = ( + data_check[id_col] + .value_counts() + .to_frame("counts") + .rename_axis(id_col) + .reset_index() + ) + duplicate_check = duplicate_check[duplicate_check["counts"] > 1] + if len(duplicate_check): + raise ValueError( + f"There exists duplicate {id_type} with different attributes\nCheck {duplicate_check}" + ) + + +def assert_dataframe_no_duplicate_children( + data: pd.DataFrame, + child_col: str, + parent_col: str, +) -> None: + """Raise ValueError is dataframe contains different duplicated parent tagged to different grandparents + + Args: + data (pd.DataFrame): dataframe to check + child_col (str): column of data containing child name information + parent_col (str): column of data containing parent name information + """ + # Filter for child nodes that are parent of other nodes + data_check = data[[child_col, parent_col]].drop_duplicates() + data_check = data_check[data_check[child_col].isin(data_check[parent_col])] + + duplicate_check = ( + data_check[child_col] + .value_counts() + .to_frame("counts") + .rename_axis(child_col) + .reset_index() + ) + duplicate_check = duplicate_check[duplicate_check["counts"] > 1] + if len(duplicate_check): + raise ValueError( + f"There exists duplicate child with different parent where the child is also a parent node.\n" + f"Duplicated node names should not happen, but can only exist in leaf nodes to avoid confusion.\n" + f"Check {duplicate_check}" 
+ ) + + +def isnull(value: Any) -> bool: + """Check if value is null + + Args: + value (Any): value to check + + Returns: + (bool) + """ + if not value or (isinstance(value, float) and math.isnan(value)): + return True + return False + + +def filter_attributes( + node_attributes: Dict[str, Any], + omit_keys: List[str], + omit_null_values: bool, +) -> Dict[str, Any]: + """Filter node attributes to remove certain keys and/or values + + Args: + node_attributes (Dict[str, Any]): node attribute dictionary + omit_keys (List[str]): list of keys to omit + omit_null_values (bool): indicator whether to omit values that are null + + Returns: + (Dict[str, Any]) + """ + if omit_null_values: + return { + k: v + for k, v in node_attributes.items() + if not isnull(v) and k not in omit_keys + } + return {k: v for k, v in node_attributes.items() if k not in omit_keys} diff --git a/docs/contributing.md b/docs/contributing.md index 7a6f563f..aa533d37 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -84,6 +84,7 @@ $ git checkout -b feat/add-this ``` During pre-commit checks, this project enforces [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/) when writing commit messages, and checks and formats code using `black`, `flake8`, `isort`, and `mypy`. +- The regex for conventional commits is as such `(?s)(build|ci|docs|feat|fix|perf|refactor|style|test|chore|revert|bump)(\(\S+\))?!?:( [^\n\r]+)((\n\n.*)|(\s*))?$`. For testing, this project uses `pytest` and `coverage` package for testing of codes, and `docstr-coverage` and `doctest` package for testing of docstrings. 
diff --git a/tests/dag/test_construct.py b/tests/dag/test_construct.py index 3ab64ebf..9abc852a 100644 --- a/tests/dag/test_construct.py +++ b/tests/dag/test_construct.py @@ -185,13 +185,8 @@ def test_dataframe_to_dag_attribute_cols(self): def test_dataframe_to_dag_attribute_cols_error(self): attribute_cols = ["age2"] - with pytest.raises(ValueError) as exc_info: + with pytest.raises(KeyError): dataframe_to_dag(self.data, attribute_cols=attribute_cols) - assert str( - exc_info.value - ) == Constants.ERROR_DAG_DATAFRAME_ATTRIBUTE_COL.format( - attribute_cols=attribute_cols - ) @staticmethod def test_dataframe_to_dag_empty_child_error(): @@ -218,6 +213,77 @@ def test_dataframe_to_dag_empty_child_error(): child_col=child_col ) + @staticmethod + def test_dataframe_to_dag_ignore_name_col(): + data = pd.DataFrame( + [ + ["a", None, 90, "a1"], + ["b", None, 65, "b1"], + ["c", "a", 60, "c1"], + ["c", "b", 60, "c1"], + ["d", "a", 40, "d1"], + ["d", "c", 40, "d1"], + ["e", "d", 35, "e1"], + ["f", "c", 38, "f1"], + ["f", "d", 38, "f1"], + ["g", "c", 10, "g1"], + ["h", "g", 6, "h1"], + ], + columns=["child", "parent", "age", "name"], + ) + dag = dataframe_to_dag(data) + assert_dag_structure_root(dag) + assert_dag_structure_root_attr(dag) + + @staticmethod + def test_dataframe_to_dag_ignore_non_attribute_cols(): + data = pd.DataFrame( + [ + ["a", None, 90, "a1"], + ["b", None, 65, "b1"], + ["c", "a", 60, "c1"], + ["c", "b", 60, "c1"], + ["d", "a", 40, "d1"], + ["d", "c", 40, "d1"], + ["e", "d", 35, "e1"], + ["f", "c", 38, "f1"], + ["f", "d", 38, "f1"], + ["g", "c", 10, "g1"], + ["h", "g", 6, "h1"], + ], + columns=["child", "parent", "age", "name2"], + ) + dag = dataframe_to_dag( + data, child_col="child", parent_col="parent", attribute_cols=["age"] + ) + assert not dag.get_attr("name2") + assert_dag_structure_root(dag) + assert_dag_structure_root_attr(dag) + + @staticmethod + def test_dataframe_to_dag_node_empty_attribute(): + data = pd.DataFrame( + [ + ["a", None, 90], + ["b", 
None, 65], + ["c", "a", 60], + ["c", "b", 60], + ["d", "a", 40], + ["d", "c", 40], + ["e", "d", 35], + ["f", "c", 38], + ["f", "d", 38], + ["g", "c", None], + ["h", "g", 6], + ], + columns=["child", "parent", "age"], + ) + dag = dataframe_to_dag(data) + assert not dag.get_attr("age") + dag.set_attrs({"age": 10}) + assert_dag_structure_root(dag) + assert_dag_structure_root_attr(dag) + @staticmethod def test_dataframe_to_dag_duplicate_data(): data = pd.DataFrame( diff --git a/tests/node/test_node_benchmark.py b/tests/node/test_node_benchmark.py index ebe98cec..8e333f70 100644 --- a/tests/node/test_node_benchmark.py +++ b/tests/node/test_node_benchmark.py @@ -1,6 +1,8 @@ import sys from unittest.mock import patch +import pytest + from bigtree.node.node import Node sys.setrecursionlimit(2000) @@ -24,37 +26,45 @@ def run_construct_node(depth: int, width: int = 1, parent_node: Node = None) -> return new_node +@pytest.mark.benchmark(group="width_1_depth_10") def test_node_benchmark_width_1_depth_10(benchmark): benchmark.pedantic(run_construct_node, (10, 1), iterations=10, rounds=2) +@pytest.mark.benchmark(group="width_1_depth_100") def test_node_benchmark_width_1_depth_100(benchmark): benchmark.pedantic(run_construct_node, (100, 1), iterations=10, rounds=2) +@pytest.mark.benchmark(group="width_1_depth_1000") def test_node_benchmark_width_1_depth_1000(benchmark): benchmark.pedantic(run_construct_node, (1000, 1), iterations=10, rounds=2) +@pytest.mark.benchmark(group="width_2_depth_10") def test_node_benchmark_width_2_depth_10(benchmark): benchmark.pedantic(run_construct_node, (10, 2), iterations=10, rounds=2) +@pytest.mark.benchmark(group="width_1_depth_10") @patch("bigtree.node.basenode.ASSERTIONS", "") def test_node_benchmark_width_1_depth_10_no_assertions(benchmark): benchmark.pedantic(run_construct_node, (10, 1), iterations=10, rounds=2) +@pytest.mark.benchmark(group="width_1_depth_100") @patch("bigtree.node.basenode.ASSERTIONS", "") def 
test_node_benchmark_width_1_depth_100_no_assertions(benchmark): benchmark.pedantic(run_construct_node, (100, 1), iterations=10, rounds=2) +@pytest.mark.benchmark(group="width_1_depth_1000") @patch("bigtree.node.basenode.ASSERTIONS", "") def test_node_benchmark_width_1_depth_1000_no_assertions(benchmark): benchmark.pedantic(run_construct_node, (1000, 1), iterations=10, rounds=2) +@pytest.mark.benchmark(group="width_2_depth_10") @patch("bigtree.node.basenode.ASSERTIONS", "") def test_node_benchmark_width_2_depth_10_no_assertions(benchmark): benchmark.pedantic(run_construct_node, (10, 2), iterations=10, rounds=2) diff --git a/tests/test_constants.py b/tests/test_constants.py index 02b3307e..0882da83 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -32,7 +32,6 @@ class Constants: ERROR_DAG_DATAFRAME_CHILD_COL = ( "Child column not in data, check `child_col`: {child_col}" ) - ERROR_DAG_DATAFRAME_ATTRIBUTE_COL = "One or more attribute column(s) not in data, check `attribute_cols`: {attribute_cols}" ERROR_DAG_DATAFRAME_DUPLICATE_PARENT = ( "There exists duplicate child name with different attributes\nCheck " @@ -109,7 +108,7 @@ class Constants: "child_key {child_key} should be List type, received {child}" ) ERROR_NODE_LIST_EMPTY = "Path list does not contain any data, check `{parameter}`" - ERROR_NODE_PATH_EMPTY = "Path is empty, check `path`" + ERROR_NODE_PATH_EMPTY = "Path does not contain any data, check `path`" ERROR_NODE_STRING_EMPTY = ( "Tree string does not contain any data, check `tree_string`" ) diff --git a/tests/tree/test_construct.py b/tests/tree/test_construct.py index c437404f..d3f2e478 100644 --- a/tests/tree/test_construct.py +++ b/tests/tree/test_construct.py @@ -458,10 +458,10 @@ def tearDown(self): self.name_dict = None def test_add_dict_to_tree_by_name(self): - root = add_dict_to_tree_by_name(self.root, self.name_dict) - assert_tree_structure_basenode_root(root) - assert_tree_structure_basenode_root_attr(root) - 
assert_tree_structure_node_root(root) + add_dict_to_tree_by_name(self.root, self.name_dict) + assert_tree_structure_basenode_root(self.root) + assert_tree_structure_basenode_root_attr(self.root) + assert_tree_structure_node_root(self.root) def test_add_dict_to_tree_by_name_different_dtype(self): name_dict = { @@ -474,11 +474,11 @@ def test_add_dict_to_tree_by_name_different_dtype(self): "g": {"random": -1}, "h": {"random": [-1]}, } - root = add_dict_to_tree_by_name(self.root, name_dict) + add_dict_to_tree_by_name(self.root, name_dict) nodes = ["a", "b", "c", "d", "e", "f", "g", "h"] expected_list = [[1], [1, 2], [1, None], [None], None, 0, -1, [-1]] for node_name, expected in zip(nodes, expected_list): - actual = find_name(root, node_name).get_attr("random") + actual = find_name(self.root, node_name).get_attr("random") assert actual == expected, f"Expected\n{expected}\nReceived\n{actual}" def test_add_dict_to_tree_by_name_empty_error(self): @@ -488,32 +488,6 @@ def test_add_dict_to_tree_by_name_empty_error(self): parameter="name_attrs" ) - def test_add_dict_to_tree_by_name_inner_join_tree(self): - dummy = Node("dummy") - dummy.parent = self.b - root = add_dict_to_tree_by_name(self.root, self.name_dict, join_type="inner") - assert_tree_structure_basenode_root(root) - assert_tree_structure_basenode_root_attr(root) - assert_tree_structure_node_root(root) - - def test_add_dict_to_tree_by_name_inner_join_dict(self): - self.name_dict["dummy"] = {"age": 100} - root = add_dict_to_tree_by_name(self.root, self.name_dict, join_type="inner") - assert_tree_structure_basenode_root(root) - assert_tree_structure_basenode_root_attr(root) - assert_tree_structure_node_root(root) - - def test_add_dict_to_tree_by_name_left_join(self): - root = add_dict_to_tree_by_name(self.root, self.name_dict, join_type="left") - assert_tree_structure_basenode_root(root) - assert_tree_structure_basenode_root_attr(root) - assert_tree_structure_node_root(root) - - def 
test_add_dict_to_tree_by_name_invalid_join_error(self): - with pytest.raises(ValueError) as exc_info: - add_dict_to_tree_by_name(self.root, self.name_dict, join_type="something") - assert str(exc_info.value) == Constants.ERROR_NODE_JOIN_TYPE - def test_add_dict_to_tree_by_name_sep_tree(self): self.root.sep = "\\" root = add_dict_to_tree_by_name(self.root, self.name_dict) @@ -522,11 +496,11 @@ def test_add_dict_to_tree_by_name_sep_tree(self): def test_add_dict_to_tree_by_name_duplicate_name(self): hh = Node("h", age=6) hh.parent = self.root - root = add_dict_to_tree_by_name(self.root, self.name_dict) + add_dict_to_tree_by_name(self.root, self.name_dict) assert ( - len(list(find_names(root, "h"))) == 2 + len(list(find_names(self.root, "h"))) == 2 ), "There is less node 'h' than expected" - for _node in list(find_names(root, "h")): + for _node in list(find_names(self.root, "h")): assert _node.get_attr("age") == 6 def test_add_dict_to_tree_by_name_node_type(self): @@ -543,7 +517,7 @@ def test_add_dict_to_tree_by_name_node_type(self): f.parent = c g.parent = e h.parent = e - root = add_dict_to_tree_by_name(root, self.name_dict) + add_dict_to_tree_by_name(root, self.name_dict) assert isinstance(root, NodeA), Constants.ERROR_CUSTOM_TYPE.format(type="NodeA") assert all( isinstance(node, NodeA) for node in root.children @@ -571,7 +545,7 @@ def test_add_dict_to_tree_by_name_custom_node_type(self): "g": {"custom_field": 10, "custom_field_str": "g"}, "h": {"custom_field": 6, "custom_field_str": "h"}, } - root = add_dict_to_tree_by_name(root, name_dict) + add_dict_to_tree_by_name(root, name_dict) assert isinstance(root, CustomNode), Constants.ERROR_CUSTOM_TYPE.format( type="CustomNode" ) @@ -588,10 +562,10 @@ def test_add_dict_to_tree_by_name_inconsistent_attributes(self): "b": {}, "c": {"age": 60}, } - root = add_dict_to_tree_by_name(self.root, name_dict) - expected_root_str = "a [age=90.0]\n" "├── b\n" "└── c [age=60.0]\n" + add_dict_to_tree_by_name(self.root, name_dict) + 
expected_root_str = "a [age=90]\n" "├── b [age=1]\n" "└── c [age=60]\n" assert_print_statement( - print_tree, expected_root_str, root, all_attrs=True, max_depth=2 + print_tree, expected_root_str, self.root, all_attrs=True, max_depth=2 ) @@ -669,6 +643,75 @@ def test_add_dataframe_to_tree_by_path_empty_col_error(self): add_dataframe_to_tree_by_path(self.root, data) assert str(exc_info.value) == Constants.ERROR_NODE_DATAFRAME_EMPTY_COL + def test_add_dataframe_to_tree_by_path_attribute_cols_error(self): + attribute_cols = ["age2"] + with pytest.raises(KeyError): + add_dataframe_to_tree_by_path( + self.root, self.data, attribute_cols=attribute_cols + ) + + def test_add_dataframe_to_tree_by_path_ignore_name_col(self): + data = pd.DataFrame( + [ + ["a", 90, "a1"], + ["a/b", 65, "b1"], + ["a/c", 60, "c1"], + ["a/b/d", 40, "d1"], + ["a/b/e", 35, "e1"], + ["a/c/f", 38, "f1"], + ["a/b/e/g", 10, "g1"], + ["a/b/e/h", 6, "h1"], + ], + columns=["PATH", "age", "name"], + ) + add_dataframe_to_tree_by_path(self.root, data) + assert_tree_structure_basenode_root(self.root) + assert_tree_structure_basenode_root_attr(self.root) + assert_tree_structure_node_root(self.root) + + def test_add_dataframe_to_tree_by_path_ignore_non_attribute_cols(self): + data = pd.DataFrame( + [ + ["a", 90, "a1"], + ["a/b", 65, "b1"], + ["a/c", 60, "c1"], + ["a/b/d", 40, "d1"], + ["a/b/e", 35, "e1"], + ["a/c/f", 38, "f1"], + ["a/b/e/g", 10, "g1"], + ["a/b/e/h", 6, "h1"], + ], + columns=["PATH", "age", "name2"], + ) + add_dataframe_to_tree_by_path( + self.root, data, path_col="PATH", attribute_cols=["age"] + ) + assert not self.root.get_attr("name2") + assert_tree_structure_basenode_root(self.root) + assert_tree_structure_basenode_root_attr(self.root) + assert_tree_structure_node_root(self.root) + + def test_add_dataframe_to_tree_by_path_root_node_empty_attribute(self): + data = pd.DataFrame( + [ + ["a", None], + ["a/b", 65], + ["a/c", 60], + ["a/b/d", 40], + ["a/b/e", 35], + ["a/c/f", 38], + ["a/b/e/g", 
10], + ["a/b/e/h", 6], + ], + columns=["PATH", "age"], + ) + add_dataframe_to_tree_by_path(self.root, data) + assert self.root.get_attr("age") == 1 + self.root.set_attrs({"age": 90}) + assert_tree_structure_basenode_root(self.root) + assert_tree_structure_basenode_root_attr(self.root) + assert_tree_structure_node_root(self.root) + def test_add_dataframe_to_tree_by_path_no_attribute(self): data = pd.DataFrame( [ @@ -958,29 +1001,29 @@ def tearDown(self): self.data = None def test_add_dataframe_to_tree_by_name(self): - root = add_dataframe_to_tree_by_name(self.root, self.data) - assert_tree_structure_basenode_root(root) - assert_tree_structure_basenode_root_attr(root) - assert_tree_structure_node_root(root) + add_dataframe_to_tree_by_name(self.root, self.data) + assert_tree_structure_basenode_root(self.root) + assert_tree_structure_basenode_root_attr(self.root) + assert_tree_structure_node_root(self.root) def test_add_dataframe_to_tree_by_name_col_name(self): - root = add_dataframe_to_tree_by_name( + add_dataframe_to_tree_by_name( self.root, self.data, name_col="NAME", attribute_cols=["age"] ) - assert_tree_structure_basenode_root(root) - assert_tree_structure_basenode_root_attr(root) - assert_tree_structure_node_root(root) + assert_tree_structure_basenode_root(self.root) + assert_tree_structure_basenode_root_attr(self.root) + assert_tree_structure_node_root(self.root) def test_add_dataframe_to_tree_by_name_col_name_reverse(self): - root = add_dataframe_to_tree_by_name( + add_dataframe_to_tree_by_name( self.root, self.data[["age", "NAME"]], name_col="NAME", attribute_cols=["age"], ) - assert_tree_structure_basenode_root(root) - assert_tree_structure_basenode_root_attr(root) - assert_tree_structure_node_root(root) + assert_tree_structure_basenode_root(self.root) + assert_tree_structure_basenode_root_attr(self.root) + assert_tree_structure_node_root(self.root) def test_add_dataframe_to_tree_by_name_empty_error(self): with pytest.raises(ValueError) as exc_info: @@ 
-999,18 +1042,58 @@ def test_add_dataframe_to_tree_by_name_empty_col_error(self): add_dataframe_to_tree_by_name(self.root, data) assert str(exc_info.value) == Constants.ERROR_NODE_DATAFRAME_EMPTY_COL - def test_add_dataframe_to_tree_by_name_inner_join_tree(self): - dummy = Node("dummy") - dummy.parent = self.b - root = add_dataframe_to_tree_by_name(self.root, self.data, join_type="inner") - assert_tree_structure_basenode_root(root) - assert_tree_structure_basenode_root_attr(root) - assert_tree_structure_node_root(root) + def test_add_dataframe_to_tree_by_name_attribute_cols_error(self): + attribute_cols = ["age2"] + with pytest.raises(KeyError): + add_dataframe_to_tree_by_name( + self.root, self.data, attribute_cols=attribute_cols + ) - def test_add_dataframe_to_tree_by_name_inner_join_data(self): + def test_add_dataframe_to_tree_by_name_ignore_name_col(self): data = pd.DataFrame( [ - ["a", 90], + ["a", 90, "a1"], + ["b", 65, "b1"], + ["c", 60, "c1"], + ["d", 40, "d1"], + ["e", 35, "e1"], + ["f", 38, "f1"], + ["g", 10, "g1"], + ["h", 6, "h1"], + ], + columns=["name2", "age", "name"], + ) + add_dataframe_to_tree_by_name(self.root, data, name_col="name2") + assert_tree_structure_basenode_root(self.root) + assert_tree_structure_basenode_root_attr(self.root) + assert_tree_structure_node_root(self.root) + + def test_add_dataframe_to_tree_by_name_ignore_non_attribute_cols(self): + data = pd.DataFrame( + [ + ["a", 90, "a1"], + ["b", 65, "b1"], + ["c", 60, "c1"], + ["d", 40, "d1"], + ["e", 35, "e1"], + ["f", 38, "f1"], + ["g", 10, "g1"], + ["h", 6, "h1"], + ], + columns=["NAME", "age", "name2"], + ) + add_dataframe_to_tree_by_name( + self.root, data, name_col="NAME", attribute_cols=["age"] + ) + assert not self.root.get_attr("name2") + assert_tree_structure_basenode_root(self.root) + assert_tree_structure_basenode_root_attr(self.root) + assert_tree_structure_node_root(self.root) + + def test_add_dataframe_to_tree_by_name_root_node_empty_attribute(self): + data = 
pd.DataFrame( + [ + ["a", None], ["b", 65], ["c", 60], ["d", 40], @@ -1018,25 +1101,15 @@ def test_add_dataframe_to_tree_by_name_inner_join_data(self): ["f", 38], ["g", 10], ["h", 6], - ["dummy", 100], ], columns=["NAME", "age"], ) - root = add_dataframe_to_tree_by_name(self.root, data, join_type="inner") - assert_tree_structure_basenode_root(root) - assert_tree_structure_basenode_root_attr(root) - assert_tree_structure_node_root(root) - - def test_add_dataframe_to_tree_by_name_left_join(self): - root = add_dataframe_to_tree_by_name(self.root, self.data, join_type="left") - assert_tree_structure_basenode_root(root) - assert_tree_structure_basenode_root_attr(root) - assert_tree_structure_node_root(root) - - def test_add_dataframe_to_tree_by_name_invalid_join_error(self): - with pytest.raises(ValueError) as exc_info: - add_dataframe_to_tree_by_name(self.root, self.data, join_type="something") - assert str(exc_info.value) == Constants.ERROR_NODE_JOIN_TYPE + add_dataframe_to_tree_by_name(self.root, data) + assert self.root.get_attr("age") == 1 + self.root.set_attrs({"age": 90}) + assert_tree_structure_basenode_root(self.root) + assert_tree_structure_basenode_root_attr(self.root) + assert_tree_structure_node_root(self.root) def test_add_dataframe_to_tree_by_name_sep_tree(self): self.root.sep = "\\" @@ -1087,7 +1160,7 @@ def test_add_dataframe_to_tree_by_name_node_type(self): f.parent = c g.parent = e h.parent = e - root = add_dataframe_to_tree_by_name(root, self.data) + add_dataframe_to_tree_by_name(root, self.data) assert isinstance(root, NodeA), Constants.ERROR_CUSTOM_TYPE.format(type="NodeA") assert all( isinstance(node, NodeA) for node in root.children @@ -1118,7 +1191,7 @@ def test_add_dataframe_to_tree_by_name_custom_node_type(self): ], columns=["NAME", "custom_field", "custom_field_str"], ) - root = add_dataframe_to_tree_by_name(root, data) + add_dataframe_to_tree_by_name(root, data) assert isinstance(root, CustomNode), Constants.ERROR_CUSTOM_TYPE.format( 
type="CustomNode" ) @@ -1129,6 +1202,21 @@ def test_add_dataframe_to_tree_by_name_custom_node_type(self): assert_tree_structure_customnode_root_attr(root) assert_tree_structure_node_root(root) + def test_add_dataframe_to_tree_by_name_inconsistent_attributes(self): + data = pd.DataFrame( + [ + ["a", 90], + ["b", None], + ["c", 60], + ], + columns=["NAME", "age"], + ) + add_dataframe_to_tree_by_name(self.root, data) + expected_root_str = "a [age=90.0]\n" "├── b [age=1]\n" "└── c [age=60.0]\n" + assert_print_statement( + print_tree, expected_root_str, self.root, all_attrs=True, max_depth=2 + ) + class TestStrToTree(unittest.TestCase): def setUp(self): @@ -1703,7 +1791,7 @@ def test_dict_to_tree_inconsistent_attributes(): "a/c": {"age": 60}, } root = dict_to_tree(path_dict) - expected_root_str = "a [age=90.0]\n" "├── b\n" "└── c [age=60.0]\n" + expected_root_str = "a [age=90]\n" "├── b\n" "└── c [age=60]\n" assert_print_statement(print_tree, expected_root_str, root, all_attrs=True) @@ -2001,6 +2089,74 @@ def test_dataframe_to_tree_empty_col_error(): dataframe_to_tree(path_data) assert str(exc_info.value) == Constants.ERROR_NODE_DATAFRAME_EMPTY_COL + def test_dataframe_to_tree_attribute_cols_error(self): + attribute_cols = ["age2"] + with pytest.raises(KeyError): + dataframe_to_tree(self.path_data, attribute_cols=attribute_cols) + + @staticmethod + def test_dataframe_to_tree_ignore_name_col(): + path_data = pd.DataFrame( + [ + ["a", 90, "a1"], + ["a/b", 65, "b1"], + ["a/c", 60, "c1"], + ["a/b/d", 40, "d1"], + ["a/b/e", 35, "e1"], + ["a/c/f", 38, "f1"], + ["a/b/e/g", 10, "g1"], + ["a/b/e/h", 6, "h1"], + ], + columns=["PATH", "age", "name"], + ) + root = dataframe_to_tree(path_data) + assert_tree_structure_basenode_root(root) + assert_tree_structure_basenode_root_attr(root) + assert_tree_structure_node_root(root) + + @staticmethod + def test_dataframe_to_tree_ignore_non_attribute_cols(): + path_data = pd.DataFrame( + [ + ["a", 90, "a1"], + ["a/b", 65, "b1"], + ["a/c", 60, 
"c1"], + ["a/b/d", 40, "d1"], + ["a/b/e", 35, "e1"], + ["a/c/f", 38, "f1"], + ["a/b/e/g", 10, "g1"], + ["a/b/e/h", 6, "h1"], + ], + columns=["PATH", "age", "name2"], + ) + root = dataframe_to_tree(path_data, path_col="PATH", attribute_cols=["age"]) + assert not root.get_attr("name2") + assert_tree_structure_basenode_root(root) + assert_tree_structure_basenode_root_attr(root) + assert_tree_structure_node_root(root) + + @staticmethod + def test_dataframe_to_tree_root_node_empty_attribute(): + path_data = pd.DataFrame( + [ + ["a", None], + ["a/b", 65], + ["a/c", 60], + ["a/b/d", 40], + ["a/b/e", 35], + ["a/c/f", 38], + ["a/b/e/g", 10], + ["a/b/e/h", 6], + ], + columns=["PATH", "age"], + ) + root = dataframe_to_tree(path_data) + assert not root.get_attr("age") + root.set_attrs({"age": 90}) + assert_tree_structure_basenode_root(root) + assert_tree_structure_basenode_root_attr(root) + assert_tree_structure_node_root(root) + @staticmethod def test_dataframe_to_tree_sep_leading(): path_data = pd.DataFrame( @@ -2271,18 +2427,95 @@ def test_dataframe_to_tree_by_relation_col_name_reverse(self): assert_tree_structure_basenode_root_attr(root) assert_tree_structure_node_root(root) - def test_dataframe_to_tree_by_relation_empty_row_error(self): + @staticmethod + def test_dataframe_to_tree_by_relation_empty_row_error(): relation_data = pd.DataFrame(columns=["child", "parent"]) with pytest.raises(ValueError) as exc_info: dataframe_to_tree_by_relation(relation_data) assert str(exc_info.value) == Constants.ERROR_NODE_DATAFRAME_EMPTY_ROW - def test_dataframe_to_tree_by_relation_empty_col_error(self): + @staticmethod + def test_dataframe_to_tree_by_relation_empty_col_error(): relation_data = pd.DataFrame() with pytest.raises(ValueError) as exc_info: dataframe_to_tree_by_relation(relation_data) assert str(exc_info.value) == Constants.ERROR_NODE_DATAFRAME_EMPTY_COL + def test_dataframe_to_tree_by_relation_attribute_cols_error(self): + attribute_cols = ["age2"] + with 
pytest.raises(KeyError): + dataframe_to_tree_by_relation( + self.relation_data, attribute_cols=attribute_cols + ) + + @staticmethod + def test_dataframe_to_tree_by_relation_ignore_name_col(): + relation_data = pd.DataFrame( + [ + ["a", None, 90, "a1"], + ["b", "a", 65, "b1"], + ["c", "a", 60, "c1"], + ["d", "b", 40, "d1"], + ["e", "b", 35, "e1"], + ["f", "c", 38, "f1"], + ["g", "e", 10, "g1"], + ["h", "e", 6, "h1"], + ], + columns=["child", "parent", "age", "name"], + ) + root = dataframe_to_tree_by_relation(relation_data) + assert_tree_structure_basenode_root(root) + assert_tree_structure_basenode_root_attr(root) + assert_tree_structure_node_root(root) + + @staticmethod + def test_dataframe_to_tree_by_relation_ignore_non_attribute_cols(): + relation_data = pd.DataFrame( + [ + ["a", None, 90, "a1"], + ["b", "a", 65, "b1"], + ["c", "a", 60, "c1"], + ["d", "b", 40, "d1"], + ["e", "b", 35, "e1"], + ["f", "c", 38, "f1"], + ["g", "e", 10, "g1"], + ["h", "e", 6, "h1"], + ], + columns=["child", "parent", "age", "name2"], + ) + root = dataframe_to_tree_by_relation( + relation_data, + child_col="child", + parent_col="parent", + attribute_cols=["age"], + ) + assert not root.get_attr("name2") + assert_tree_structure_basenode_root(root) + assert_tree_structure_basenode_root_attr(root) + assert_tree_structure_node_root(root) + + @staticmethod + def test_dataframe_to_tree_by_relation_root_node_empty_attribute(): + relation_data = pd.DataFrame( + [ + ["a", None, None], + ["b", "a", 65], + ["c", "a", 60], + ["d", "b", 40], + ["e", "b", 35], + ["f", "c", 38], + ["g", "e", 10], + ["h", "e", 6], + ], + columns=["child", "parent", "age"], + ) + root = dataframe_to_tree_by_relation(relation_data) + assert not root.get_attr("age") + root.set_attrs({"age": 90}) + assert_tree_structure_basenode_root(root) + assert_tree_structure_basenode_root_attr(root) + assert_tree_structure_node_root(root) + @staticmethod def test_dataframe_to_tree_by_relation_duplicate_leaf_node(): relation_data = 
pd.DataFrame( @@ -2292,10 +2525,10 @@ def test_dataframe_to_tree_by_relation_duplicate_leaf_node(): ["c", "a", 60], ["d", "b", 40], ["e", "b", 35], - ["h", "b", 1], - ["h", "c", 2], - ["g", "e", 10], # duplicate - ["h", "e", 1], # duplicate + ["h", "b", 1], # duplicate + ["h", "c", 2], # duplicate + ["g", "e", 10], + ["h", "e", 6], # duplicate ], columns=["child", "parent", "age"], ) @@ -2315,7 +2548,7 @@ def test_dataframe_to_tree_by_relation_duplicate_leaf_node(): @staticmethod def test_dataframe_to_tree_by_relation_duplicate_intermediate_node_error(): - data = pd.DataFrame( + relation_data = pd.DataFrame( [ ["a", None, 90], ["b", "a", 65], @@ -2330,28 +2563,28 @@ def test_dataframe_to_tree_by_relation_duplicate_intermediate_node_error(): columns=["child", "parent", "age"], ) with pytest.raises(ValueError) as exc_info: - dataframe_to_tree_by_relation(data) + dataframe_to_tree_by_relation(relation_data) assert str(exc_info.value).startswith( Constants.ERROR_NODE_DUPLICATED_INTERMEDIATE_NODE ) @staticmethod def test_dataframe_to_tree_by_relation_duplicate_intermediate_node(): - data = pd.DataFrame( + relation_data = pd.DataFrame( [ ["a", None, 90], ["b", "a", 65], ["c", "a", 60], ["d", "b", 40], ["e", "b", 35], - ["e", "c", 1], + ["e", "c", 1], # duplicate intermediate node ["f", "c", 38], ["g", "e", 10], ["h", "e", 6], ], columns=["child", "parent", "age"], ) - root = dataframe_to_tree_by_relation(data, allow_duplicates=True) + root = dataframe_to_tree_by_relation(relation_data, allow_duplicates=True) actual = len(list(root.descendants)) assert actual == 10, f"Expected tree to have 10 descendants, received {actual}" @@ -2391,45 +2624,63 @@ def test_dataframe_to_tree_by_relation_custom_node_type(self): assert_tree_structure_node_root(root) @staticmethod - def test_dataframe_to_tree_by_relation_different_root_error(): - data = pd.DataFrame( + def test_dataframe_to_tree_by_relation_multiple_root_parent_none_error(): + relation_data = pd.DataFrame( [ ["a", None, 90], 
["b", None, 65], ["c", "a", 60], - ["e", "b", 40], + ["d", "b", 40], + ["e", "b", 35], + ["f", "c", 38], + ["g", "e", 10], + ["h", "e", 6], + ], + columns=["child", "parent", "age"], + ) + with pytest.raises(ValueError) as exc_info: + dataframe_to_tree_by_relation(relation_data) + assert str( + exc_info.value + ) == Constants.ERROR_NODE_DATAFRAME_MULTIPLE_ROOT.format(root_nodes=["a", "b"]) + + @staticmethod + def test_dataframe_to_tree_by_relation_multiple_root_error(): + relation_data = pd.DataFrame( + [ + ["a", None, 90], + ["c", "a", 60], + ["d", "b", 40], ["e", "b", 35], - ["h", "b", 1], - ["h", "c", 2], + ["f", "c", 38], ["g", "e", 10], - ["h", "e", 1], + ["h", "e", 6], ], columns=["child", "parent", "age"], ) with pytest.raises(ValueError) as exc_info: - dataframe_to_tree_by_relation(data) + dataframe_to_tree_by_relation(relation_data) assert str( exc_info.value ) == Constants.ERROR_NODE_DATAFRAME_MULTIPLE_ROOT.format(root_nodes=["a", "b"]) @staticmethod def test_dataframe_to_tree_by_relation_no_root_error(): - data = pd.DataFrame( + relation_data = pd.DataFrame( [ ["a", "b", 90], ["b", "a", 65], ["c", "a", 60], ["d", "b", 40], ["e", "b", 35], - ["h", "b", 1], - ["h", "c", 2], + ["f", "c", 38], ["g", "e", 10], - ["h", "e", 1], + ["h", "e", 6], ], columns=["child", "parent", "age"], ) with pytest.raises(ValueError) as exc_info: - dataframe_to_tree_by_relation(data) + dataframe_to_tree_by_relation(relation_data) assert str( exc_info.value ) == Constants.ERROR_NODE_DATAFRAME_MULTIPLE_ROOT.format(root_nodes=[]) diff --git a/tests/tree/test_search.py b/tests/tree/test_search.py index 881e82bc..745d7154 100644 --- a/tests/tree/test_search.py +++ b/tests/tree/test_search.py @@ -603,12 +603,12 @@ def test_find_children(self): (), (), ] - for idx, input in enumerate(inputs): - actual = find_children(input, lambda node: node.age > 1) + for idx, input_ in enumerate(inputs): + actual = find_children(input_, lambda node: node.age > 1) expected = expected_ans[idx] assert ( 
actual == expected - ), f"Expected find_children to return {expected}, received {actual} for input {input}" + ), f"Expected find_children to return {expected}, received {actual} for input {input_}" def test_find_children_condition(self): inputs = [self.a, self.b, self.c, self.d, self.e, self.f, self.g, self.h] @@ -632,7 +632,7 @@ def test_find_children_condition(self): expected = expected_ans[idx] assert ( actual == expected - ), f"Expected find_children to return {expected}, received {actual} for input {input}" + ), f"Expected find_children to return {expected}, received {actual} for input {input_}" def test_find_children_max_count_error(self): with pytest.raises(SearchError) as exc_info: @@ -670,7 +670,7 @@ def test_find_child(self): expected = expected_ans[idx] assert ( actual == expected - ), f"Expected find_children to return {expected}, received {actual} for input {input}" + ), f"Expected find_children to return {expected}, received {actual} for input {input_}" def test_find_child_error(self): with pytest.raises(SearchError) as exc_info: