diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2ad63760..3c4a5359 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+### Changed
+- DAG Constructor: `list_to_dag` and `dict_to_dag` do not rely on `dataframe_to_dag` as pandas dataframe operation
+is phased out.
+### Fixed
+- DAG Constructor: Handle cases where reserved keywords are part of attributes upon creation and throw an error accordingly.
 
 ## [0.17.1] - 2024-04-23
 ### Fixed
diff --git a/bigtree/binarytree/construct.py b/bigtree/binarytree/construct.py
index 211d48ee..0ed98762 100644
--- a/bigtree/binarytree/construct.py
+++ b/bigtree/binarytree/construct.py
@@ -45,10 +45,7 @@ def list_to_binarytree(
     node_list = [root_node]
     for idx, num in enumerate(heapq_list):
         if idx:
-            if idx % 2:
-                parent_idx = int((idx - 1) / 2)
-            else:
-                parent_idx = int((idx - 2) / 2)
+            parent_idx = (idx - 1) // 2
             node = node_type(num, parent=node_list[parent_idx])  # type: ignore
             node_list.append(node)
     return root_node
diff --git a/bigtree/dag/construct.py b/bigtree/dag/construct.py
index 88c68b68..491f9e02 100644
--- a/bigtree/dag/construct.py
+++ b/bigtree/dag/construct.py
@@ -6,7 +6,7 @@
 from bigtree.utils.assertions import (
     assert_dataframe_no_duplicate_attribute,
     assert_dataframe_not_empty,
-    assert_dictionary_not_empty,
+    assert_key_not_in_dict_or_df,
     assert_length_not_empty,
     filter_attributes,
     isnull,
@@ -21,7 +21,6 @@
 __all__ = ["list_to_dag", "dict_to_dag", "dataframe_to_dag"]
 
 
-@optional_dependencies_pandas
 def list_to_dag(
     relations: List[Tuple[str, str]],
     node_type: Type[DAGNode] = DAGNode,
@@ -45,13 +44,26 @@ def list_to_dag(
     """
     assert_length_not_empty(relations, "Input list", "relations")
 
-    relation_data = pd.DataFrame(relations, columns=["parent", "child"])
-    return dataframe_to_dag(
-        relation_data, child_col="child", parent_col="parent", node_type=node_type
-    )
+    node_dict: Dict[str, DAGNode] = dict()
+    parent_node = DAGNode()
+
+    for parent_name, child_name in relations:
+        if parent_name not in node_dict:
+            parent_node = node_type(parent_name)
+            node_dict[parent_name] = parent_node
+        else:
+            parent_node = node_dict[parent_name]
+        if child_name not in node_dict:
+            child_node = node_type(child_name)
+            node_dict[child_name] = child_node
+        else:
+            child_node = node_dict[child_name]
+
+        child_node.parents = [parent_node]
+
+    return parent_node
 
 
-@optional_dependencies_pandas
 def dict_to_dag(
     relation_attrs: Dict[str, Any],
     parent_key: str = "parents",
@@ -83,22 +95,36 @@ def dict_to_dag(
     Returns:
         (DAGNode)
     """
-    assert_dictionary_not_empty(relation_attrs, "relation_attrs")
+    assert_length_not_empty(relation_attrs, "Dictionary", "relation_attrs")
+
+    node_dict: Dict[str, DAGNode] = dict()
+    parent_node: DAGNode | None = None
+
+    for child_name, node_attrs in relation_attrs.items():
+        node_attrs = node_attrs.copy()
+        parent_names: List[str] = node_attrs.pop(parent_key, [])
+        assert_key_not_in_dict_or_df(node_attrs, ["parent", "parents", "children"])
+
+        if child_name in node_dict:
+            child_node = node_dict[child_name]
+            child_node.set_attrs(node_attrs)
+        else:
+            child_node = node_type(child_name, **node_attrs)
+            node_dict[child_name] = child_node
+
+        for parent_name in parent_names:
+            # Construct a node only for unseen parents; `.get(k, node_type(k))` would build one every time
+            if parent_name not in node_dict:
+                node_dict[parent_name] = node_type(parent_name)
+            parent_node = node_dict[parent_name]
+            child_node.parents = [parent_node]
 
-    # Convert dictionary to dataframe
-    data = pd.DataFrame(relation_attrs).T.rename_axis("_tmp_child").reset_index()
-    if parent_key not in data:
+    if parent_node is None:
         raise ValueError(
             f"Parent key {parent_key} not in dictionary, check `relation_attrs` and `parent_key`"
         )
 
-    data = data.explode(parent_key)
-    return dataframe_to_dag(
-        data,
-        child_col="_tmp_child",
-        parent_col=parent_key,
-        node_type=node_type,
-    )
+ return parent_node @optional_dependencies_pandas @@ -164,6 +190,7 @@ def dataframe_to_dag( attribute_cols = list(data.columns) attribute_cols.remove(child_col) attribute_cols.remove(parent_col) + assert_key_not_in_dict_or_df(attribute_cols, ["parent", "parents", "children"]) data = data[[child_col, parent_col] + attribute_cols].copy() diff --git a/bigtree/dag/export.py b/bigtree/dag/export.py index c39978a0..1739eb60 100644 --- a/bigtree/dag/export.py +++ b/bigtree/dag/export.py @@ -3,6 +3,7 @@ from typing import Any, Dict, List, Tuple, TypeVar, Union from bigtree.node.dagnode import DAGNode +from bigtree.utils.assertions import assert_tree_type from bigtree.utils.exceptions import ( optional_dependencies_image, optional_dependencies_pandas, @@ -265,10 +266,7 @@ def dag_to_dot( dag = [dag] for _dag in dag: - if not isinstance(_dag, DAGNode): - raise TypeError( - "Tree should be of type `DAGNode`, or inherit from `DAGNode`" - ) + assert_tree_type(_dag, DAGNode, "DAGNode") _dag = _dag.copy() for parent_node, child_node in dag_iterator(_dag): diff --git a/bigtree/tree/construct.py b/bigtree/tree/construct.py index 3bb8f5f4..80ba92a3 100644 --- a/bigtree/tree/construct.py +++ b/bigtree/tree/construct.py @@ -10,7 +10,6 @@ assert_dataframe_no_duplicate_attribute, assert_dataframe_no_duplicate_children, assert_dataframe_not_empty, - assert_dictionary_not_empty, assert_length_not_empty, filter_attributes, isnull, @@ -185,7 +184,7 @@ def add_dict_to_tree_by_path( Returns: (Node) """ - assert_dictionary_not_empty(path_attrs, "path_attrs") + assert_length_not_empty(path_attrs, "Dictionary", "path_attrs") root_node = tree.root @@ -232,7 +231,7 @@ def add_dict_to_tree_by_name(tree: Node, name_attrs: Dict[str, Dict[str, Any]]) """ from bigtree.tree.search import findall - assert_dictionary_not_empty(name_attrs, "name_attrs") + assert_length_not_empty(name_attrs, "Dictionary", "name_attrs") attr_dict_names = set(name_attrs.keys()) @@ -642,7 +641,7 @@ def dict_to_tree( Returns: 
(Node) """ - assert_dictionary_not_empty(path_attrs, "path_attrs") + assert_length_not_empty(path_attrs, "Dictionary", "path_attrs") # Initial tree root_name = list(path_attrs.keys())[0].lstrip(sep).rstrip(sep).split(sep)[0] @@ -724,7 +723,7 @@ def nested_dict_to_tree( Returns: (Node) """ - assert_dictionary_not_empty(node_attrs, "node_attrs") + assert_length_not_empty(node_attrs, "Dictionary", "node_attrs") def _recursive_add_child( child_dict: Dict[str, Any], parent_node: Optional[Node] = None diff --git a/bigtree/tree/export.py b/bigtree/tree/export.py index c883bd4d..818d885c 100644 --- a/bigtree/tree/export.py +++ b/bigtree/tree/export.py @@ -8,6 +8,7 @@ assert_key_in_dict, assert_str_in_list, assert_style_in_dict, + assert_tree_type, isnull, ) from bigtree.utils.constants import ExportConstants, MermaidConstants, NewickCharacter @@ -1223,8 +1224,7 @@ def tree_to_dot( tree = [tree] for _tree in tree: - if not isinstance(_tree, Node): - raise TypeError("Tree should be of type `Node`, or inherit from `Node`") + assert_tree_type(_tree, Node, "Node") name_dict: Dict[str, List[str]] = collections.defaultdict(list) diff --git a/bigtree/tree/helper.py b/bigtree/tree/helper.py index afac7b68..86359d18 100644 --- a/bigtree/tree/helper.py +++ b/bigtree/tree/helper.py @@ -7,6 +7,7 @@ from bigtree.tree.construct import add_dict_to_tree_by_path, dataframe_to_tree from bigtree.tree.export import tree_to_dataframe from bigtree.tree.search import find_path +from bigtree.utils.assertions import assert_tree_type from bigtree.utils.exceptions import NotFoundError from bigtree.utils.iterators import levelordergroup_iter @@ -34,8 +35,7 @@ def clone_tree(tree: BaseNode, node_type: Type[BaseNodeT]) -> BaseNodeT: Returns: (BaseNode) """ - if not isinstance(tree, BaseNode): - raise TypeError("Tree should be of type `BaseNode`, or inherit from `BaseNode`") + assert_tree_type(tree, BaseNode, "BaseNode") # Start from root root_info = dict(tree.root.describe(exclude_prefix="_")) diff 
--git a/bigtree/utils/assertions.py b/bigtree/utils/assertions.py
index 8a430d45..76237bd2 100644
--- a/bigtree/utils/assertions.py
+++ b/bigtree/utils/assertions.py
@@ -1,12 +1,28 @@
 from __future__ import annotations
 
-import math
-from typing import Any, Dict, List, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Sized, Type, Union
 
-try:
+if TYPE_CHECKING:
     import pandas as pd
-except ImportError:  # pragma: no cover
-    pd = None
+
+    from bigtree.node.basenode import BaseNode
+    from bigtree.node.dagnode import DAGNode
+    from bigtree.node.node import Node
+
+
+__all__ = [
+    "assert_style_in_dict",
+    "assert_str_in_list",
+    "assert_key_not_in_dict_or_df",
+    "assert_key_in_dict",
+    "assert_length_not_empty",
+    "assert_dataframe_not_empty",
+    "assert_dataframe_no_duplicate_attribute",
+    "assert_dataframe_no_duplicate_children",
+    "assert_tree_type",
+    "isnull",
+    "filter_attributes",
+]
 
 
 def assert_style_in_dict(
@@ -43,6 +59,23 @@ def assert_str_in_list(
         )
 
 
+def assert_key_not_in_dict_or_df(
+    parameter_dict: Union[Dict[str, Any], List[str], pd.DataFrame],
+    not_accepted_parameters: List[str],
+) -> None:
+    """Raise ValueError if any dictionary key, list item, or dataframe column is a reserved keyword
+
+    Args:
+        parameter_dict (Dict[str, Any]/List[str]/pd.DataFrame): keys, names, or columns to check
+        not_accepted_parameters (List[str]): list of not accepted parameters
+    """
+    for parameter in parameter_dict:
+        if parameter in not_accepted_parameters:
+            raise ValueError(
+                f"Invalid input, check `{parameter}` is not a valid key as it is a reserved keyword"
+            )
+
+
 def assert_key_in_dict(
     parameter_name: str,
     parameter: Any,
@@ -61,13 +94,11 @@ def assert_key_in_dict(
         )
 
 
-def assert_length_not_empty(
-    data: Union[str, List[Any]], argument_name: str, argument: str
-) -> None:
-    """Raise ValueError if data (str, list, or iterable) does not have length
+def assert_length_not_empty(data: Sized, argument_name: str, argument: str) -> None:
+    """Raise ValueError if data does not have length
 
     Args:
-        data (str/List[Any]): data to check
+        data (Sized): data to check
         argument_name: argument name for data, for error message
         argument (str): argument for data, for error message
     """
@@ -77,17 +108,6 @@ def assert_length_not_empty(
         )
 
 
-def assert_dictionary_not_empty(data_dict: Dict[Any, Any], argument: str) -> None:
-    """Raise ValueError is dictionary is empty
-
-    Args:
-        data_dict (Dict[Any, Any]): dictionary to check
-        argument (str): argument for dictionary, for error message
-    """
-    if not len(data_dict):
-        raise ValueError(f"Dictionary does not contain any data, check `{argument}`")
-
-
 def assert_dataframe_not_empty(data: pd.DataFrame) -> None:
     """Raise ValueError is dataframe is empty
 
@@ -158,6 +178,24 @@ def assert_dataframe_no_duplicate_children(
         )
 
 
+def assert_tree_type(
+    tree: Union[BaseNode, Node, DAGNode],
+    tree_type: Union[Type[BaseNode], Type[Node], Type[DAGNode]],
+    tree_type_name: str,
+) -> None:
+    """Raise TypeError if tree is not of `tree_type`
+
+    Args:
+        tree (Union["BaseNode", "Node", "DAGNode"]): tree to check
+        tree_type: tree type to assert for
+        tree_type_name (str): tree type name, for error message
+    """
+    if not isinstance(tree, tree_type):
+        raise TypeError(
+            f"Tree should be of type `{tree_type_name}`, or inherit from `{tree_type_name}`"
+        )
+
+
 def isnull(value: Any) -> bool:
     """Check if value is null
 
@@ -167,6 +205,8 @@ def isnull(value: Any) -> bool:
     Returns:
         (bool)
     """
+    import math
+
     if not value or (isinstance(value, float) and math.isnan(value)):
         return True
     return False
diff --git a/mkdocs.yml b/mkdocs.yml
index 434f11c6..49f3fd9c 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -98,9 +98,9 @@ theme:
 plugins:
   - glightbox
   - search
-  - social:
-      cards_layout_options:
-        logo: docs/_static/favicon.svg
+#  - social:
+#      cards_layout_options:
+#        logo: docs/_static/favicon.svg
   - mkdocstrings:
       handlers:
         python:
diff --git a/tests/dag/test_construct.py b/tests/dag/test_construct.py
index 9abc852a..d73b3146 100644
--- a/tests/dag/test_construct.py
+++ b/tests/dag/test_construct.py
@@ -80,10 +80,48 @@ def
test_dict_to_dag_empty_error(self): parameter="relation_attrs" ) - def test_dict_to_dag_parent_key_error(self): + @staticmethod + def test_dict_to_dag_parent_key_error(): + relation_dict = { + "a": {"age": 90}, + "b": {"age": 65}, + "c": {"parent1": ["a", "b"], "age": 60}, + "d": {"parent1": ["a", "c"], "age": 40}, + "e": {"parent1": ["d"], "age": 35}, + "f": {"parent1": ["c", "d"], "age": 38}, + "g": {"parent1": ["c"], "age": 10}, + "h": {"parent1": ["g"], "age": 6}, + } + with pytest.raises(ValueError) as exc_info: + dict_to_dag(relation_dict) + assert str(exc_info.value) == Constants.ERROR_DAG_DICT_PARENT_KEY.format( + parent_key="parents" + ) + + def test_dict_to_dag_parent_key_reserved_keyword_parents_error(self): with pytest.raises(ValueError) as exc_info: dict_to_dag(self.relation_dict, parent_key="parent") - assert str(exc_info.value) == Constants.ERROR_DAG_DICT_PARENT_KEY + assert str(exc_info.value) == Constants.ERROR_DAG_DICT_INVALID_KEY.format( + parameter="parents" + ) + + @staticmethod + def test_dict_to_dag_parent_key_reserved_keyword_parent_error(): + relation_dict = { + "a": {"age": 90}, + "b": {"age": 65}, + "c": {"parent": ["a", "b"], "age": 60}, + "d": {"parent": ["a", "c"], "age": 40}, + "e": {"parent": ["d"], "age": 35}, + "f": {"parent": ["c", "d"], "age": 38}, + "g": {"parent": ["c"], "age": 10}, + "h": {"parent": ["g"], "age": 6}, + } + with pytest.raises(ValueError) as exc_info: + dict_to_dag(relation_dict) + assert str(exc_info.value) == Constants.ERROR_DAG_DICT_INVALID_KEY.format( + parameter="parent" + ) def test_dict_to_dag_node_type(self): dag = dict_to_dag(self.relation_dict, node_type=DAGNodeA) @@ -178,6 +216,54 @@ def test_dataframe_to_dag_parent_col_error(self): parent_col=parent_col ) + @staticmethod + def test_dataframe_to_dag_parent_col_reserved_keyword_parents_error(): + data = pd.DataFrame( + [ + ["h", "g", "a", 6], + ["g", "c", "a", 10], + ["f", "d", "a", 38], + ["f", "c", "a", 38], + ["e", "d", "a", 35], + ["d", "c", "a", 
40], + ["d", "a", "a", 40], + ["c", "b", "a", 60], + ["c", "a", "a", 60], + ["a", None, None, 90], + ["b", None, None, 65], + ], + columns=["child", "parent", "parents", "age"], + ) + with pytest.raises(ValueError) as exc_info: + dataframe_to_dag(data, parent_col="parent") + assert str(exc_info.value) == Constants.ERROR_DAG_DICT_INVALID_KEY.format( + parameter="parents" + ) + + @staticmethod + def test_dataframe_to_dag_parent_col_reserved_keyword_parent_error(): + data = pd.DataFrame( + [ + ["h", "g", "a", 6], + ["g", "c", "a", 10], + ["f", "d", "a", 38], + ["f", "c", "a", 38], + ["e", "d", "a", 35], + ["d", "c", "a", 40], + ["d", "a", "a", 40], + ["c", "b", "a", 60], + ["c", "a", "a", 60], + ["a", None, None, 90], + ["b", None, None, 65], + ], + columns=["child", "parent", "parents", "age"], + ) + with pytest.raises(ValueError) as exc_info: + dataframe_to_dag(data, parent_col="parents") + assert str(exc_info.value) == Constants.ERROR_DAG_DICT_INVALID_KEY.format( + parameter="parent" + ) + def test_dataframe_to_dag_attribute_cols(self): dag = dataframe_to_dag(self.data, attribute_cols=["age"]) assert_dag_structure_root(dag) diff --git a/tests/test_constants.py b/tests/test_constants.py index 0882da83..f66bd64d 100644 --- a/tests/test_constants.py +++ b/tests/test_constants.py @@ -20,9 +20,8 @@ class Constants: ERROR_CUSTOM_TYPE = "Node type is not `{type}`" # dag/construct - ERROR_DAG_DICT_PARENT_KEY = ( - "Parent key parent not in dictionary, check `relation_attrs` and `parent_key`" - ) + ERROR_DAG_DICT_INVALID_KEY = "Invalid input, check `{parameter}` is not a valid key as it is a reserved keyword" + ERROR_DAG_DICT_PARENT_KEY = "Parent key {parent_key} not in dictionary, check `relation_attrs` and `parent_key`" ERROR_DAG_DATAFRAME_EMPTY_CHILD = ( "Child name cannot be empty, check column: {child_col}" )