Merge pull request #220 from kayjan/optimization-timings

Optimization timings
kayjan · Apr 4, 2024 · e86c71d · e86c71d
2 parents 5da91ca + 992ea0e
commit e86c71d
Show file tree

Hide file tree

Showing 14 changed files with 810 additions and 384 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,35 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.17.0] - 2024-04-04
+### Added
+- Misc: Group tests for benchmark timings to compare the timings by multiplier more effectively.
+### Changed
+- Tree Constructor: `add_dict_to_tree_by_name` and `add_dataframe_to_tree_by_name` modify the tree in-place instead
+of returning a new tree, and do not accept `join_type` as an argument as pandas dataframe operations are phased out.
+If there are clashing attributes, only those that have values will be replaced.
+**This might not be backwards-compatible!**
+- Tree Constructor: `dataframe_to_tree` no longer relies on `add_dataframe_to_tree_by_path`, as doing so performed
+assertion checks twice. This leads to a 5% improvement in timings for a tree with 10000 nodes, averaged across 10 runs.
+- Misc: Abstract out assertion checks for empty dataframe and duplicate attribute.
+- Misc: Abstract out logic for checking null and filtering attributes.
+- Misc: Optimization in dictionary and dataframe operations.
+### Fixed
+- Tree Constructor: `dict_to_tree` no longer uses dataframe operations, leading to a 33% improvement in timings for
+a tree with 10000 nodes, averaged across 10 runs. The resulting data types of the node follow the dictionary exactly,
+compared to the previous dataframe operations that may change the dtypes for certain columns.
+**This might not be backwards-compatible!**
+- Tree Constructor: `dataframe_to_tree_by_relation` fix root node detection logic, ignore existing name column,
+ignore non-attribute columns, ignore null attribute columns.
+- Tree Constructor: `add_dataframe_to_tree_by_path` ignore existing name column, ignore non-attribute columns,
+ignore null attribute columns.
+- Tree Constructor: `add_dataframe_to_tree_by_name` ignore existing name column, ignore non-attribute columns,
+ignore null attribute columns.
+- Tree Constructor: `dataframe_to_tree` ignore existing name column, ignore non-attribute columns,
+ignore null attribute columns.
+- DAG Constructor: `dataframe_to_dag` ignore existing name column, ignore non-attribute columns,
+ignore null attribute columns.
+
 ## [0.16.4] - 2024-03-14
 ### Fixed
 - [#216] Tree Exporter: Fix nan checker when printing trees.
@@ -511,7 +540,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Utility Iterator: Tree traversal methods.
 - Workflow To Do App: Tree use case with to-do list implementation.
 
-[Unreleased]: https://github.com/kayjan/bigtree/compare/0.16.4...HEAD
+[Unreleased]: https://github.com/kayjan/bigtree/compare/0.17.0...HEAD
+[0.17.0]: https://github.com/kayjan/bigtree/compare/0.16.4...0.17.0
 [0.16.4]: https://github.com/kayjan/bigtree/compare/0.16.3...0.16.4
 [0.16.3]: https://github.com/kayjan/bigtree/compare/0.16.2...0.16.3
 [0.16.2]: https://github.com/kayjan/bigtree/compare/0.16.1...0.16.2

diff --git a/assets/docs/tree_construct.png b/assets/docs/tree_construct.png
diff --git a/bigtree/__init__.py b/bigtree/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.16.4"
+__version__ = "0.17.0"
 
 from bigtree.binarytree.construct import list_to_binarytree
 from bigtree.dag.construct import dataframe_to_dag, dict_to_dag, list_to_dag

diff --git a/bigtree/binarytree/construct.py b/bigtree/binarytree/construct.py
@@ -4,6 +4,8 @@
 
 __all__ = ["list_to_binarytree"]
 
+from bigtree.utils.assertions import assert_length_not_empty
+
 
 def list_to_binarytree(
     heapq_list: List[int], node_type: Type[BinaryNode] = BinaryNode
@@ -37,8 +39,7 @@ def list_to_binarytree(
     Returns:
         (BinaryNode)
     """
-    if not len(heapq_list):
-        raise ValueError("Input list does not contain any data, check `heapq_list`")
+    assert_length_not_empty(heapq_list, "Input list", "heapq_list")
 
     root_node = node_type(heapq_list[0])
     node_list = [root_node]

diff --git a/bigtree/dag/construct.py b/bigtree/dag/construct.py
@@ -3,6 +3,14 @@
 from typing import Any, Dict, List, Tuple, Type
 
 from bigtree.node.dagnode import DAGNode
+from bigtree.utils.assertions import (
+    assert_dataframe_no_duplicate_attribute,
+    assert_dataframe_not_empty,
+    assert_dictionary_not_empty,
+    assert_length_not_empty,
+    filter_attributes,
+    isnull,
+)
 from bigtree.utils.exceptions import optional_dependencies_pandas
 
 try:
@@ -35,15 +43,15 @@ def list_to_dag(
     Returns:
         (DAGNode)
     """
-    if not len(relations):
-        raise ValueError("Input list does not contain any data, check `relations`")
+    assert_length_not_empty(relations, "Input list", "relations")
 
     relation_data = pd.DataFrame(relations, columns=["parent", "child"])
     return dataframe_to_dag(
         relation_data, child_col="child", parent_col="parent", node_type=node_type
     )
 
 
+@optional_dependencies_pandas
 def dict_to_dag(
     relation_attrs: Dict[str, Any],
     parent_key: str = "parents",
@@ -75,8 +83,7 @@ def dict_to_dag(
     Returns:
         (DAGNode)
     """
-    if not len(relation_attrs):
-        raise ValueError("Dictionary does not contain any data, check `relation_attrs`")
+    assert_dictionary_not_empty(relation_attrs, "relation_attrs")
 
     # Convert dictionary to dataframe
     data = pd.DataFrame(relation_attrs).T.rename_axis("_tmp_child").reset_index()
@@ -110,6 +117,8 @@ def dataframe_to_dag(
     - If columns are not specified, `child_col` takes first column, `parent_col` takes second column, and all other
         columns are `attribute_cols`.
 
+    Only attributes in `attribute_cols` with non-null values will be added to the DAG.
+
     Examples:
         >>> import pandas as pd
         >>> from bigtree import dataframe_to_dag, dag_iterator
@@ -141,12 +150,7 @@ def dataframe_to_dag(
     Returns:
         (DAGNode)
     """
-    data = data.copy()
-
-    if not len(data.columns):
-        raise ValueError("Data does not contain any columns, check `data`")
-    if not len(data):
-        raise ValueError("Data does not contain any rows, check `data`")
+    assert_dataframe_not_empty(data)
 
     if not child_col:
         child_col = data.columns[0]
@@ -160,27 +164,12 @@ def dataframe_to_dag(
         attribute_cols = list(data.columns)
         attribute_cols.remove(child_col)
         attribute_cols.remove(parent_col)
-    elif any([col not in data.columns for col in attribute_cols]):
-        raise ValueError(
-            f"One or more attribute column(s) not in data, check `attribute_cols`: {attribute_cols}"
-        )
 
-    data_check = data.copy()[[child_col, parent_col] + attribute_cols].drop_duplicates(
-        subset=[child_col] + attribute_cols
-    )
-    _duplicate_check = (
-        data_check[child_col]
-        .value_counts()
-        .to_frame("counts")
-        .rename_axis(child_col)
-        .reset_index()
+    data = data[[child_col, parent_col] + attribute_cols].copy()
+
+    assert_dataframe_no_duplicate_attribute(
+        data, "child name", child_col, attribute_cols
     )
-    _duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1]
-    if len(_duplicate_check):
-        raise ValueError(
-            f"There exists duplicate child name with different attributes\n"
-            f"Check {_duplicate_check}"
-        )
     if sum(data[child_col].isnull()):
         raise ValueError(f"Child name cannot be empty, check column: {child_col}")
 
@@ -190,15 +179,14 @@ def dataframe_to_dag(
     for row in data.reset_index(drop=True).to_dict(orient="index").values():
         child_name = row[child_col]
         parent_name = row[parent_col]
-        node_attrs = row.copy()
-        del node_attrs[child_col]
-        del node_attrs[parent_col]
-        node_attrs = {k: v for k, v in node_attrs.items() if not pd.isnull(v)}
-        child_node = node_dict.get(child_name, node_type(child_name))
+        node_attrs = filter_attributes(
+            row, omit_keys=["name", child_col, parent_col], omit_null_values=True
+        )
+        child_node = node_dict.get(child_name, node_type(child_name, **node_attrs))
         child_node.set_attrs(node_attrs)
         node_dict[child_name] = child_node
 
-        if not pd.isnull(parent_name):
+        if not isnull(parent_name):
             parent_node = node_dict.get(parent_name, node_type(parent_name))
             node_dict[parent_name] = parent_node
             child_node.parents = [parent_node]