Skip to content

Commit

Permalink
Merge pull request #220 from kayjan/optimization-timings
Browse files Browse the repository at this point in the history
Optimization timings
  • Loading branch information
kayjan authored Apr 4, 2024
2 parents 5da91ca + 992ea0e commit e86c71d
Show file tree
Hide file tree
Showing 14 changed files with 810 additions and 384 deletions.
32 changes: 31 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,35 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.17.0] - 2024-04-04
### Added
- Misc: Group tests for benchmark timings to compare the timings by multiplier more effectively.
### Changed
- Tree Constructor: `add_dict_to_tree_by_name` and `add_dataframe_to_tree_by_name` modify the tree in-place instead
of returning a new tree, and no longer accept `join_type` as an argument as the pandas dataframe operation is phased out.
If there are clashing attributes, only those that have values will be replaced.
**This might not be backwards-compatible!**
- Tree Constructor: `dataframe_to_tree` no longer relies on `add_dataframe_to_tree_by_path`, as doing so performed
assertion checks twice. This leads to a 5% improvement in timings for a tree with 10000 nodes, averaged across 10 runs.
- Misc: Abstract out assertion checks for empty dataframe and duplicate attribute.
- Misc: Abstract out logic for checking null and filtering attributes.
- Misc: Optimization in dictionary and dataframe operations.
### Fixed
- Tree Constructor: `dict_to_tree` no longer uses dataframe operations, leading to a 33% improvement in timings for
a tree with 10000 nodes, averaged across 10 runs. The resulting data types of node attributes follow the dictionary exactly,
unlike the previous dataframe operations, which may change the dtypes of certain columns.
**This might not be backwards-compatible!**
- Tree Constructor: `dataframe_to_tree_by_relation` fix root node detection logic, ignore existing name column,
ignore non-attribute columns, ignore null attribute columns.
- Tree Constructor: `add_dataframe_to_tree_by_path` ignore existing name column, ignore non-attribute columns,
ignore null attribute columns.
- Tree Constructor: `add_dataframe_to_tree_by_name` ignore existing name column, ignore non-attribute columns,
ignore null attribute columns.
- Tree Constructor: `dataframe_to_tree` ignore existing name column, ignore non-attribute columns,
ignore null attribute columns.
- DAG Constructor: `dataframe_to_dag` ignore existing name column, ignore non-attribute columns,
ignore null attribute columns.

## [0.16.4] - 2024-03-14
### Fixed
- [#216] Tree Exporter: Fix nan checker when printing trees.
Expand Down Expand Up @@ -511,7 +540,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Utility Iterator: Tree traversal methods.
- Workflow To Do App: Tree use case with to-do list implementation.

[Unreleased]: https://github.com/kayjan/bigtree/compare/0.16.4...HEAD
[Unreleased]: https://github.com/kayjan/bigtree/compare/0.17.0...HEAD
[0.17.0]: https://github.com/kayjan/bigtree/compare/0.16.4...0.17.0
[0.16.4]: https://github.com/kayjan/bigtree/compare/0.16.3...0.16.4
[0.16.3]: https://github.com/kayjan/bigtree/compare/0.16.2...0.16.3
[0.16.2]: https://github.com/kayjan/bigtree/compare/0.16.1...0.16.2
Expand Down
Binary file modified assets/docs/tree_construct.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion bigtree/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.16.4"
__version__ = "0.17.0"

from bigtree.binarytree.construct import list_to_binarytree
from bigtree.dag.construct import dataframe_to_dag, dict_to_dag, list_to_dag
Expand Down
5 changes: 3 additions & 2 deletions bigtree/binarytree/construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

__all__ = ["list_to_binarytree"]

from bigtree.utils.assertions import assert_length_not_empty


def list_to_binarytree(
heapq_list: List[int], node_type: Type[BinaryNode] = BinaryNode
Expand Down Expand Up @@ -37,8 +39,7 @@ def list_to_binarytree(
Returns:
(BinaryNode)
"""
if not len(heapq_list):
raise ValueError("Input list does not contain any data, check `heapq_list`")
assert_length_not_empty(heapq_list, "Input list", "heapq_list")

root_node = node_type(heapq_list[0])
node_list = [root_node]
Expand Down
58 changes: 23 additions & 35 deletions bigtree/dag/construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@
from typing import Any, Dict, List, Tuple, Type

from bigtree.node.dagnode import DAGNode
from bigtree.utils.assertions import (
assert_dataframe_no_duplicate_attribute,
assert_dataframe_not_empty,
assert_dictionary_not_empty,
assert_length_not_empty,
filter_attributes,
isnull,
)
from bigtree.utils.exceptions import optional_dependencies_pandas

try:
Expand Down Expand Up @@ -35,15 +43,15 @@ def list_to_dag(
Returns:
(DAGNode)
"""
if not len(relations):
raise ValueError("Input list does not contain any data, check `relations`")
assert_length_not_empty(relations, "Input list", "relations")

relation_data = pd.DataFrame(relations, columns=["parent", "child"])
return dataframe_to_dag(
relation_data, child_col="child", parent_col="parent", node_type=node_type
)


@optional_dependencies_pandas
def dict_to_dag(
relation_attrs: Dict[str, Any],
parent_key: str = "parents",
Expand Down Expand Up @@ -75,8 +83,7 @@ def dict_to_dag(
Returns:
(DAGNode)
"""
if not len(relation_attrs):
raise ValueError("Dictionary does not contain any data, check `relation_attrs`")
assert_dictionary_not_empty(relation_attrs, "relation_attrs")

# Convert dictionary to dataframe
data = pd.DataFrame(relation_attrs).T.rename_axis("_tmp_child").reset_index()
Expand Down Expand Up @@ -110,6 +117,8 @@ def dataframe_to_dag(
- If columns are not specified, `child_col` takes first column, `parent_col` takes second column, and all other
columns are `attribute_cols`.
Only attributes in `attribute_cols` with non-null values will be added to the DAG.
Examples:
>>> import pandas as pd
>>> from bigtree import dataframe_to_dag, dag_iterator
Expand Down Expand Up @@ -141,12 +150,7 @@ def dataframe_to_dag(
Returns:
(DAGNode)
"""
data = data.copy()

if not len(data.columns):
raise ValueError("Data does not contain any columns, check `data`")
if not len(data):
raise ValueError("Data does not contain any rows, check `data`")
assert_dataframe_not_empty(data)

if not child_col:
child_col = data.columns[0]
Expand All @@ -160,27 +164,12 @@ def dataframe_to_dag(
attribute_cols = list(data.columns)
attribute_cols.remove(child_col)
attribute_cols.remove(parent_col)
elif any([col not in data.columns for col in attribute_cols]):
raise ValueError(
f"One or more attribute column(s) not in data, check `attribute_cols`: {attribute_cols}"
)

data_check = data.copy()[[child_col, parent_col] + attribute_cols].drop_duplicates(
subset=[child_col] + attribute_cols
)
_duplicate_check = (
data_check[child_col]
.value_counts()
.to_frame("counts")
.rename_axis(child_col)
.reset_index()
data = data[[child_col, parent_col] + attribute_cols].copy()

assert_dataframe_no_duplicate_attribute(
data, "child name", child_col, attribute_cols
)
_duplicate_check = _duplicate_check[_duplicate_check["counts"] > 1]
if len(_duplicate_check):
raise ValueError(
f"There exists duplicate child name with different attributes\n"
f"Check {_duplicate_check}"
)
if sum(data[child_col].isnull()):
raise ValueError(f"Child name cannot be empty, check column: {child_col}")

Expand All @@ -190,15 +179,14 @@ def dataframe_to_dag(
for row in data.reset_index(drop=True).to_dict(orient="index").values():
child_name = row[child_col]
parent_name = row[parent_col]
node_attrs = row.copy()
del node_attrs[child_col]
del node_attrs[parent_col]
node_attrs = {k: v for k, v in node_attrs.items() if not pd.isnull(v)}
child_node = node_dict.get(child_name, node_type(child_name))
node_attrs = filter_attributes(
row, omit_keys=["name", child_col, parent_col], omit_null_values=True
)
child_node = node_dict.get(child_name, node_type(child_name, **node_attrs))
child_node.set_attrs(node_attrs)
node_dict[child_name] = child_node

if not pd.isnull(parent_name):
if not isnull(parent_name):
parent_node = node_dict.get(parent_name, node_type(parent_name))
node_dict[parent_name] = parent_node
child_node.parents = [parent_node]
Expand Down
Loading

0 comments on commit e86c71d

Please sign in to comment.