encode tree metadata and node data in separate dfs
YYYasin19 committed Mar 5, 2023
1 parent 22067af commit 7409a2b
Showing 3 changed files with 109 additions and 40 deletions.
37 changes: 19 additions & 18 deletions benchmark.py
@@ -5,6 +5,7 @@
from typing import Callable, List

import lightgbm as lgb
import tabulate
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

from examples.utils import generate_dataset
@@ -64,36 +65,36 @@ def benchmark_model(name, train_func, dump_func) -> dict:
naive_dump_time = benchmark(pickle.dumps, model)
naive_pickled = pickle.dumps(model)
naive_pickled_size = len(naive_pickled)
naive_load_time = benchmark(pickle.loads, naive_pickled)
# naive_load_time = benchmark(pickle.loads, naive_pickled)

our_dump_time = benchmark(dump_func, model, io.BytesIO())
our_pickled_buf = io.BytesIO()
dump_func(model, our_pickled_buf)
our_pickled = our_pickled_buf.getvalue()
our_pickled_size = len(our_pickled)
our_load_time = benchmark(pickle.loads, our_pickled)
# our_load_time = benchmark(pickle.loads, our_pickled)
return {
"name": name,
"baseline": {
"size": naive_pickled_size,
"dump_time": naive_dump_time,
"load_time": naive_load_time,
# "load_time": naive_load_time,
},
"ours": {
"size": our_pickled_size,
"dump_time": our_dump_time,
"load_time": our_load_time,
# "load_time": our_load_time,
},
"change": {
"size": naive_pickled_size / our_pickled_size,
"dump_time": our_dump_time / naive_dump_time,
"load_time": our_load_time / naive_load_time,
# "load_time": our_load_time / naive_load_time,
},
}


def format_size(n_bytes: int) -> str:
MiB = 1024**2
MiB = 1024 ** 2
return f"{n_bytes / MiB:.1f} MiB"


Expand All @@ -111,9 +112,9 @@ def format_benchmarks_results_table(benchmark_results: List[dict]) -> str:
|--|--:|--:|--:|
"""

def format_row(results):
def format_row(results, min_width=10):
def format_cell(base, ours, change):
return f"{base} / {ours} / {change}"
return f"{base:<5} / {ours} / {change}"

column_data = [
results["name"],
Expand All @@ -127,11 +128,11 @@ def format_cell(base, ours, change):
format_time(results["ours"]["dump_time"]),
format_change(results["change"]["dump_time"]),
),
format_cell(
format_time(results["baseline"]["load_time"]),
format_time(results["ours"]["load_time"]),
format_change(results["change"]["load_time"]),
),
# format_cell(
# format_time(results["baseline"]["load_time"]),
# format_time(results["ours"]["load_time"]),
# format_change(results["change"]["load_time"]),
# ),
]
return " | ".join(column_data)

Expand All @@ -142,12 +143,12 @@ def format_cell(base, ours, change):

if __name__ == "__main__":
models_to_benchmark = [
("`RandomForestRegressor`", train_model_sklearn, dump_sklearn),
("`GradientBoostingRegressor`", train_gb_sklearn, dump_sklearn),
("`LGBMRegressor gbdt`", train_gbdt_lgbm, dump_lgbm),
("`LGBMRegressor gbdt large`", train_gbdt_large_lgbm, dump_lgbm),
# ("`RandomForestRegressor`", train_model_sklearn, dump_sklearn),
# ("`GradientBoostingRegressor`", train_gb_sklearn, dump_sklearn),
# ("`LGBMRegressor gbdt`", train_gbdt_lgbm, dump_lgbm),
# ("`LGBMRegressor gbdt large`", train_gbdt_large_lgbm, dump_lgbm),
("`LGBMRegressor rf`", train_rf_lgbm, dump_lgbm),
]
benchmark_results = [benchmark_model(*args) for args in models_to_benchmark]
print("Base results / Our results / Change")
print(format_benchmarks_results_table(benchmark_results))
print(format_benchmarks_results_table(benchmark_results))
1 change: 1 addition & 0 deletions model_parser.py
@@ -65,6 +65,7 @@ def get_type(s: str):
else:
return str


def pyarrow_table_to_bytes(table: pa.Table) -> bytes:
stream = pa.BufferOutputStream()
writer = pa.RecordBatchStreamWriter(stream, table.schema)
111 changes: 89 additions & 22 deletions slim_trees/lgbm_booster.py
@@ -4,7 +4,9 @@
import re
import sys
from typing import Any, BinaryIO, List, Tuple

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np

from slim_trees.compression_utils import (
@@ -58,7 +60,26 @@ def _decompress_booster_state(compressed_state: dict):
return state


def _compress_booster_handle(model_string: str) -> Tuple[str, List[dict], str]:
def pyarrow_table_to_bytes(table: pa.Table) -> bytes:
# TODO: move to utils
stream = pa.BufferOutputStream()
writer = pa.RecordBatchStreamWriter(stream, table.schema)
writer.write_table(table)
writer.close()
return stream.getvalue().to_pybytes()


def bytes_to_pandas_df(bytes_: bytes) -> pd.DataFrame:
"""
Given a bytes object, create a pandas DataFrame.
"""
stream = pa.BufferReader(bytes_)
reader = pa.RecordBatchStreamReader(stream)
table = reader.read_all()
return table.to_pandas()
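
The two helpers above are inverses, so the quickest sanity check is a round trip through bytes. The toy frame below is purely illustrative and not part of this commit:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"tree_idx": [0, 1], "num_leaves": [31, 15]})
restored = bytes_to_pandas_df(pyarrow_table_to_bytes(pa.Table.from_pandas(df)))
assert restored.equals(df)  # the Arrow IPC stream round trip preserves the frame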


def _compress_booster_handle(model_string: str) -> Tuple[str, bytes, bytes, str]:
if not model_string.startswith("tree\nversion=v3"):
raise ValueError("Only v3 is supported for the booster string format.")
FRONT_STRING_REGEX = r"(?:\w+(?:=.*)?\n)*\n(?=Tree)"
@@ -82,6 +103,7 @@ def _extract_feature(feature_line):
raise ValueError("Could not find back string.")
back_str = back_str_match.group()
tree_matches = re.findall(TREE_GROUP_REGEX, model_string)
nodes: List[dict] = []
trees: List[dict] = []
for i, tree_match in enumerate(tree_matches):
tree_name, features_list = tree_match
@@ -106,30 +128,61 @@ def parse(str_list, dtype):
assert len(feats_map["is_linear"]) == 1
assert len(feats_map["shrinkage"]) == 1

trees.append(
{
"num_leaves": int(feats_map["num_leaves"][0]),
"num_cat": int(feats_map["num_cat"][0]),
"split_feature": parse(feats_map["split_feature"], split_feature_dtype),
"threshold": compress_half_int_float_array(
parse(feats_map["threshold"], threshold_dtype)
),
"decision_type": parse(feats_map["decision_type"], decision_type_dtype),
"left_child": parse(feats_map["left_child"], left_child_dtype),
"right_child": parse(feats_map["right_child"], right_child_dtype),
"leaf_value": parse(feats_map["leaf_value"], leaf_value_dtype),
"is_linear": int(feats_map["is_linear"][0]),
"shrinkage": float(feats_map["shrinkage"][0]),
}
)
return front_str, trees, back_str
"""
Strategy: we keep two data structures, one at tree level and one at node level.
Here this looks like merely splitting the features into two dicts,
but the node-level dict can be "exploded" into one row per node later, while the tree-level dict holds the per-tree metadata.
"""

tree = {
"tree_idx": tree_idx,
"num_leaves": int(feats_map["num_leaves"][0]),
"num_cat": int(feats_map["num_cat"][0]),
"last_leaf_value": parse(feats_map["leaf_value"], leaf_value_dtype)[-1],
"is_linear": int(feats_map["is_linear"][0]),
"is_shrinkage": float(feats_map["shrinkage"][0]),
}
trees.append(tree)

node = {
"tree_idx": tree_idx, # TODO: this is new, have to recover this as well
"node_idx": list(range(int(feats_map["num_leaves"][0]) - 1)),
# all of these attributes have length num_leaves - 1
"num_leaves": int(feats_map["num_leaves"][0]),
"num_cat": int(feats_map["num_cat"][0]),
"split_feature": parse(feats_map["split_feature"], split_feature_dtype),
# "threshold": compress_half_int_float_array(
# parse(feats_map["threshold"], threshold_dtype)
# ),
"threshold": parse(feats_map["threshold"], threshold_dtype),
"decision_type": parse(feats_map["decision_type"], decision_type_dtype),
"left_child": parse(feats_map["left_child"], left_child_dtype),
"right_child": parse(feats_map["right_child"], right_child_dtype),
"leaf_value": parse(feats_map["leaf_value"], leaf_value_dtype)[:-1], # don't get the last value
}
nodes.append(node)

trees_df = pd.DataFrame(trees)
# serialize trees_df to bytes (an Arrow IPC stream, see pyarrow_table_to_bytes above)
trees_df_bytes = pyarrow_table_to_bytes(pa.Table.from_pandas(trees_df))

# explode nodes_df so that each node gets its own row (one column per feature)
nodes_df = pd.DataFrame(nodes)
nodes_df = nodes_df.explode(
["node_idx", "split_feature", "threshold", "decision_type", "left_child", "right_child", "leaf_value"]
)
nodes_df_bytes = pyarrow_table_to_bytes(pa.Table.from_pandas(nodes_df))

return front_str, trees_df_bytes, nodes_df_bytes, back_str
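
To make the tree-level/node-level split concrete, here is a small illustration of what the explode call does to a single node dict. The toy values are made up and multi-column explode needs pandas >= 1.3; this is not part of the diff:

import pandas as pd

node = {
    "tree_idx": 0,                 # tree-level key kept on every node row
    "node_idx": [0, 1],            # per-node attributes, length num_leaves - 1
    "split_feature": [5, 2],
    "threshold": [0.5, 1.7],
    "left_child": [1, -1],
    "right_child": [-2, -3],
}
nodes_df = pd.DataFrame([node]).explode(
    ["node_idx", "split_feature", "threshold", "left_child", "right_child"]
)
# nodes_df now has two rows, one per internal node, each still tagged with tree_idx=0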

def _decompress_booster_handle(compressed_state: Tuple[str, List[dict], str]) -> str:
front_str, trees, back_str = compressed_state

def _decompress_booster_handle(compressed_state: Tuple[str, bytes, bytes, str]) -> str:
front_str, trees_df_bytes, nodes_df_bytes, back_str = compressed_state
assert type(front_str) == str
assert type(trees) == list
# assert type(trees) == list
assert type(back_str) == str
trees_df = bytes_to_pandas_df(trees_df_bytes)
nodes_df = bytes_to_pandas_df(nodes_df_bytes)

handle = front_str

@@ -175,3 +228,17 @@ def _decompress_booster_handle(compressed_state: Tuple[str, List[dict], str]) ->
handle += tree_str
handle += back_str
return handle


def compress_handle_parquet(trees: List[dict]) -> bytes:
"""
Take the list of dictionaries (tree) and create a pyarrow Table.
"""

# step 1: turn features into pyarrow arrays
# loop over all tree dicts in trees and create one dict per node with all features
for tree in trees:
pass
# step 2: create pyarrow table
# step 3: write table to parquet
# step 4: return bytes
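
A minimal sketch of what those four steps could look like, assuming pyarrow >= 7 for Table.from_pylist and that every dict in trees maps feature names to scalars or equal-length lists. The helper name is hypothetical and this is not the committed implementation:

from typing import List

import pyarrow as pa
import pyarrow.parquet as pq

def compress_handle_parquet_sketch(trees: List[dict]) -> bytes:
    table = pa.Table.from_pylist(trees)     # steps 1-2: one row per tree
    stream = pa.BufferOutputStream()
    pq.write_table(table, stream)           # step 3: serialize the table as Parquet
    return stream.getvalue().to_pybytes()   # step 4: hand back the raw bytes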
