chdb-io · auxten · Aug 8, 2024 · Jul 24, 2024 · Jul 29, 2024 · Aug 8, 2024
diff --git a/README.md b/README.md
@@ -37,7 +37,7 @@
 </div>
 
 ## Get Started
-Get started with **chdb** using our [Installation and Usage Examples](https://doc.chdb.io)
+Get started with **chdb** using our [Installation and Usage Examples](https://clickhouse.com/docs/en/chdb)
 
 <br>
 
@@ -276,15 +276,22 @@ For more examples, see [examples](examples) and [tests](tests).
 
 ## Demos and Examples
 
-- [Project Documentation](https://doc.chdb.io) and [Usage Examples](https://chdb-io.github.io/#/install?id=installation-1)
+- [Project Documentation](https://clickhouse.com/docs/en/chdb) and [Usage Examples](https://clickhouse.com/docs/en/chdb/install/python)
 - [Colab Notebooks](https://colab.research.google.com/drive/1-zKB6oKfXeptggXi0kUX87iR8ZTSr4P3?usp=sharing) and other [Script Examples](examples)
 
 ## Benchmark
 
 - [ClickBench of embedded engines](https://benchmark.clickhouse.com/#eyJzeXN0ZW0iOnsiQXRoZW5hIChwYXJ0aXRpb25lZCkiOnRydWUsIkF0aGVuYSAoc2luZ2xlKSI6dHJ1ZSwiQXVyb3JhIGZvciBNeVNRTCI6dHJ1ZSwiQXVyb3JhIGZvciBQb3N0Z3JlU1FMIjp0cnVlLCJCeXRlSG91c2UiOnRydWUsImNoREIiOnRydWUsIkNpdHVzIjp0cnVlLCJjbGlja2hvdXNlLWxvY2FsIChwYXJ0aXRpb25lZCkiOnRydWUsImNsaWNraG91c2UtbG9jYWwgKHNpbmdsZSkiOnRydWUsIkNsaWNrSG91c2UiOnRydWUsIkNsaWNrSG91c2UgKHR1bmVkKSI6dHJ1ZSwiQ2xpY2tIb3VzZSAoenN0ZCkiOnRydWUsIkNsaWNrSG91c2UgQ2xvdWQiOnRydWUsIkNsaWNrSG91c2UgKHdlYikiOnRydWUsIkNyYXRlREIiOnRydWUsIkRhdGFiZW5kIjp0cnVlLCJEYXRhRnVzaW9uIChzaW5nbGUpIjp0cnVlLCJBcGFjaGUgRG9yaXMiOnRydWUsIkRydWlkIjp0cnVlLCJEdWNrREIgKFBhcnF1ZXQpIjp0cnVlLCJEdWNrREIiOnRydWUsIkVsYXN0aWNzZWFyY2giOnRydWUsIkVsYXN0aWNzZWFyY2ggKHR1bmVkKSI6ZmFsc2UsIkdyZWVucGx1bSI6dHJ1ZSwiSGVhdnlBSSI6dHJ1ZSwiSHlkcmEiOnRydWUsIkluZm9icmlnaHQiOnRydWUsIktpbmV0aWNhIjp0cnVlLCJNYXJpYURCIENvbHVtblN0b3JlIjp0cnVlLCJNYXJpYURCIjpmYWxzZSwiTW9uZXREQiI6dHJ1ZSwiTW9uZ29EQiI6dHJ1ZSwiTXlTUUwgKE15SVNBTSkiOnRydWUsIk15U1FMIjp0cnVlLCJQaW5vdCI6dHJ1ZSwiUG9zdGdyZVNRTCI6dHJ1ZSwiUG9zdGdyZVNRTCAodHVuZWQpIjpmYWxzZSwiUXVlc3REQiAocGFydGl0aW9uZWQpIjp0cnVlLCJRdWVzdERCIjp0cnVlLCJSZWRzaGlmdCI6dHJ1ZSwiU2VsZWN0REIiOnRydWUsIlNpbmdsZVN0b3JlIjp0cnVlLCJTbm93Zmxha2UiOnRydWUsIlNRTGl0ZSI6dHJ1ZSwiU3RhclJvY2tzIjp0cnVlLCJUaW1lc2NhbGVEQiAoY29tcHJlc3Npb24pIjp0cnVlLCJUaW1lc2NhbGVEQiI6dHJ1ZX0sInR5cGUiOnsic3RhdGVsZXNzIjpmYWxzZSwibWFuYWdlZCI6ZmFsc2UsIkphdmEiOmZhbHNlLCJjb2x1bW4tb3JpZW50ZWQiOmZhbHNlLCJDKysiOmZhbHNlLCJNeVNRTCBjb21wYXRpYmxlIjpmYWxzZSwicm93LW9yaWVudGVkIjpmYWxzZSwiQyI6ZmFsc2UsIlBvc3RncmVTUUwgY29tcGF0aWJsZSI6ZmFsc2UsIkNsaWNrSG91c2UgZGVyaXZhdGl2ZSI6ZmFsc2UsImVtYmVkZGVkIjp0cnVlLCJzZXJ2ZXJsZXNzIjpmYWxzZSwiUnVzdCI6ZmFsc2UsInNlYXJjaCI6ZmFsc2UsImRvY3VtZW50IjpmYWxzZSwidGltZS1zZXJpZXMiOmZhbHNlfSwibWFjaGluZSI6eyJzZXJ2ZXJsZXNzIjp0cnVlLCIxNmFjdSI6dHJ1ZSwiTCI6dHJ1ZSwiTSI6dHJ1ZSwiUyI6dHJ1ZSwiWFMiOnRydWUsImM2YS5tZXRhbCwgNTAwZ2IgZ3AyIjp0cnVlLCJjNmEuNHhsYXJnZSwgNTAwZ2IgZ3AyIjp0cnVlLCJjNS40eGxhcmdlLCA1MDBnYiBncDIiOnRydWUsIjE2IHRocm
VhZHMiOnRydWUsIjIwIHRocmVhZHMiOnRydWUsIjI0IHRocmVhZHMiOnRydWUsIjI4IHRocmVhZHMiOnRydWUsIjMwIHRocmVhZHMiOnRydWUsIjQ4IHRocmVhZHMiOnRydWUsIjYwIHRocmVhZHMiOnRydWUsIm01ZC4yNHhsYXJnZSI6dHJ1ZSwiYzVuLjR4bGFyZ2UsIDIwMGdiIGdwMiI6dHJ1ZSwiYzZhLjR4bGFyZ2UsIDE1MDBnYiBncDIiOnRydWUsImRjMi44eGxhcmdlIjp0cnVlLCJyYTMuMTZ4bGFyZ2UiOnRydWUsInJhMy40eGxhcmdlIjp0cnVlLCJyYTMueGxwbHVzIjp0cnVlLCJTMjQiOnRydWUsIlMyIjp0cnVlLCIyWEwiOnRydWUsIjNYTCI6dHJ1ZSwiNFhMIjp0cnVlLCJYTCI6dHJ1ZX0sImNsdXN0ZXJfc2l6ZSI6eyIxIjp0cnVlLCIyIjp0cnVlLCI0Ijp0cnVlLCI4Ijp0cnVlLCIxNiI6dHJ1ZSwiMzIiOnRydWUsIjY0Ijp0cnVlLCIxMjgiOnRydWUsInNlcnZlcmxlc3MiOnRydWUsInVuZGVmaW5lZCI6dHJ1ZX0sIm1ldHJpYyI6ImhvdCIsInF1ZXJpZXMiOlt0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlXX0=)
 
+- [chDB vs Pandas](https://colab.research.google.com/drive/1FogLujJ_-ds7RGurDrUnK-U0IW8a8Qd0)
+
+<div align="center">
+    <img src="https://github.com/chdb-io/chdb/raw/main/docs/_static/chdb-vs-pandas.jpg" alt="chDB vs Pandas" width="800">
+</div>
+
+
 ## Documentation
-- For chdb specific examples and documentation refer to [doc.chdb.io](https://doc.chdb.io)
+- For chDB-specific examples and documentation, refer to the [chDB docs](https://clickhouse.com/docs/en/chdb)
 - For SQL syntax, please refer to [ClickHouse SQL Reference](https://clickhouse.com/docs/en/sql-reference/syntax)
 
 

diff --git a/chdb/__init__.py b/chdb/__init__.py
@@ -18,7 +18,7 @@ class ChdbError(Exception):
 # UDF script path will be f"{g_udf_path}/{func_name}.py"
 g_udf_path = ""
 
-chdb_version = ("0", "6", "0")
+chdb_version = ('0', '6', '0')
 if sys.version_info[:2] >= (3, 7):
     # get the path of the current file
     current_path = os.path.dirname(os.path.abspath(__file__))
@@ -84,6 +84,8 @@ def query(sql, output_format="CSV", path="", udf_path=""):
 
 PyReader = _chdb.PyReader
 
+from . import dataframe, dbapi, session, udf, utils
+
 __all__ = [
     "PyReader",
     "ChdbError",
@@ -93,4 +95,9 @@ def query(sql, output_format="CSV", path="", udf_path=""):
     "engine_version",
     "to_df",
     "to_arrowTable",
+    "dataframe",
+    "dbapi",
+    "session",
+    "udf",
+    "utils",
 ]
diff --git a/chdb/utils/__init__.py b/chdb/utils/__init__.py
@@ -0,0 +1,3 @@
+from .types import convert_to_columnar, flatten_dict, infer_data_type, infer_data_types
+# Imported by name (not `*`) so the modules types.py imports do not leak into chdb.utils.
+__all__ = ["flatten_dict", "convert_to_columnar", "infer_data_type", "infer_data_types"]
diff --git a/chdb/utils/types.py b/chdb/utils/types.py
@@ -0,0 +1,236 @@
+from collections import defaultdict
+from typing import List, Dict, Any
+import json
+import decimal
+
+
+def convert_to_columnar(items: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
+    """
+    Convert a list of (possibly nested) dictionaries into columnar form.
+
+    Every item is first flattened with ``flatten_dict``; the result maps each
+    flattened key to a list holding one value per input item. Items that lack
+    a key contribute ``None`` at their position, so all columns have the same
+    length as ``items``. Columns appear in the order their keys are first
+    seen, which keeps the output deterministic across runs.
+
+    Parameters:
+    - items (List[Dict[str, Any]]): A list of dictionaries to convert.
+
+    Returns:
+    - Dict[str, List[Any]]: A dictionary with keys as column names and values as
+      lists of column values. An empty input yields an empty dictionary.
+
+    Example:
+    >>> items = [
+    ...     {"name": "Alice", "age": 30, "city": "New York"},
+    ...     {"name": "Bob", "age": 25},
+    ...     {"name": "Charlie", "city": "San Francisco"}
+    ... ]
+    >>> convert_to_columnar(items)
+    {
+        'name': ['Alice', 'Bob', 'Charlie'],
+        'age': [30, 25, None],
+        'city': ['New York', None, 'San Francisco']
+    }
+    """
+    if not items:
+        return {}
+
+    flattened_items = [flatten_dict(item) for item in items]
+
+    # A dict (unlike a set) remembers insertion order, so columns come out in
+    # first-seen order instead of varying with string hash randomization.
+    keys: Dict[str, None] = {}
+    for flattened_item in flattened_items:
+        keys.update(dict.fromkeys(flattened_item))
+
+    return {
+        key: [flattened_item.get(key) for flattened_item in flattened_items]
+        for key in keys
+    }
+
+
+def flatten_dict(
+    d: Dict[str, Any], parent_key: str = "", sep: str = "_"
+) -> Dict[str, Any]:
+    """
+    Collapse a nested dictionary into a single level.
+
+    Nested dict keys are joined to their parent key with ``sep``. A list holding only
+    dicts (or nothing) becomes one JSON string; other lists expand by element index.
+
+    Parameters:
+    - d (Dict[str, Any]): The dictionary to flatten.
+    - parent_key (str, optional): Prefix for every produced key. Defaults to "".
+    - sep (str, optional): Separator placed between key parts. Defaults to "_".
+
+    Returns:
+    - Dict[str, Any]: A flattened dictionary.
+
+    Example:
+    >>> nested_dict = {
+    ...     "a": 1,
+    ...     "b": {
+    ...         "c": 2,
+    ...         "d": {
+    ...             "e": 3
+    ...         }
+    ...     },
+    ...     "f": [4, 5, {"g": 6}],
+    ...     "h": [{"i": 7}, {"j": 8}]
+    ... }
+    >>> flatten_dict(nested_dict)
+    {
+        'a': 1,
+        'b_c': 2,
+        'b_d_e': 3,
+        'f_0': 4,
+        'f_1': 5,
+        'f_2_g': 6,
+        'h': '[{"i": 7}, {"j": 8}]'
+    }
+    """
+    flat: Dict[str, Any] = {}
+    for name, value in d.items():
+        path = f"{parent_key}{sep}{name}" if parent_key else name
+        if isinstance(value, dict):
+            flat.update(flatten_dict(value, path, sep=sep))
+            continue
+        if not isinstance(value, list):
+            flat[path] = value
+            continue
+        if all(isinstance(element, dict) for element in value):
+            flat[path] = json.dumps(value)
+            continue
+        for index, element in enumerate(value):
+            indexed = f"{path}{sep}{index}"
+            if isinstance(element, dict):
+                flat.update(flatten_dict(element, indexed, sep=sep))
+            else:
+                flat[indexed] = element
+    return flat
+
+
+def infer_data_types(
+    column_data: Dict[str, List[Any]], n_rows: int = 10000
+) -> List[tuple]:
+    """
+    Infer a data type for every column of a columnar data structure.
+
+    Each column is judged by ``infer_data_type`` on its first ``n_rows`` values
+    only, so values beyond that sample do not influence the result. The output
+    lists the columns in the iteration order of ``column_data``.
+
+    Parameters:
+    - column_data (Dict[str, List[Any]]): Maps each column name to the list of
+      that column's values.
+    - n_rows (int, optional): How many leading rows to sample. Defaults to 10000.
+
+    Returns:
+    - List[tuple]: One ``(column_name, inferred_type)`` tuple per column.
+    """
+    # Slicing caps the work per column at n_rows values.
+    return [
+        (name, infer_data_type(column[:n_rows]))
+        for name, column in column_data.items()
+    ]
+
+
+def infer_data_type(values: List[Any]) -> str:
+    """
+    Infers the most suitable data type for a list of values.
+
+    ``None`` entries are ignored. Whole numbers (including a ``float``/``Decimal`` that
+    ``int()`` converts without loss) get the first signed integer type that fits, then
+    an unsigned one if nothing is negative, else a decimal type. A fractional or
+    non-finite ``float`` makes the result a float type; otherwise it is a decimal type.
+
+    Parameters:
+    - values (List[Any]): A list of values to analyze. The values can be of any type.
+
+    Returns:
+    - str: A string representing the inferred data type. Possible return values are:
+      "int8", "int16", "int32", "int64", "int128", "int256", "uint8", "uint16",
+      "uint32", "uint64", "uint128", "uint256", "decimal128", "decimal256",
+      "float32", "float64", or "string".
+
+    Notes:
+    - If the list is empty or all values are None, the function returns "string".
+    - A str value, or a value that is not numeric at all, returns "string" at once.
+    - NaN and infinities are ignored when sizing the decimal and float types.
+    """
+
+    int_range = {
+        "int8": (-(2**7), 2**7 - 1),
+        "int16": (-(2**15), 2**15 - 1),
+        "int32": (-(2**31), 2**31 - 1),
+        "int64": (-(2**63), 2**63 - 1),
+        "int128": (-(2**127), 2**127 - 1),
+        "int256": (-(2**255), 2**255 - 1),
+    }
+    uint_range = {
+        "uint8": (0, 2**8 - 1),
+        "uint16": (0, 2**16 - 1),
+        "uint32": (0, 2**32 - 1),
+        "uint64": (0, 2**64 - 1),
+        "uint128": (0, 2**128 - 1),
+        "uint256": (0, 2**256 - 1),
+    }
+
+    inf = float("inf")
+    min_val, max_val = inf, -inf  # bounds over the finite values seen so far
+    is_int = True  # every value so far is a whole number
+    is_decimal = True  # every non-integer value so far converted to Decimal
+    all_none = True
+
+    for val in values:
+        if val is None:
+            continue
+        all_none = False
+        if isinstance(val, str):
+            return "string"
+
+        try:
+            num = int(val)
+            # int() truncates, so a float/Decimal only counts when nothing is lost.
+            if isinstance(val, (float, decimal.Decimal)) and num != val:
+                raise ValueError(val)
+        except (ValueError, TypeError, OverflowError):  # OverflowError: int(inf)
+            is_int = False
+            if isinstance(val, float):
+                is_decimal = False
+                num = val
+            else:
+                try:
+                    num = decimal.Decimal(val)
+                except (decimal.InvalidOperation, TypeError, ValueError):
+                    is_decimal = False
+                    try:
+                        num = float(val)
+                    except (ValueError, TypeError, OverflowError):
+                        return "string"
+
+        # NaN and infinities have no usable magnitude; keep them out of the bounds.
+        finite = num.is_finite() if isinstance(num, decimal.Decimal) else abs(num) < inf
+        if finite:
+            max_val = max(max_val, num)
+            min_val = min(min_val, num)
+
+    if all_none:
+        return "string"
+
+    if is_int:
+        for dtype, (min_val_dtype, max_val_dtype) in int_range.items():
+            if min_val_dtype <= min_val and max_val <= max_val_dtype:
+                return dtype
+        if min_val >= 0:  # unsigned types cannot hold negative values
+            for dtype, (_, max_val_dtype) in uint_range.items():
+                if max_val <= max_val_dtype:
+                    return dtype
+
+    # Largest absolute value on either side of zero; 0 if only NaN/inf were seen.
+    magnitude = max(abs(min_val), abs(max_val)) if min_val <= max_val else 0
+    if is_decimal:
+        return "decimal128" if magnitude < 10**38 else "decimal256"
+    return "float32" if magnitude < 3.4e38 else "float64"
diff --git a/docs/_static/chdb-vs-pandas.jpg b/docs/_static/chdb-vs-pandas.jpg