diff --git a/README.md b/README.md index 3f4e582a912..9df702459f8 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ ## Get Started -Get started with **chdb** using our [Installation and Usage Examples](https://doc.chdb.io) +Get started with **chdb** using our [Installation and Usage Examples](https://clickhouse.com/docs/en/chdb)
@@ -276,15 +276,22 @@ For more examples, see [examples](examples) and [tests](tests). ## Demos and Examples -- [Project Documentation](https://doc.chdb.io) and [Usage Examples](https://chdb-io.github.io/#/install?id=installation-1) +- [Project Documentation](https://clickhouse.com/docs/en/chdb) and [Usage Examples](https://clickhouse.com/docs/en/chdb/install/python) - [Colab Notebooks](https://colab.research.google.com/drive/1-zKB6oKfXeptggXi0kUX87iR8ZTSr4P3?usp=sharing) and other [Script Examples](examples) ## Benchmark - [ClickBench of embedded engines](https://benchmark.clickhouse.com/#eyJzeXN0ZW0iOnsiQXRoZW5hIChwYXJ0aXRpb25lZCkiOnRydWUsIkF0aGVuYSAoc2luZ2xlKSI6dHJ1ZSwiQXVyb3JhIGZvciBNeVNRTCI6dHJ1ZSwiQXVyb3JhIGZvciBQb3N0Z3JlU1FMIjp0cnVlLCJCeXRlSG91c2UiOnRydWUsImNoREIiOnRydWUsIkNpdHVzIjp0cnVlLCJjbGlja2hvdXNlLWxvY2FsIChwYXJ0aXRpb25lZCkiOnRydWUsImNsaWNraG91c2UtbG9jYWwgKHNpbmdsZSkiOnRydWUsIkNsaWNrSG91c2UiOnRydWUsIkNsaWNrSG91c2UgKHR1bmVkKSI6dHJ1ZSwiQ2xpY2tIb3VzZSAoenN0ZCkiOnRydWUsIkNsaWNrSG91c2UgQ2xvdWQiOnRydWUsIkNsaWNrSG91c2UgKHdlYikiOnRydWUsIkNyYXRlREIiOnRydWUsIkRhdGFiZW5kIjp0cnVlLCJEYXRhRnVzaW9uIChzaW5nbGUpIjp0cnVlLCJBcGFjaGUgRG9yaXMiOnRydWUsIkRydWlkIjp0cnVlLCJEdWNrREIgKFBhcnF1ZXQpIjp0cnVlLCJEdWNrREIiOnRydWUsIkVsYXN0aWNzZWFyY2giOnRydWUsIkVsYXN0aWNzZWFyY2ggKHR1bmVkKSI6ZmFsc2UsIkdyZWVucGx1bSI6dHJ1ZSwiSGVhdnlBSSI6dHJ1ZSwiSHlkcmEiOnRydWUsIkluZm9icmlnaHQiOnRydWUsIktpbmV0aWNhIjp0cnVlLCJNYXJpYURCIENvbHVtblN0b3JlIjp0cnVlLCJNYXJpYURCIjpmYWxzZSwiTW9uZXREQiI6dHJ1ZSwiTW9uZ29EQiI6dHJ1ZSwiTXlTUUwgKE15SVNBTSkiOnRydWUsIk15U1FMIjp0cnVlLCJQaW5vdCI6dHJ1ZSwiUG9zdGdyZVNRTCI6dHJ1ZSwiUG9zdGdyZVNRTCAodHVuZWQpIjpmYWxzZSwiUXVlc3REQiAocGFydGl0aW9uZWQpIjp0cnVlLCJRdWVzdERCIjp0cnVlLCJSZWRzaGlmdCI6dHJ1ZSwiU2VsZWN0REIiOnRydWUsIlNpbmdsZVN0b3JlIjp0cnVlLCJTbm93Zmxha2UiOnRydWUsIlNRTGl0ZSI6dHJ1ZSwiU3RhclJvY2tzIjp0cnVlLCJUaW1lc2NhbGVEQiAoY29tcHJlc3Npb24pIjp0cnVlLCJUaW1lc2NhbGVEQiI6dHJ1ZX0sInR5cGUiOnsic3RhdGVsZXNzIjpmYWxzZSwibWFuYWdlZCI6ZmFsc2UsIkphdmEiOmZhbHNlLCJjb2x1bW4tb3JpZW50ZWQiOmZhbHNlLCJDKysiOmZhbHNlLCJNeVNRTCBjb21wYXRpYmxlIjpmYWxzZSwicm93LW9yaWVudGVkIjpmYWxzZSwiQyI6ZmFsc2UsIlBvc3RncmVTUUwgY29tcGF0aWJsZSI6ZmFsc2UsIkNsaWNrSG91c2UgZGVyaXZhdGl2ZSI6ZmFsc2UsImVtYmVkZGVkIjp0cnVlLCJzZXJ2ZXJsZXNzIjpmYWxzZSwiUnVzdCI6ZmFsc2UsInNlYXJjaCI6ZmFsc2UsImRvY3VtZW50IjpmYWxzZSwidGltZS1zZXJpZXMiOmZhbHNlfSwibWFjaGluZSI6eyJzZXJ2ZXJsZXNzIjp0cnVlLCIxNmFjdSI6dHJ1ZSwiTCI6dHJ1ZSwiTSI6dHJ1ZSwiUyI6dHJ1ZSwiWFMiOnRydWUsImM2YS5tZXRhbCwgNTAwZ2IgZ3AyIjp0cnVlLCJjNmEuNHhsYXJnZSwgNTAwZ2IgZ3AyIjp0cnVlLCJjNS40eGxhcmdlLCA1MDBnYiBncDIiOnRydWUsIjE2IHRocmVhZHMiOnRydWUsIjIwIHRocmVhZHMiOnRydWUsIjI0IHRocmVhZHMiOnRydWUsIjI4IHRocmVhZHMiOnRydWUsIjMwIHRocmVhZHMiOnRydWUsIjQ4IHRocmVhZHMiOnRydWUsIjYwIHRocmVhZHMiOnRydWUsIm01ZC4yNHhsYXJnZSI6dHJ1ZSwiYzVuLjR4bGFyZ2UsIDIwMGdiIGdwMiI6dHJ1ZSwiYzZhLjR4bGFyZ2UsIDE1MDBnYiBncDIiOnRydWUsImRjMi44eGxhcmdlIjp0cnVlLCJyYTMuMTZ4bGFyZ2UiOnRydWUsInJhMy40eGxhcmdlIjp0cnVlLCJyYTMueGxwbHVzIjp0cnVlLCJTMjQiOnRydWUsIlMyIjp0cnVlLCIyWEwiOnRydWUsIjNYTCI6dHJ1ZSwiNFhMIjp0cnVlLCJYTCI6dHJ1ZX0sImNsdXN0ZXJfc2l6ZSI6eyIxIjp0cnVlLCIyIjp0cnVlLCI0Ijp0cnVlLCI4Ijp0cnVlLCIxNiI6dHJ1ZSwiMzIiOnRydWUsIjY0Ijp0cnVlLCIxMjgiOnRydWUsInNlcnZlcmxlc3MiOnRydWUsInVuZGVmaW5lZCI6dHJ1ZX0sIm1ldHJpYyI6ImhvdCIsInF1ZXJpZXMiOlt0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlXX0=) +- [chDB vs Pandas](https://colab.research.google.com/drive/1FogLujJ_-ds7RGurDrUnK-U0IW8a8Qd0) + +
+ +
+ + ## Documentation -- For chdb specific examples and documentation refer to [doc.chdb.io](https://doc.chdb.io) +- For chdb specific examples and documentation refer to [chDB docs](https://clickhouse.com/docs/en/chdb) - For SQL syntax, please refer to [ClickHouse SQL Reference](https://clickhouse.com/docs/en/sql-reference/syntax) diff --git a/chdb/__init__.py b/chdb/__init__.py index f2396a997da..c365b3e6432 100644 --- a/chdb/__init__.py +++ b/chdb/__init__.py @@ -18,7 +18,7 @@ class ChdbError(Exception): # UDF script path will be f"{g_udf_path}/{func_name}.py" g_udf_path = "" -chdb_version = ("0", "6", "0") +chdb_version = ('0', '6', '0') if sys.version_info[:2] >= (3, 7): # get the path of the current file current_path = os.path.dirname(os.path.abspath(__file__)) @@ -84,6 +84,8 @@ def query(sql, output_format="CSV", path="", udf_path=""): PyReader = _chdb.PyReader +from . import dataframe, dbapi, session, udf, utils + __all__ = [ "PyReader", "ChdbError", @@ -93,4 +95,9 @@ def query(sql, output_format="CSV", path="", udf_path=""): "engine_version", "to_df", "to_arrowTable", + "dataframe", + "dbapi", + "session", + "udf", + "utils", ] diff --git a/chdb/utils/__init__.py b/chdb/utils/__init__.py new file mode 100644 index 00000000000..a31950f6a7d --- /dev/null +++ b/chdb/utils/__init__.py @@ -0,0 +1,3 @@ +from .types import * + +__all__ = ["flatten_dict", "convert_to_columnar", "infer_data_type", "infer_data_types"] diff --git a/chdb/utils/types.py b/chdb/utils/types.py new file mode 100644 index 00000000000..72c54c60d47 --- /dev/null +++ b/chdb/utils/types.py @@ -0,0 +1,236 @@ +from collections import defaultdict +from typing import List, Dict, Any +import json +import decimal + + +def convert_to_columnar(items: List[Dict[str, Any]]) -> Dict[str, List[Any]]: + """ + Converts a list of dictionaries into a columnar format. + + This function takes a list of dictionaries and converts it into a dictionary + where each key corresponds to a column and each value is a list of column values. + Missing values in the dictionaries are represented as None. + + Parameters: + - items (List[Dict[str, Any]]): A list of dictionaries to convert. + + Returns: + - Dict[str, List[Any]]: A dictionary with keys as column names and values as lists + of column values. + + Example: + >>> items = [ + ... {"name": "Alice", "age": 30, "city": "New York"}, + ... {"name": "Bob", "age": 25}, + ... {"name": "Charlie", "city": "San Francisco"} + ... ] + >>> convert_to_columnar(items) + { + 'name': ['Alice', 'Bob', 'Charlie'], + 'age': [30, 25, None], + 'city': ['New York', None, 'San Francisco'] + } + """ + if not items: + return {} + + flattened_items = [flatten_dict(item) for item in items] + columns = defaultdict(list) + keys = set() + + # Collect all possible keys + for flattened_item in flattened_items: + keys.update(flattened_item.keys()) + + # Fill the column lists + for flattened_item in flattened_items: + for key in keys: + columns[key].append(flattened_item.get(key, None)) + + return dict(columns) + + +def flatten_dict( + d: Dict[str, Any], parent_key: str = "", sep: str = "_" +) -> Dict[str, Any]: + """ + Flattens a nested dictionary. + + This function takes a nested dictionary and flattens it, concatenating nested keys + with a separator. Lists of dictionaries are serialized to JSON strings. + + Parameters: + - d (Dict[str, Any]): The dictionary to flatten. + - parent_key (str, optional): The base key to prepend to each key. Defaults to "". + - sep (str, optional): The separator to use between concatenated keys. Defaults to "_". + + Returns: + - Dict[str, Any]: A flattened dictionary. + + Example: + >>> nested_dict = { + ... "a": 1, + ... "b": { + ... "c": 2, + ... "d": { + ... "e": 3 + ... } + ... }, + ... "f": [4, 5, {"g": 6}], + ... "h": [{"i": 7}, {"j": 8}] + ... } + >>> flatten_dict(nested_dict) + { + 'a': 1, + 'b_c': 2, + 'b_d_e': 3, + 'f_0': 4, + 'f_1': 5, + 'f_2_g': 6, + 'h': '[{"i": 7}, {"j": 8}]' + } + """ + items = [] + for k, v in d.items(): + new_key = f"{parent_key}{sep}{k}" if parent_key else k + if isinstance(v, dict): + items.extend(flatten_dict(v, new_key, sep=sep).items()) + elif isinstance(v, list): + if all(isinstance(i, dict) for i in v): + items.append((new_key, json.dumps(v))) + else: + for i, item in enumerate(v): + if isinstance(item, dict): + items.extend( + flatten_dict(item, f"{new_key}{sep}{i}", sep=sep).items() + ) + else: + items.append((f"{new_key}{sep}{i}", item)) + else: + items.append((new_key, v)) + return dict(items) + + +def infer_data_types( + column_data: Dict[str, List[Any]], n_rows: int = 10000 +) -> List[tuple]: + """ + Infers data types for each column in a columnar data structure. + + This function analyzes the values in each column and infers the most suitable + data type for each column, based on a sample of the data. + + Parameters: + - column_data (Dict[str, List[Any]]): A dictionary where keys are column names + and values are lists of column values. + - n_rows (int, optional): The number of rows to sample for type inference. Defaults to 10000. + + Returns: + - List[tuple]: A list of tuples, each containing a column name and its inferred data type. + """ + data_types = [] + for column, values in column_data.items(): + sampled_values = values[:n_rows] + inferred_type = infer_data_type(sampled_values) + data_types.append((column, inferred_type)) + return data_types + + +def infer_data_type(values: List[Any]) -> str: + """ + Infers the most suitable data type for a list of values. + + This function examines a list of values and determines the most appropriate + data type that can represent all the values in the list. It considers integer, + unsigned integer, decimal, and float types, and defaults to "string" if the + values cannot be represented by any numeric type or if all values are None. + + Parameters: + - values (List[Any]): A list of values to analyze. The values can be of any type. + + Returns: + - str: A string representing the inferred data type. Possible return values are: + "int8", "int16", "int32", "int64", "int128", "int256", "uint8", "uint16", + "uint32", "uint64", "uint128", "uint256", "decimal128", "decimal256", + "float32", "float64", or "string". + + Notes: + - If all values in the list are None, the function returns "string". + - If any value in the list is a string, the function immediately returns "string". + - The function assumes that numeric values can be represented as integers, + decimals, or floats based on their range and precision. + """ + + int_range = { + "int8": (-(2**7), 2**7 - 1), + "int16": (-(2**15), 2**15 - 1), + "int32": (-(2**31), 2**31 - 1), + "int64": (-(2**63), 2**63 - 1), + "int128": (-(2**127), 2**127 - 1), + "int256": (-(2**255), 2**255 - 1), + } + uint_range = { + "uint8": (0, 2**8 - 1), + "uint16": (0, 2**16 - 1), + "uint32": (0, 2**32 - 1), + "uint64": (0, 2**64 - 1), + "uint128": (0, 2**128 - 1), + "uint256": (0, 2**256 - 1), + } + + max_val = float("-inf") + min_val = float("inf") + is_int = True + is_uint = True + is_decimal = True + is_float = True + + all_none = True + + for val in values: + if val is None: + continue + all_none = False + if isinstance(val, str): + return "string" + + try: + num = int(val) + max_val = max(max_val, num) + min_val = min(min_val, num) + except (ValueError, TypeError): + is_int = False + is_uint = False + try: + num = decimal.Decimal(val) + max_val = max(max_val, float(num)) + min_val = min(min_val, float(num)) + except (decimal.InvalidOperation, TypeError): + is_decimal = False + try: + num = float(val) + max_val = max(max_val, num) + min_val = min(min_val, num) + except (ValueError, TypeError): + is_float = False + return "string" + + if all_none: + return "string" + + if is_int: + for dtype, (min_val_dtype, max_val_dtype) in int_range.items(): + if min_val_dtype <= min_val and max_val <= max_val_dtype: + return dtype + for dtype, (_, max_val_dtype) in uint_range.items(): + if max_val <= max_val_dtype: + return dtype + + if is_decimal: + return "decimal128" if abs(max_val) < 10**38 else "decimal256" + + if is_float: + return "float32" if abs(max_val) < 3.4e38 else "float64" + + return "string" diff --git a/docs/_static/chdb-vs-pandas.jpg b/docs/_static/chdb-vs-pandas.jpg new file mode 100644 index 00000000000..7bf2c48c029 Binary files /dev/null and b/docs/_static/chdb-vs-pandas.jpg differ diff --git a/src/Processors/Sources/PythonSource.cpp b/src/Processors/Sources/PythonSource.cpp index 6915cb710e8..25f4f0ebf04 100644 --- a/src/Processors/Sources/PythonSource.cpp +++ b/src/Processors/Sources/PythonSource.cpp @@ -1,10 +1,11 @@ #include -#include "base/scope_guard.h" #if USE_PYTHON #include #include +#include #include +#include #include #include #include @@ -19,7 +20,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -69,7 +72,27 @@ void PythonSource::insert_from_list(const py::list & obj, const MutableColumnPtr { py::gil_scoped_acquire acquire; for (auto && item : obj) - column->insert(item.cast()); + { + if constexpr (std::is_same_v) + { + if (PyBool_Check(item.ptr())) + { + column->insert(static_cast(py::cast(item) ? 1 : 0)); + } + else + { + column->insert(py::cast(item)); + } + } + else if (item.is_none()) + { + column->insertDefault(); + } + else + { + column->insert(item.cast()); + } + } } void PythonSource::insert_string_from_array(const py::handle obj, const MutableColumnPtr & column) @@ -106,7 +129,16 @@ void PythonSource::convert_string_array_to_block( offsets.reserve(row_count); for (size_t i = offset; i < offset + row_count; ++i) { - FillColumnString(buf[i], string_column); + auto * obj = buf[i]; + if (!PyUnicode_Check(obj)) + { + LOG_ERROR( + logger, + "Unsupported Python object type {}, Unicode string expected here. Try convert column type to str with `astype(str)`", + Py_TYPE(obj)->tp_name); + throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unsupported Python object type {}", Py_TYPE(obj)->tp_name); + } + FillColumnString(obj, string_column); // Try to help reserve memory for the string column data every 100 rows to avoid frequent reallocations // Check the avg size of the string column data and reserve memory accordingly if ((i - offset) % 10 == 9) @@ -278,11 +310,34 @@ Chunk PythonSource::genChunk(size_t & num_rows, PyObjectVecPtr data) type->getName(), description.sample_block.getByPosition(i).name); } - catch (const Exception & e) + catch (Exception & e) { destory(data); - LOG_ERROR(logger, "Error processing column {}: {}", i, e.what()); - throw; + LOG_ERROR(logger, "Error processing column \"{}\": {}", description.sample_block.getByPosition(i).name, e.what()); + throw Exception( + ErrorCodes::PY_EXCEPTION_OCCURED, + "Error processing column \"{}\": {}", + description.sample_block.getByPosition(i).name, + e.what()); + } + catch (std::exception & e) + { + destory(data); + LOG_ERROR(logger, "Error processing column \"{}\": {}", description.sample_block.getByPosition(i).name, e.what()); + throw Exception( + ErrorCodes::PY_EXCEPTION_OCCURED, + "Error processing column \"{}\": {}", + description.sample_block.getByPosition(i).name, + e.what()); + } + catch (...) + { + destory(data); + LOG_ERROR(logger, "Error processing column \"{}\": unknown exception", description.sample_block.getByPosition(i).name); + throw Exception( + ErrorCodes::PY_EXCEPTION_OCCURED, + "Error processing column \"{}\": unknown exception", + description.sample_block.getByPosition(i).name); } } @@ -415,10 +470,20 @@ Chunk PythonSource::scanDataToChunk() // LOG_DEBUG(logger, "Column {} data: {}", col.name, ss.str()); } } - catch (const Exception & e) + catch (Exception & e) + { + LOG_ERROR(logger, "Error processing column \"{}\": {}", col.name, e.what()); + throw Exception(ErrorCodes::PY_EXCEPTION_OCCURED, "Error processing column \"{}\": {}", col.name, e.what()); + } + catch (std::exception & e) + { + LOG_ERROR(logger, "Error processing column \"{}\": {}", col.name, e.what()); + throw Exception(ErrorCodes::PY_EXCEPTION_OCCURED, "Error processing column \"{}\": {}", col.name, e.what()); + } + catch (...) { - LOG_ERROR(logger, "Error processing column {}: {}", i, e.what()); - throw; + LOG_ERROR(logger, "Error processing column \"{}\": unknown exception", col.name); + throw Exception(ErrorCodes::PY_EXCEPTION_OCCURED, "Error processing column \"{}\": unknown exception", col.name); } } cursor += count; diff --git a/src/Storages/StoragePython.cpp b/src/Storages/StoragePython.cpp index 368acdf28a5..d0b108e9712 100644 --- a/src/Storages/StoragePython.cpp +++ b/src/Storages/StoragePython.cpp @@ -151,6 +151,7 @@ ColumnsDescription StoragePython::getTableStructureFromData(py::object data_sour RE2 pattern_int(R"(\bint(\d+))"); RE2 pattern_generic_int(R"(\bint\b|)"); // Matches generic 'int' RE2 pattern_uint(R"(\buint(\d+))"); + RE2 pattern_bool(R"(\bBool|bool)"); RE2 pattern_float(R"(\b(float|double)(\d+)?)"); RE2 pattern_decimal128(R"(decimal128\((\d+),\s*(\d+)\))"); RE2 pattern_decimal256(R"(decimal256\((\d+),\s*(\d+)\))"); @@ -199,6 +200,10 @@ dtype\('S|dtype\('O||||(); } + else if (RE2::PartialMatch(typeStr, pattern_bool)) + { + data_type = std::make_shared(); + } else if (RE2::PartialMatch(typeStr, pattern_generic_int)) { data_type = std::make_shared(); // Default to 64-bit integers for generic 'int' @@ -334,7 +339,7 @@ std::vector> PyReader::getSchemaFromPyObj(co throw Exception( ErrorCodes::UNKNOWN_FORMAT, - "Unknown data type {} for schema inference. Consider inheriting PyReader and overriding getSchema().", + "Unknown data type {} for schema inference. Consider inheriting PyReader and overriding get_schema().", py::str(data.attr("__class__")).cast()); } diff --git a/tests/test_issue251.py b/tests/test_issue251.py new file mode 100644 index 00000000000..76b9a83d89f --- /dev/null +++ b/tests/test_issue251.py @@ -0,0 +1,56 @@ +#!python3 + +import os +import unittest +import zipfile +import urllib.request + +import pandas as pd +import chdb + + +class TestIssue251(unittest.TestCase): + def setUp(self): + # if /tmp/issue251/artifacts/create_final_community_reports.parquet not exists, + # download https://github.com/user-attachments/files/16361689/parquet-test-data.zip + # and unzip it to /tmp/issue251/ + if not os.path.exists( + "/tmp/issue251/artifacts/create_final_community_reports.parquet" + ): + print("Downloading parquet-test-data.zip") + + url = "https://github.com/user-attachments/files/16361689/parquet-test-data.zip" + os.makedirs("/tmp/issue251/", exist_ok=True) + urllib.request.urlretrieve(url, "/tmp/issue251/parquet-test-data.zip") + with zipfile.ZipFile("/tmp/issue251/parquet-test-data.zip", "r") as zip_ref: + zip_ref.extractall("/tmp/issue251/") + + def test_issue251(self): + df = pd.read_parquet( + "/tmp/issue251/artifacts/create_final_community_reports.parquet", + columns=[ + "id", + "community", + "level", + "title", + "summary", + "findings", + "rank", + "rank_explanation", + ], + ) + + # make pandas show all columns + pd.set_option("display.max_columns", None) + print(df.head(2)) + print(df.dtypes) + try: + chdb.query("FROM Python(df) SELECT * LIMIT 10") + except Exception as e: + self.assertTrue( + "Unsupported Python object type numpy.ndarray" in str(e) + ) + + +if __name__ == "__main__": + unittest.main()