diff --git a/README.md b/README.md
index 3f4e582a912..9df702459f8 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@
## Get Started
-Get started with **chdb** using our [Installation and Usage Examples](https://doc.chdb.io)
+Get started with **chdb** using our [Installation and Usage Examples](https://clickhouse.com/docs/en/chdb)
@@ -276,15 +276,22 @@ For more examples, see [examples](examples) and [tests](tests).
## Demos and Examples
-- [Project Documentation](https://doc.chdb.io) and [Usage Examples](https://chdb-io.github.io/#/install?id=installation-1)
+- [Project Documentation](https://clickhouse.com/docs/en/chdb) and [Usage Examples](https://clickhouse.com/docs/en/chdb/install/python)
- [Colab Notebooks](https://colab.research.google.com/drive/1-zKB6oKfXeptggXi0kUX87iR8ZTSr4P3?usp=sharing) and other [Script Examples](examples)
## Benchmark
- [ClickBench of embedded engines](https://benchmark.clickhouse.com/#eyJzeXN0ZW0iOnsiQXRoZW5hIChwYXJ0aXRpb25lZCkiOnRydWUsIkF0aGVuYSAoc2luZ2xlKSI6dHJ1ZSwiQXVyb3JhIGZvciBNeVNRTCI6dHJ1ZSwiQXVyb3JhIGZvciBQb3N0Z3JlU1FMIjp0cnVlLCJCeXRlSG91c2UiOnRydWUsImNoREIiOnRydWUsIkNpdHVzIjp0cnVlLCJjbGlja2hvdXNlLWxvY2FsIChwYXJ0aXRpb25lZCkiOnRydWUsImNsaWNraG91c2UtbG9jYWwgKHNpbmdsZSkiOnRydWUsIkNsaWNrSG91c2UiOnRydWUsIkNsaWNrSG91c2UgKHR1bmVkKSI6dHJ1ZSwiQ2xpY2tIb3VzZSAoenN0ZCkiOnRydWUsIkNsaWNrSG91c2UgQ2xvdWQiOnRydWUsIkNsaWNrSG91c2UgKHdlYikiOnRydWUsIkNyYXRlREIiOnRydWUsIkRhdGFiZW5kIjp0cnVlLCJEYXRhRnVzaW9uIChzaW5nbGUpIjp0cnVlLCJBcGFjaGUgRG9yaXMiOnRydWUsIkRydWlkIjp0cnVlLCJEdWNrREIgKFBhcnF1ZXQpIjp0cnVlLCJEdWNrREIiOnRydWUsIkVsYXN0aWNzZWFyY2giOnRydWUsIkVsYXN0aWNzZWFyY2ggKHR1bmVkKSI6ZmFsc2UsIkdyZWVucGx1bSI6dHJ1ZSwiSGVhdnlBSSI6dHJ1ZSwiSHlkcmEiOnRydWUsIkluZm9icmlnaHQiOnRydWUsIktpbmV0aWNhIjp0cnVlLCJNYXJpYURCIENvbHVtblN0b3JlIjp0cnVlLCJNYXJpYURCIjpmYWxzZSwiTW9uZXREQiI6dHJ1ZSwiTW9uZ29EQiI6dHJ1ZSwiTXlTUUwgKE15SVNBTSkiOnRydWUsIk15U1FMIjp0cnVlLCJQaW5vdCI6dHJ1ZSwiUG9zdGdyZVNRTCI6dHJ1ZSwiUG9zdGdyZVNRTCAodHVuZWQpIjpmYWxzZSwiUXVlc3REQiAocGFydGl0aW9uZWQpIjp0cnVlLCJRdWVzdERCIjp0cnVlLCJSZWRzaGlmdCI6dHJ1ZSwiU2VsZWN0REIiOnRydWUsIlNpbmdsZVN0b3JlIjp0cnVlLCJTbm93Zmxha2UiOnRydWUsIlNRTGl0ZSI6dHJ1ZSwiU3RhclJvY2tzIjp0cnVlLCJUaW1lc2NhbGVEQiAoY29tcHJlc3Npb24pIjp0cnVlLCJUaW1lc2NhbGVEQiI6dHJ1ZX0sInR5cGUiOnsic3RhdGVsZXNzIjpmYWxzZSwibWFuYWdlZCI6ZmFsc2UsIkphdmEiOmZhbHNlLCJjb2x1bW4tb3JpZW50ZWQiOmZhbHNlLCJDKysiOmZhbHNlLCJNeVNRTCBjb21wYXRpYmxlIjpmYWxzZSwicm93LW9yaWVudGVkIjpmYWxzZSwiQyI6ZmFsc2UsIlBvc3RncmVTUUwgY29tcGF0aWJsZSI6ZmFsc2UsIkNsaWNrSG91c2UgZGVyaXZhdGl2ZSI6ZmFsc2UsImVtYmVkZGVkIjp0cnVlLCJzZXJ2ZXJsZXNzIjpmYWxzZSwiUnVzdCI6ZmFsc2UsInNlYXJjaCI6ZmFsc2UsImRvY3VtZW50IjpmYWxzZSwidGltZS1zZXJpZXMiOmZhbHNlfSwibWFjaGluZSI6eyJzZXJ2ZXJsZXNzIjp0cnVlLCIxNmFjdSI6dHJ1ZSwiTCI6dHJ1ZSwiTSI6dHJ1ZSwiUyI6dHJ1ZSwiWFMiOnRydWUsImM2YS5tZXRhbCwgNTAwZ2IgZ3AyIjp0cnVlLCJjNmEuNHhsYXJnZSwgNTAwZ2IgZ3AyIjp0cnVlLCJjNS40eGxhcmdlLCA1MDBnYiBncDIiOnRydWUsIjE2IHRocmVhZHMiOnRydWUsIjIwIHRocmVhZHMiOnRydWUsIjI0IHRocmVhZHMiOnRydWUsIjI4IHRocmVhZHMiOnRydWUsIjMwIHRocmVhZHMiOnRydWUsIjQ4IHRocmVhZHMiOnRydWUsIjYwIHRocmVhZHMiOnRydWUsIm01ZC4yNHhsYXJnZSI6dHJ1ZSwiYzVuLjR4bGFyZ2UsIDIwMGdiIGdwMiI6dHJ1ZSwiYzZhLjR4bGFyZ2UsIDE1MDBnYiBncDIiOnRydWUsImRjMi44eGxhcmdlIjp0cnVlLCJyYTMuMTZ4bGFyZ2UiOnRydWUsInJhMy40eGxhcmdlIjp0cnVlLCJyYTMueGxwbHVzIjp0cnVlLCJTMjQiOnRydWUsIlMyIjp0cnVlLCIyWEwiOnRydWUsIjNYTCI6dHJ1ZSwiNFhMIjp0cnVlLCJYTCI6dHJ1ZX0sImNsdXN0ZXJfc2l6ZSI6eyIxIjp0cnVlLCIyIjp0cnVlLCI0Ijp0cnVlLCI4Ijp0cnVlLCIxNiI6dHJ1ZSwiMzIiOnRydWUsIjY0Ijp0cnVlLCIxMjgiOnRydWUsInNlcnZlcmxlc3MiOnRydWUsInVuZGVmaW5lZCI6dHJ1ZX0sIm1ldHJpYyI6ImhvdCIsInF1ZXJpZXMiOlt0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlLHRydWUsdHJ1ZSx0cnVlXX0=)
+- [chDB vs Pandas](https://colab.research.google.com/drive/1FogLujJ_-ds7RGurDrUnK-U0IW8a8Qd0)
+
+![chDB vs Pandas](docs/_static/chdb-vs-pandas.jpg)
+
## Documentation
-- For chdb specific examples and documentation refer to [doc.chdb.io](https://doc.chdb.io)
+- For chDB-specific examples and documentation, refer to [chDB docs](https://clickhouse.com/docs/en/chdb)
- For SQL syntax, please refer to [ClickHouse SQL Reference](https://clickhouse.com/docs/en/sql-reference/syntax)
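
Since the README now points readers straight at usage examples, here is a minimal sketch of the core call for orientation; the signature matches `chdb/__init__.py` below, and the printed output is an assumption:

```python
# Minimal chdb usage sketch (assumes `pip install chdb`).
import chdb

# Signature per chdb/__init__.py: query(sql, output_format="CSV", path="", udf_path="")
result = chdb.query("SELECT 1 + 1 AS answer", "CSV")
print(result)  # expected to print: 2
```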
diff --git a/chdb/__init__.py b/chdb/__init__.py
index f2396a997da..c365b3e6432 100644
--- a/chdb/__init__.py
+++ b/chdb/__init__.py
@@ -18,7 +18,7 @@ class ChdbError(Exception):
# UDF script path will be f"{g_udf_path}/{func_name}.py"
g_udf_path = ""
-chdb_version = ("0", "6", "0")
+chdb_version = ('0', '6', '0')
if sys.version_info[:2] >= (3, 7):
# get the path of the current file
current_path = os.path.dirname(os.path.abspath(__file__))
@@ -84,6 +84,8 @@ def query(sql, output_format="CSV", path="", udf_path=""):
PyReader = _chdb.PyReader
+from . import dataframe, dbapi, session, udf, utils
+
__all__ = [
"PyReader",
"ChdbError",
@@ -93,4 +95,9 @@ def query(sql, output_format="CSV", path="", udf_path=""):
"engine_version",
"to_df",
"to_arrowTable",
+ "dataframe",
+ "dbapi",
+ "session",
+ "udf",
+ "utils",
]
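
A short sketch of what the new package-root re-exports enable; `session.Session` and its `query` method are assumed from the project's existing docs, not introduced by this patch:

```python
# Submodules are now importable directly from the package root.
from chdb import session, utils

sess = session.Session()  # assumed API: a temporary, self-cleaning session
print(sess.query("SELECT version()", "CSV"))
```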
diff --git a/chdb/utils/__init__.py b/chdb/utils/__init__.py
new file mode 100644
index 00000000000..a31950f6a7d
--- /dev/null
+++ b/chdb/utils/__init__.py
@@ -0,0 +1,3 @@
+from .types import *
+
+__all__ = ["flatten_dict", "convert_to_columnar", "infer_data_type", "infer_data_types"]
diff --git a/chdb/utils/types.py b/chdb/utils/types.py
new file mode 100644
index 00000000000..72c54c60d47
--- /dev/null
+++ b/chdb/utils/types.py
@@ -0,0 +1,236 @@
+from collections import defaultdict
+from typing import List, Dict, Any
+import json
+import decimal
+
+
+def convert_to_columnar(items: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
+ """
+ Converts a list of dictionaries into a columnar format.
+
+ This function takes a list of dictionaries and converts it into a dictionary
+ where each key corresponds to a column and each value is a list of column values.
+ Missing values in the dictionaries are represented as None.
+
+ Parameters:
+ - items (List[Dict[str, Any]]): A list of dictionaries to convert.
+
+ Returns:
+ - Dict[str, List[Any]]: A dictionary with keys as column names and values as lists
+ of column values.
+
+ Example:
+ >>> items = [
+ ... {"name": "Alice", "age": 30, "city": "New York"},
+ ... {"name": "Bob", "age": 25},
+ ... {"name": "Charlie", "city": "San Francisco"}
+ ... ]
+ >>> convert_to_columnar(items)
+ {
+ 'name': ['Alice', 'Bob', 'Charlie'],
+ 'age': [30, 25, None],
+ 'city': ['New York', None, 'San Francisco']
+ }
+ """
+ if not items:
+ return {}
+
+ flattened_items = [flatten_dict(item) for item in items]
+ columns = defaultdict(list)
+ keys = set()
+
+ # Collect all possible keys
+ for flattened_item in flattened_items:
+ keys.update(flattened_item.keys())
+
+ # Fill the column lists
+ for flattened_item in flattened_items:
+ for key in keys:
+ columns[key].append(flattened_item.get(key, None))
+
+ return dict(columns)
+
+
+def flatten_dict(
+ d: Dict[str, Any], parent_key: str = "", sep: str = "_"
+) -> Dict[str, Any]:
+ """
+ Flattens a nested dictionary.
+
+ This function takes a nested dictionary and flattens it, concatenating nested keys
+ with a separator. Lists of dictionaries are serialized to JSON strings.
+
+ Parameters:
+ - d (Dict[str, Any]): The dictionary to flatten.
+ - parent_key (str, optional): The base key to prepend to each key. Defaults to "".
+ - sep (str, optional): The separator to use between concatenated keys. Defaults to "_".
+
+ Returns:
+ - Dict[str, Any]: A flattened dictionary.
+
+ Example:
+ >>> nested_dict = {
+ ... "a": 1,
+ ... "b": {
+ ... "c": 2,
+ ... "d": {
+ ... "e": 3
+ ... }
+ ... },
+ ... "f": [4, 5, {"g": 6}],
+ ... "h": [{"i": 7}, {"j": 8}]
+ ... }
+ >>> flatten_dict(nested_dict)
+ {
+ 'a': 1,
+ 'b_c': 2,
+ 'b_d_e': 3,
+ 'f_0': 4,
+ 'f_1': 5,
+ 'f_2_g': 6,
+ 'h': '[{"i": 7}, {"j": 8}]'
+ }
+ """
+ items = []
+ for k, v in d.items():
+ new_key = f"{parent_key}{sep}{k}" if parent_key else k
+ if isinstance(v, dict):
+ items.extend(flatten_dict(v, new_key, sep=sep).items())
+ elif isinstance(v, list):
+ if all(isinstance(i, dict) for i in v):
+ items.append((new_key, json.dumps(v)))
+ else:
+ for i, item in enumerate(v):
+ if isinstance(item, dict):
+ items.extend(
+ flatten_dict(item, f"{new_key}{sep}{i}", sep=sep).items()
+ )
+ else:
+ items.append((f"{new_key}{sep}{i}", item))
+ else:
+ items.append((new_key, v))
+ return dict(items)
+
+
+def infer_data_types(
+ column_data: Dict[str, List[Any]], n_rows: int = 10000
+) -> List[tuple]:
+ """
+ Infers data types for each column in a columnar data structure.
+
+ This function analyzes the values in each column and infers the most suitable
+ data type for each column, based on a sample of the data.
+
+ Parameters:
+ - column_data (Dict[str, List[Any]]): A dictionary where keys are column names
+ and values are lists of column values.
+ - n_rows (int, optional): The number of rows to sample for type inference. Defaults to 10000.
+
+ Returns:
+ - List[tuple]: A list of tuples, each containing a column name and its inferred data type.
+ """
+ data_types = []
+ for column, values in column_data.items():
+ sampled_values = values[:n_rows]
+ inferred_type = infer_data_type(sampled_values)
+ data_types.append((column, inferred_type))
+ return data_types
+
+
+def infer_data_type(values: List[Any]) -> str:
+ """
+ Infers the most suitable data type for a list of values.
+
+ This function examines a list of values and determines the most appropriate
+ data type that can represent all the values in the list. It considers integer,
+ unsigned integer, decimal, and float types, and defaults to "string" if the
+ values cannot be represented by any numeric type or if all values are None.
+
+ Parameters:
+ - values (List[Any]): A list of values to analyze. The values can be of any type.
+
+ Returns:
+ - str: A string representing the inferred data type. Possible return values are:
+ "int8", "int16", "int32", "int64", "int128", "int256", "uint8", "uint16",
+ "uint32", "uint64", "uint128", "uint256", "decimal128", "decimal256",
+ "float32", "float64", or "string".
+
+ Notes:
+ - If all values in the list are None, the function returns "string".
+ - If any value in the list is a string, the function immediately returns "string".
+ - The function assumes that numeric values can be represented as integers,
+ decimals, or floats based on their range and precision.
+ """
+
+ int_range = {
+ "int8": (-(2**7), 2**7 - 1),
+ "int16": (-(2**15), 2**15 - 1),
+ "int32": (-(2**31), 2**31 - 1),
+ "int64": (-(2**63), 2**63 - 1),
+ "int128": (-(2**127), 2**127 - 1),
+ "int256": (-(2**255), 2**255 - 1),
+ }
+ uint_range = {
+ "uint8": (0, 2**8 - 1),
+ "uint16": (0, 2**16 - 1),
+ "uint32": (0, 2**32 - 1),
+ "uint64": (0, 2**64 - 1),
+ "uint128": (0, 2**128 - 1),
+ "uint256": (0, 2**256 - 1),
+ }
+
+ max_val = float("-inf")
+ min_val = float("inf")
+ is_int = True
+ is_uint = True
+ is_decimal = True
+ is_float = True
+
+ all_none = True
+
+ for val in values:
+ if val is None:
+ continue
+ all_none = False
+ if isinstance(val, str):
+ return "string"
+
+        try:
+            num = int(val)
+            if num != val:  # reject non-integral values such as 3.5
+                raise ValueError("non-integral value")
+            max_val = max(max_val, num)
+            min_val = min(min_val, num)
+        except (ValueError, TypeError, OverflowError):
+            is_int = False
+            is_uint = False
+ try:
+ num = decimal.Decimal(val)
+ max_val = max(max_val, float(num))
+ min_val = min(min_val, float(num))
+ except (decimal.InvalidOperation, TypeError):
+ is_decimal = False
+ try:
+ num = float(val)
+ max_val = max(max_val, num)
+ min_val = min(min_val, num)
+ except (ValueError, TypeError):
+ is_float = False
+ return "string"
+
+ if all_none:
+ return "string"
+
+ if is_int:
+ for dtype, (min_val_dtype, max_val_dtype) in int_range.items():
+ if min_val_dtype <= min_val and max_val <= max_val_dtype:
+ return dtype
+        if min_val >= 0:
+            for dtype, (_, max_val_dtype) in uint_range.items():
+                if max_val <= max_val_dtype:
+                    return dtype
+
+ if is_decimal:
+ return "decimal128" if abs(max_val) < 10**38 else "decimal256"
+
+ if is_float:
+ return "float32" if abs(max_val) < 3.4e38 else "float64"
+
+ return "string"
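
Taken together, the helpers above turn row-oriented dicts into typed columns. A small walk-through grounded in the code in this file (column order may vary because keys are gathered into a set):

```python
from chdb.utils import convert_to_columnar, infer_data_types

rows = [
    {"user": {"name": "Alice", "age": 30}, "tags": [1, 2]},
    {"user": {"name": "Bob"}, "tags": [3]},
]

cols = convert_to_columnar(rows)
# Nested keys are flattened with "_"; missing values become None.
print(cols["user_age"])  # [30, None]
print(cols["tags_1"])    # [2, None]

# Per-column sampled inference, e.g. ("user_age", "int8"), ("user_name", "string")
print(infer_data_types(cols))
```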
diff --git a/docs/_static/chdb-vs-pandas.jpg b/docs/_static/chdb-vs-pandas.jpg
new file mode 100644
index 00000000000..7bf2c48c029
Binary files /dev/null and b/docs/_static/chdb-vs-pandas.jpg differ
diff --git a/src/Processors/Sources/PythonSource.cpp b/src/Processors/Sources/PythonSource.cpp
index 6915cb710e8..25f4f0ebf04 100644
--- a/src/Processors/Sources/PythonSource.cpp
+++ b/src/Processors/Sources/PythonSource.cpp
@@ -1,10 +1,11 @@
#include
-#include "base/scope_guard.h"
#if USE_PYTHON
#include
#include
+#include
#include
+#include
#include
#include
#include
@@ -19,7 +20,9 @@
#include
#include
#include
+#include
#include
+#include
#include
#include
#include
@@ -69,7 +72,27 @@ void PythonSource::insert_from_list(const py::list & obj, const MutableColumnPtr
{
py::gil_scoped_acquire acquire;
for (auto && item : obj)
-        column->insert(item.cast<T>());
+    {
+        if constexpr (std::is_same_v<T, UInt8>)
+        {
+            if (PyBool_Check(item.ptr()))
+            {
+                column->insert(static_cast<UInt8>(py::cast<bool>(item) ? 1 : 0));
+            }
+            else
+            {
+                column->insert(py::cast<T>(item));
+            }
+        }
+        else if (item.is_none())
+        {
+            column->insertDefault();
+        }
+        else
+        {
+            column->insert(item.cast<T>());
+        }
+    }
}
void PythonSource::insert_string_from_array(const py::handle obj, const MutableColumnPtr & column)
@@ -106,7 +129,16 @@ void PythonSource::convert_string_array_to_block(
offsets.reserve(row_count);
for (size_t i = offset; i < offset + row_count; ++i)
{
- FillColumnString(buf[i], string_column);
+ auto * obj = buf[i];
+ if (!PyUnicode_Check(obj))
+ {
+ LOG_ERROR(
+ logger,
+                    "Unsupported Python object type {}, Unicode string expected here. Try converting the column to str with `astype(str)`",
+ Py_TYPE(obj)->tp_name);
+ throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unsupported Python object type {}", Py_TYPE(obj)->tp_name);
+ }
+ FillColumnString(obj, string_column);
// Try to help reserve memory for the string column data every 100 rows to avoid frequent reallocations
// Check the avg size of the string column data and reserve memory accordingly
if ((i - offset) % 10 == 9)
@@ -278,11 +310,34 @@ Chunk PythonSource::genChunk(size_t & num_rows, PyObjectVecPtr data)
type->getName(),
description.sample_block.getByPosition(i).name);
}
- catch (const Exception & e)
+ catch (Exception & e)
{
destory(data);
- LOG_ERROR(logger, "Error processing column {}: {}", i, e.what());
- throw;
+ LOG_ERROR(logger, "Error processing column \"{}\": {}", description.sample_block.getByPosition(i).name, e.what());
+ throw Exception(
+ ErrorCodes::PY_EXCEPTION_OCCURED,
+ "Error processing column \"{}\": {}",
+ description.sample_block.getByPosition(i).name,
+ e.what());
+ }
+ catch (std::exception & e)
+ {
+ destory(data);
+ LOG_ERROR(logger, "Error processing column \"{}\": {}", description.sample_block.getByPosition(i).name, e.what());
+ throw Exception(
+ ErrorCodes::PY_EXCEPTION_OCCURED,
+ "Error processing column \"{}\": {}",
+ description.sample_block.getByPosition(i).name,
+ e.what());
+ }
+ catch (...)
+ {
+ destory(data);
+ LOG_ERROR(logger, "Error processing column \"{}\": unknown exception", description.sample_block.getByPosition(i).name);
+ throw Exception(
+ ErrorCodes::PY_EXCEPTION_OCCURED,
+ "Error processing column \"{}\": unknown exception",
+ description.sample_block.getByPosition(i).name);
}
}
@@ -415,10 +470,20 @@ Chunk PythonSource::scanDataToChunk()
// LOG_DEBUG(logger, "Column {} data: {}", col.name, ss.str());
}
}
- catch (const Exception & e)
+ catch (Exception & e)
+ {
+ LOG_ERROR(logger, "Error processing column \"{}\": {}", col.name, e.what());
+ throw Exception(ErrorCodes::PY_EXCEPTION_OCCURED, "Error processing column \"{}\": {}", col.name, e.what());
+ }
+ catch (std::exception & e)
+ {
+ LOG_ERROR(logger, "Error processing column \"{}\": {}", col.name, e.what());
+ throw Exception(ErrorCodes::PY_EXCEPTION_OCCURED, "Error processing column \"{}\": {}", col.name, e.what());
+ }
+ catch (...)
{
- LOG_ERROR(logger, "Error processing column {}: {}", i, e.what());
- throw;
+ LOG_ERROR(logger, "Error processing column \"{}\": unknown exception", col.name);
+ throw Exception(ErrorCodes::PY_EXCEPTION_OCCURED, "Error processing column \"{}\": unknown exception", col.name);
}
}
cursor += count;
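
Seen from Python, the `insert_from_list` change above means bools land cleanly in UInt8 columns and `None` falls back to the column default. A hedged sketch of the behavior this patch targets:

```python
import pandas as pd
import chdb

# An object-dtype column mixing Python bools and None; with this patch the
# None row is expected to take the UInt8 column default (0) instead of failing.
df = pd.DataFrame({"flag": [True, False, None]})
print(chdb.query("SELECT flag FROM Python(df)", "CSV"))
```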
diff --git a/src/Storages/StoragePython.cpp b/src/Storages/StoragePython.cpp
index 368acdf28a5..d0b108e9712 100644
--- a/src/Storages/StoragePython.cpp
+++ b/src/Storages/StoragePython.cpp
@@ -151,6 +151,7 @@ ColumnsDescription StoragePython::getTableStructureFromData(py::object data_sour
RE2 pattern_int(R"(\bint(\d+))");
    RE2 pattern_generic_int(R"(\bint\b|<class 'int'>)"); // Matches generic 'int'
RE2 pattern_uint(R"(\buint(\d+))");
+    RE2 pattern_bool(R"(\b(Bool|bool)\b)");
RE2 pattern_float(R"(\b(float|double)(\d+)?)");
RE2 pattern_decimal128(R"(decimal128\((\d+),\s*(\d+)\))");
RE2 pattern_decimal256(R"(decimal256\((\d+),\s*(\d+)\))");
@@ -199,6 +200,10 @@
}
+ else if (RE2::PartialMatch(typeStr, pattern_bool))
+ {
+        data_type = std::make_shared<DataTypeUInt8>();
+ }
else if (RE2::PartialMatch(typeStr, pattern_generic_int))
{
data_type = std::make_shared<DataTypeInt64>(); // Default to 64-bit integers for generic 'int'
@@ -334,7 +339,7 @@ std::vector<std::pair<std::string, std::string>> PyReader::getSchemaFromPyObj(co
throw Exception(
ErrorCodes::UNKNOWN_FORMAT,
- "Unknown data type {} for schema inference. Consider inheriting PyReader and overriding getSchema().",
+ "Unknown data type {} for schema inference. Consider inheriting PyReader and overriding get_schema().",
py::str(data.attr("__class__")).cast<std::string>());
}
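
The reworded message points at `get_schema()` as the escape hatch when inference fails. A sketch of that override; the `read`/`get_schema` contract shown here is assumed from the project's PyReader docs rather than defined in this patch:

```python
import chdb

class ExplicitReader(chdb.PyReader):
    """Supplies the schema explicitly instead of relying on inference."""

    def __init__(self, data):
        self.data = data    # dict: column name -> list of values
        self.cursor = 0
        super().__init__(data)

    def get_schema(self):
        # Explicit (name, type) pairs; "bool" is matched by pattern_bool above.
        return [("score", "float64"), ("flag", "bool")]

    def read(self, col_names, count):
        # Return one list per requested column; empty output signals end of data.
        if self.cursor >= len(self.data[col_names[0]]):
            return []
        block = [self.data[c][self.cursor : self.cursor + count] for c in col_names]
        self.cursor += count
        return block
```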
diff --git a/tests/test_issue251.py b/tests/test_issue251.py
new file mode 100644
index 00000000000..76b9a83d89f
--- /dev/null
+++ b/tests/test_issue251.py
@@ -0,0 +1,56 @@
+#!python3
+
+import os
+import unittest
+import zipfile
+import urllib.request
+
+import pandas as pd
+import chdb
+
+
+class TestIssue251(unittest.TestCase):
+ def setUp(self):
+        # If /tmp/issue251/artifacts/create_final_community_reports.parquet does not exist,
+ # download https://github.com/user-attachments/files/16361689/parquet-test-data.zip
+ # and unzip it to /tmp/issue251/
+ if not os.path.exists(
+ "/tmp/issue251/artifacts/create_final_community_reports.parquet"
+ ):
+ print("Downloading parquet-test-data.zip")
+
+ url = "https://github.com/user-attachments/files/16361689/parquet-test-data.zip"
+ os.makedirs("/tmp/issue251/", exist_ok=True)
+ urllib.request.urlretrieve(url, "/tmp/issue251/parquet-test-data.zip")
+ with zipfile.ZipFile("/tmp/issue251/parquet-test-data.zip", "r") as zip_ref:
+ zip_ref.extractall("/tmp/issue251/")
+
+ def test_issue251(self):
+ df = pd.read_parquet(
+ "/tmp/issue251/artifacts/create_final_community_reports.parquet",
+ columns=[
+ "id",
+ "community",
+ "level",
+ "title",
+ "summary",
+ "findings",
+ "rank",
+ "rank_explanation",
+ ],
+ )
+
+ # make pandas show all columns
+ pd.set_option("display.max_columns", None)
+ print(df.head(2))
+ print(df.dtypes)
+ try:
+ chdb.query("FROM Python(df) SELECT * LIMIT 10")
+ except Exception as e:
+            self.assertIn("Unsupported Python object type numpy.ndarray", str(e))
+
+
+if __name__ == "__main__":
+ unittest.main()
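
The asserted error message also names the workaround. A hedged sketch of applying it outside the test (`findings` is the column holding numpy arrays in this dataset):

```python
# Stringify the ndarray column, as the new error message suggests.
df["findings"] = df["findings"].astype(str)
print(chdb.query("FROM Python(df) SELECT id, findings LIMIT 5", "CSV"))
```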