diff --git a/chdb/__init__.py b/chdb/__init__.py index c5e9109c2fb..f2396a997da 100644 --- a/chdb/__init__.py +++ b/chdb/__init__.py @@ -82,7 +82,10 @@ def query(sql, output_format="CSV", path="", udf_path=""): # alias for query sql = query +PyReader = _chdb.PyReader + __all__ = [ + "PyReader", "ChdbError", "query", "sql", diff --git a/chdb/rwabc.py b/chdb/rwabc.py new file mode 100644 index 00000000000..1b868030bcd --- /dev/null +++ b/chdb/rwabc.py @@ -0,0 +1,65 @@ +from abc import ABC, abstractmethod +from typing import List, Any + + +class PyReader(ABC): + def __init__(self, data: Any): + """ + Initialize the reader with data. The exact type and structure of `data` can vary. + + Args: + data (Any): The data with which to initialize the reader, format and type are not strictly defined. + """ + self.data = data + + @abstractmethod + def read(self, col_names: List[str], count: int) -> List[Any]: + """ + Read a specified number of rows from the given columns and return a list of objects, + where each object is a sequence of values for a column. + + Args: + col_names (List[str]): List of column names to read. + count (int): Maximum number of rows to read. + + Returns: + List[Any]: List of sequences, one for each column. + """ + pass + + +class PyWriter(ABC): + def __init__(self, col_names: List[str], types: List[type], data: Any): + """ + Initialize the writer with column names, their types, and initial data. + + Args: + col_names (List[str]): List of column names. + types (List[type]): List of types corresponding to each column. + data (Any): Initial data to setup the writer, format and type are not strictly defined. + """ + self.col_names = col_names + self.types = types + self.data = data + self.blocks = [] + + @abstractmethod + def write(self, col_names: List[str], columns: List[List[Any]]) -> None: + """ + Save columns of data to blocks. Must be implemented by subclasses. + + Args: + col_names (List[str]): List of column names that are being written. 
+ columns (List[List[Any]]): List of columns data, each column is represented by a list. + """ + pass + + @abstractmethod + def finalize(self) -> bytes: + """ + Assemble and return the final data from blocks. Must be implemented by subclasses. + + Returns: + bytes: The final serialized data. + """ + pass diff --git a/programs/local/LocalChdb.cpp b/programs/local/LocalChdb.cpp index b632a034c89..bc2c1324e0a 100644 --- a/programs/local/LocalChdb.cpp +++ b/programs/local/LocalChdb.cpp @@ -1,6 +1,6 @@ #include "LocalChdb.h" - -#include +#include +#include extern bool inside_main = true; @@ -51,6 +51,7 @@ local_result_v2 * queryToBuffer( for (auto & arg : argv) argv_char.push_back(const_cast(arg.c_str())); + py::gil_scoped_release release; return query_stable_v2(argv_char.size(), argv_char.data()); } @@ -147,6 +148,34 @@ PYBIND11_MODULE(_chdb, m) .def("has_error", &query_result::has_error) .def("error_message", &query_result::error_message); + py::class_>(m, "PyReader") + .def( + py::init(), + "Initialize the reader with data. The exact type and structure of `data` can vary." + "you must hold the data with `self.data` in your inherit class\n\n" + "Args:\n" + " data (Any): The data with which to initialize the reader, format and type are not strictly defined.") + .def( + "read", + [](DB::PyReader & self, const std::vector & col_names, int count) + { + // GIL is held when called from Python code. 
Release it to avoid deadlock + py::gil_scoped_release release; + return std::move(self.read(col_names, count)); + }, + "Read a specified number of rows from the given columns and return a list of objects, " + "where each object is a sequence of values for a column.\n\n" + "Args:\n" + " col_names (List[str]): List of column names to read.\n" + " count (int): Maximum number of rows to read.\n\n" + "Returns:\n" + " List[Any]: List of sequences, one for each column.") + .def( + "get_schema", + &DB::PyReader::getSchema, + "Return a list of column names and their types.\n\n" + "Returns:\n" + " List[str, str]: List of column name and type pairs."); m.def( "query", diff --git a/pyproject.toml b/pyproject.toml index e49ce6b218c..556bcbe6e16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,3 +4,8 @@ build-backend = "setuptools.build_meta" [tool.cibuildwheel] build-frontend = "pip" + +[tool.pyright] +include = ["chdb"] +exclude = ["src", "contrib", "programs", "build", "buildlib", "dist", "venv", ".venv", ".vscode", ".git", "__pycache__", ".mypy_cache", ".pytest_cache"] + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5c4d38ff662..176913f5b54 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -193,7 +193,7 @@ if (TARGET ch_contrib::jemalloc) target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::jemalloc) endif() -target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::sparsehash ch_contrib::incbin) +target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::sparsehash ch_contrib::incbin ch_contrib::icu) add_subdirectory(Access/Common) add_subdirectory(Common/ZooKeeper) @@ -264,6 +264,83 @@ target_link_libraries (dbms PRIVATE ch_contrib::libdivide) if (TARGET ch_contrib::jemalloc) target_link_libraries (dbms PRIVATE ch_contrib::jemalloc) endif() + +# Include path from shell cmd "python3 -m pybind11 --includes" +execute_process(COMMAND python3 -m pybind11 --includes + OUTPUT_VARIABLE PYBIND11_INCLUDES + 
OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Extract and set include directories specifically for source using pybind11 +string(REGEX MATCHALL "-I([^ ]+)" INCLUDE_DIRS_MATCHES ${PYBIND11_INCLUDES}) +set(PYTHON_INCLUDE_DIRS "") +foreach(INCLUDE_DIR_MATCH ${INCLUDE_DIRS_MATCHES}) + string(REGEX REPLACE "-I" "" INCLUDE_DIR_MATCH ${INCLUDE_DIR_MATCH}) + # Accumulate all include directories + set(PYTHON_INCLUDE_DIRS "${PYTHON_INCLUDE_DIRS};${INCLUDE_DIR_MATCH}") +endforeach() + +# Apply the include directories to Storages/StoragePython.cpp and Processors/Sources/PythonSource.cpp +set_source_files_properties(Storages/StoragePython.cpp PROPERTIES INCLUDE_DIRECTORIES "${PYTHON_INCLUDE_DIRS}") +set_source_files_properties(Processors/Sources/PythonSource.cpp PROPERTIES INCLUDE_DIRECTORIES "${PYTHON_INCLUDE_DIRS}") +set_source_files_properties(Columns/ColumnPyObject.cpp PROPERTIES INCLUDE_DIRECTORIES "${PYTHON_INCLUDE_DIRS}") +set_source_files_properties(Common/PythonUtils.cpp PROPERTIES INCLUDE_DIRECTORIES "${PYTHON_INCLUDE_DIRS}") + +# get python version, something like python3.x +execute_process(COMMAND python3 -c "import sys; print('python3.'+str(sys.version_info[1]))" + OUTPUT_VARIABLE PYTHON_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# remove all warning, because pybind11 will generate a lot of warning +if (OS_LINUX) + # pybind11 will try to find x86_64-linux-gnu/${PYTHON_VERSION}/pyconfig.h + # use -idirafter to make it find the right one and not polute the include path + # set_source_files_properties(Storages/StoragePython.cpp PROPERTIES COMPILE_FLAGS + # "-w -idirafter /usr/include -include x86_64-linux-gnu/${PYTHON_VERSION}/pyconfig.h" + # ) + if (PYTHON_VERSION STREQUAL "python3.6" OR PYTHON_VERSION STREQUAL "python3.7" OR PYTHON_VERSION STREQUAL "python3.8") + set_source_files_properties(Storages/StoragePython.cpp PROPERTIES COMPILE_FLAGS + "-w -idirafter /usr/include -include crypt.h" + ) + set_source_files_properties(Processors/Sources/PythonSource.cpp 
PROPERTIES COMPILE_FLAGS + "-w -idirafter /usr/include -include crypt.h" + ) + set_source_files_properties(Columns/ColumnPyObject.cpp PROPERTIES COMPILE_FLAGS + "-w -idirafter /usr/include -include crypt.h" + ) + set_source_files_properties(Common/PythonUtils.cpp PROPERTIES COMPILE_FLAGS + "-w -idirafter /usr/include -include crypt.h" + ) + else() + set_source_files_properties(Storages/StoragePython.cpp PROPERTIES COMPILE_FLAGS + "-w" + ) + set_source_files_properties(Processors/Sources/PythonSource.cpp PROPERTIES COMPILE_FLAGS + "-w" + ) + set_source_files_properties(Columns/ColumnPyObject.cpp PROPERTIES COMPILE_FLAGS + "-w" + ) + set_source_files_properties(Common/PythonUtils.cpp PROPERTIES COMPILE_FLAGS + "-w" + ) + endif() +elseif (OS_DARWIN) + set_source_files_properties(Storages/StoragePython.cpp PROPERTIES COMPILE_FLAGS + "-w" + ) + set_source_files_properties(Processors/Sources/PythonSource.cpp PROPERTIES COMPILE_FLAGS + "-w" + ) + set_source_files_properties(Columns/ColumnPyObject.cpp PROPERTIES COMPILE_FLAGS + "-w" + ) + set_source_files_properties(Common/PythonUtils.cpp PROPERTIES COMPILE_FLAGS + "-w" + ) +endif() + set (all_modules dbms) macro (dbms_target_include_directories) diff --git a/src/Columns/ColumnString.h b/src/Columns/ColumnString.h index e8e5ebbcbf9..f507cbeb629 100644 --- a/src/Columns/ColumnString.h +++ b/src/Columns/ColumnString.h @@ -1,7 +1,8 @@ #pragma once -#include #include +#include +#include #include #include @@ -154,7 +155,6 @@ class ColumnString final : public COWHelper { const size_t old_size = chars.size(); const size_t new_size = old_size + length + 1; - chars.resize(new_size); if (length) memcpy(chars.data() + old_size, pos, length); diff --git a/src/Columns/ColumnVectorHelper.h b/src/Columns/ColumnVectorHelper.h index b8ea6ca427f..e75784e09c5 100644 --- a/src/Columns/ColumnVectorHelper.h +++ b/src/Columns/ColumnVectorHelper.h @@ -40,6 +40,14 @@ class ColumnVectorHelper : public IColumn reinterpret_cast(this) + sizeof(*this)) 
->push_back_raw(ptr); } + + template + void appendRawData(const char * ptr, size_t count) + { + return reinterpret_cast, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD> *>( + reinterpret_cast(this) + sizeof(*this)) + ->append_raw(ptr, count); + } }; } diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 28f8e6c6021..0b86b4e2f15 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -586,6 +586,8 @@ M(704, CANNOT_USE_QUERY_CACHE_WITH_NONDETERMINISTIC_FUNCTIONS) \ M(705, TABLE_NOT_EMPTY) \ M(706, LIBSSH_ERROR) \ + M(707, PY_EXCEPTION_OCCURED) \ + M(708, PY_OBJECT_NOT_FOUND) \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ M(1001, STD_EXCEPTION) \ diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index 68c1e325f0c..f386a2e397c 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -277,6 +277,18 @@ class PODArrayBase : private boost::noncopyable, private TAllocator /// empty c_end += ELEMENT_SIZE; } + template + void append_raw(const void * ptr, size_t count, TAllocatorParams &&... 
allocator_params) /// NOLINT + { + size_t bytes_to_copy = byte_size(count); + size_t required_capacity = size() + bytes_to_copy; + if (unlikely(required_capacity > capacity())) + reserve(required_capacity, std::forward(allocator_params)...); + + memcpy(c_end, ptr, bytes_to_copy); + c_end += bytes_to_copy; + } + void protect() { #ifndef NDEBUG diff --git a/src/Common/PythonUtils.cpp b/src/Common/PythonUtils.cpp new file mode 100644 index 00000000000..5eda802bab1 --- /dev/null +++ b/src/Common/PythonUtils.cpp @@ -0,0 +1,318 @@ +#include + +#include +#include +#include +#include +#include +#include +#include "Columns/ColumnString.h" + +namespace DB +{ + +const char * ConvertPyUnicodeToUtf8(const void * input, int kind, size_t codepoint_cnt, size_t & output_size) +{ + if (input == nullptr) + return nullptr; + + char * output_buffer = new char[4 * codepoint_cnt]; // Allocate buffer for UTF-8 output + + size_t real_size = 0; + + switch (kind) + { + case 1: { // Handle 1-byte characters (Latin1/ASCII equivalent in ICU) + const char * start = (const char *)input; + const char * end = start + codepoint_cnt; + char code_unit; + char * target = output_buffer; + int32_t append_size = 0; + + while (start < end) + { + code_unit = *start++; + U8_APPEND_UNSAFE(target, append_size, code_unit); + } + real_size += append_size; + output_buffer[real_size] = '\0'; // Null terminate the output string + // LOG_DEBUG(&Poco::Logger::get("PythonUtils"), "Coverted 1byte String: {}", output_buffer); + break; + } + case 2: { // Handle 2-byte characters (UTF-16 equivalent) + const UChar * start = (const UChar *)input; + const UChar * end = start + codepoint_cnt; + UChar code_unit; + char * target = output_buffer; + int32_t append_size = 0; + + while (start < end) + { + code_unit = *start++; + U8_APPEND_UNSAFE(target, append_size, code_unit); + } + real_size += append_size; + output_buffer[real_size] = '\0'; // Null terminate the output string + // LOG_DEBUG(&Poco::Logger::get("PythonUtils"), 
"Coverted 2byte String: {}", output_buffer); + break; + } + case 4: { // Handle 4-byte characters (Assume UCS-4/UTF-32) + const UInt32 * start = (const UInt32 *)input; + const UInt32 * end = start + codepoint_cnt; + UInt32 code_unit; + char * target = output_buffer; + int32_t append_size = 0; + + while (start < end) + { + code_unit = *start++; + U8_APPEND_UNSAFE(target, append_size, code_unit); + } + real_size += append_size; + output_buffer[real_size] = '\0'; // Null terminate the output string + // LOG_DEBUG(&Poco::Logger::get("PythonUtils"), "Coverted 4byte String: {}", output_buffer); + break; + } + default: + delete[] output_buffer; // Clean up memory allocation if kind is unsupported + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported unicode kind {}", kind); + } + + output_size = real_size; + return output_buffer; +} + +size_t +ConvertPyUnicodeToUtf8(const void * input, int kind, size_t codepoint_cnt, ColumnString::Offsets & offsets, ColumnString::Chars & chars) +{ + if (input == nullptr) + return 0; + + size_t estimated_size = codepoint_cnt * 4 + 1; // Allocate buffer for UTF-8 output + size_t chars_cursor = chars.size(); + size_t target_size = chars_cursor + estimated_size; + chars.resize(target_size); + + switch (kind) + { + case 1: { // Handle 1-byte characters (Latin1/ASCII equivalent in ICU) + const char * start = (const char *)input; + const char * end = start + codepoint_cnt; + char code_unit; + int32_t append_size = 0; + + while (start < end) + { + code_unit = *start++; + U8_APPEND_UNSAFE(chars.data(), chars_cursor, code_unit); + } + break; + } + case 2: { // Handle 2-byte characters (UTF-16 equivalent) + const UChar * start = (const UChar *)input; + const UChar * end = start + codepoint_cnt; + UChar code_unit; + int32_t append_size = 0; + + while (start < end) + { + code_unit = *start++; + U8_APPEND_UNSAFE(chars.data(), chars_cursor, code_unit); + } + break; + } + case 4: { // Handle 4-byte characters (Assume UCS-4/UTF-32) + const UInt32 * 
start = (const UInt32 *)input; + const UInt32 * end = start + codepoint_cnt; + UInt32 code_unit; + int32_t append_size = 0; + + while (start < end) + { + code_unit = *start++; + U8_APPEND_UNSAFE(chars.data(), chars_cursor, code_unit); + } + break; + } + default: + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported unicode kind {}", kind); + } + + chars[chars_cursor++] = '\0'; // Null terminate the output string and increase the cursor + offsets.push_back(chars_cursor); + chars.resize_assume_reserved(chars_cursor); + + return chars_cursor; +} + +void FillColumnString(PyObject * obj, ColumnString * column) +{ + ColumnString::Offsets & offsets = column->getOffsets(); + ColumnString::Chars & chars = column->getChars(); + if (PyUnicode_IS_COMPACT_ASCII(obj)) + { + const char * data = reinterpret_cast(PyUnicode_1BYTE_DATA(obj)); + size_t unicode_len = PyUnicode_GET_LENGTH(obj); + column->insertData(data, unicode_len); + } + else + { + PyCompactUnicodeObject * unicode = reinterpret_cast(obj); + if (unicode->utf8 != nullptr) + { + // It's utf8 string, treat it like ASCII + const char * data = reinterpret_cast(unicode->utf8); + column->insertData(data, unicode->utf8_length); + } + else if (PyUnicode_IS_COMPACT(obj)) + { + auto kind = PyUnicode_KIND(obj); + const char * data; + size_t codepoint_cnt; + + if (kind == PyUnicode_1BYTE_KIND) + data = reinterpret_cast(PyUnicode_1BYTE_DATA(obj)); + else if (kind == PyUnicode_2BYTE_KIND) + data = reinterpret_cast(PyUnicode_2BYTE_DATA(obj)); + else if (kind == PyUnicode_4BYTE_KIND) + data = reinterpret_cast(PyUnicode_4BYTE_DATA(obj)); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported unicode kind {}", kind); + codepoint_cnt = PyUnicode_GET_LENGTH(obj); + ConvertPyUnicodeToUtf8(data, kind, codepoint_cnt, offsets, chars); + } + else + { + // always convert it to utf8, but this case is rare, here goes the slow path + py::gil_scoped_acquire acquire; + Py_ssize_t bytes_size = -1; + const char * data = 
PyUnicode_AsUTF8AndSize(obj, &bytes_size); + if (bytes_size < 0) + throw Exception(ErrorCodes::PY_EXCEPTION_OCCURED, "Failed to convert Python unicode object to UTF-8"); + column->insertData(data, bytes_size); + } + } +} + + +const char * GetPyUtf8StrData(PyObject * obj, size_t & buf_len) +{ + // See: https://github.com/python/cpython/blob/3.9/Include/cpython/unicodeobject.h#L81 + if (PyUnicode_IS_COMPACT_ASCII(obj)) + { + const char * data = reinterpret_cast(PyUnicode_1BYTE_DATA(obj)); + buf_len = PyUnicode_GET_LENGTH(obj); + return data; + } + else + { + PyCompactUnicodeObject * unicode = reinterpret_cast(obj); + if (unicode->utf8 != nullptr) + { + // It's utf8 string, treat it like ASCII + const char * data = reinterpret_cast(unicode->utf8); + buf_len = unicode->utf8_length; + return data; + } + else if (PyUnicode_IS_COMPACT(obj)) + { + auto kind = PyUnicode_KIND(obj); + /// We could not use the implementation provided by CPython like below because it requires GIL holded by the caller + // if (kind == PyUnicode_1BYTE_KIND || kind == PyUnicode_2BYTE_KIND || kind == PyUnicode_4BYTE_KIND) + // { + // // always convert it to utf8 + // const char * data = PyUnicode_AsUTF8AndSize(obj, &unicode->utf8_length); + // buf_len = unicode->utf8_length; + // // set the utf8 buffer back + // unicode->utf8 = const_cast(data); + // return data; + // } + const char * data; + size_t codepoint_cnt; + + if (kind == PyUnicode_1BYTE_KIND) + data = reinterpret_cast(PyUnicode_1BYTE_DATA(obj)); + else if (kind == PyUnicode_2BYTE_KIND) + data = reinterpret_cast(PyUnicode_2BYTE_DATA(obj)); + else if (kind == PyUnicode_4BYTE_KIND) + data = reinterpret_cast(PyUnicode_4BYTE_DATA(obj)); + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported unicode kind {}", kind); + // always convert it to utf8, and we can't use as function provided by CPython because it requires GIL + // holded by the caller. 
So we have to do it manually with libicu + codepoint_cnt = PyUnicode_GET_LENGTH(obj); + data = ConvertPyUnicodeToUtf8(data, kind, codepoint_cnt, buf_len); + // set the utf8 buffer back like PyUnicode_AsUTF8AndSize does, so that we can reuse it + // and also we can avoid the memory leak + unicode->utf8 = const_cast(data); + unicode->utf8_length = buf_len; + return data; + } + else + { + // always convert it to utf8, but this case is rare, here goes the slow path + py::gil_scoped_acquire acquire; + // PyUnicode_AsUTF8AndSize caches the UTF-8 encoded string in the unicodeobject + // and subsequent calls will return the same string. The memory is released + // when the unicodeobject is deallocated. + Py_ssize_t bytes_size = -1; + const char * data = PyUnicode_AsUTF8AndSize(obj, &bytes_size); + if (bytes_size < 0) + throw Exception(ErrorCodes::PY_EXCEPTION_OCCURED, "Failed to convert Python unicode object to UTF-8"); + buf_len = bytes_size; + return data; + } + } +} + +bool _isInheritsFromPyReader(const py::handle & obj) +{ + // Check directly if obj is an instance of a class named "PyReader" + if (py::str(obj.attr("__class__").attr("__name__")).cast() == "PyReader") + return true; + + // Check the direct base classes of obj's class for "PyReader" + py::tuple bases = obj.attr("__class__").attr("__bases__"); + for (auto base : bases) + if (py::str(base.attr("__name__")).cast() == "PyReader") + return true; + + return false; +} + +// Will try to get the ref of py::array from pandas Series, or PyArrow Table +// without import numpy or pyarrow. Just from class name for now. 
+const void * tryGetPyArray(const py::object & obj, py::handle & result, std::string & type_name, size_t & row_count) +{ + py::gil_scoped_acquire acquire; + type_name = py::str(obj.attr("__class__").attr("__name__")).cast(); + if (type_name == "ndarray") + { + // Return the handle of py::array directly + row_count = py::len(obj); + py::array array = obj.cast(); + result = array; + return array.data(); + } + else if (type_name == "Series") + { + // Try to get the handle of py::array from pandas Series + py::array array = obj.attr("values"); + row_count = py::len(obj); + result = array; + return array.data(); + } + else if (type_name == "Table") + { + // Try to get the handle of py::array from PyArrow Table + py::array array = obj.attr("to_pandas")(); + row_count = py::len(obj); + result = array; + return array.data(); + } + + // chdb todo: maybe convert list to py::array? + + return nullptr; +} +} diff --git a/src/Common/PythonUtils.h b/src/Common/PythonUtils.h new file mode 100644 index 00000000000..9069febb68f --- /dev/null +++ b/src/Common/PythonUtils.h @@ -0,0 +1,203 @@ +#pragma once + +#include +#include +// #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int NOT_IMPLEMENTED; +extern const int PY_EXCEPTION_OCCURED; +} + +namespace py = pybind11; + + +struct ColumnWrapper +{ + void * buf; // we may modify the data when cast it to PyObject **, so we need a non-const pointer + size_t row_count; + py::handle data; + DataTypePtr dest_type; + std::string py_type; //py::handle type, eg. numpy.ndarray; + std::string row_format; + std::string encoding; // utf8, utf16, utf32, etc. 
+ std::string name; +}; + +using PyObjectVec = std::vector; +using PyObjectVecPtr = std::shared_ptr; +using PyColumnVec = std::vector; +using PyColumnVecPtr = std::shared_ptr; + +// Template wrapper function to handle any return type +template +auto execWithGIL(Func func, Args &&... args) -> decltype(func(std::forward(args)...)) +{ + py::gil_scoped_acquire acquire; + return func(std::forward(args)...); +} + +// Helper function to convert Python 1,2,4 bytes unicode string to utf8 with icu4c +// kind: 1 for 1-byte characters (Latin1/ASCII equivalent in ICU) +// 2 for 2-byte characters (UTF-16 equivalent) +// 4 for 4-byte characters (Assume UCS-4/UTF-32) +const char * ConvertPyUnicodeToUtf8(const void * input, int kind, size_t codepoint_cnt, size_t & output_size); + +size_t +ConvertPyUnicodeToUtf8(const void * input, int kind, size_t codepoint_cnt, ColumnString::Offsets & offsets, ColumnString::Chars & chars); + +const char * GetPyUtf8StrData(PyObject * obj, size_t & buf_len); + +void FillColumnString(PyObject * obj, ColumnString * column); + +inline const char * GetPyUtf8StrDataWithGIL(PyObject * obj, size_t & buf_len) +{ + return execWithGIL([&]() { return GetPyUtf8StrData(obj, buf_len); }); +} + + +// Helper function to check if an object's class is or inherits from PyReader with a maximum depth +bool _isInheritsFromPyReader(const py::handle & obj); + +inline bool isInheritsFromPyReader(const py::object & obj) +{ + return execWithGIL([&]() { return _isInheritsFromPyReader(obj); }); +} + +// Helper function to check if object is a pandas DataFrame +inline bool isPandasDf(const py::object & obj) +{ + return execWithGIL( + [&]() + { + auto pd_data_frame_type = py::module_::import("pandas").attr("DataFrame"); + return py::isinstance(obj, pd_data_frame_type); + }); +} + +// Helper function to check if object is a PyArrow Table +inline bool isPyarrowTable(const py::object & obj) +{ + return execWithGIL( + [&]() + { + auto table_type = 
py::module_::import("pyarrow").attr("Table"); + return py::isinstance(obj, table_type); + }); +} + +// Specific wrappers for common use cases +inline auto castToPyList(const py::object & obj) +{ + return execWithGIL([&]() { return obj.cast(); }); +} + +inline auto castToPyArray(const py::object & obj) +{ + return execWithGIL([&]() { return obj.cast(); }); +} + +inline std::string castToStr(const py::object & obj) +{ + return execWithGIL([&]() { return py::str(obj).cast(); }); +} + +inline std::string getPyType(const py::object & obj) +{ + return execWithGIL([&]() { return obj.get_type().attr("__name__").cast(); }); +} + +template +inline std::vector castToVector(const py::object & obj) +{ + return execWithGIL([&]() { return obj.cast>(); }); +} + +inline std::vector castToPyHandleVector(const py::handle obj) +{ + return execWithGIL([&]() { return obj.cast>(); }); +} + +template +inline std::shared_ptr> castToSharedPtrVector(const py::object & obj) +{ + return execWithGIL([&]() { return std::make_shared>(obj.cast>()); }); +} + +inline size_t getObjectLength(const py::object & obj) +{ + return execWithGIL([&]() { return py::len(obj); }); +} + +inline py::object getValueByKey(const py::object & obj, const std::string & key) +{ + return execWithGIL([&]() { return obj[py::str(key)]; }); +} + +inline size_t getLengthOfValueByKey(const py::object & obj, const std::string & key) +{ + return execWithGIL([&]() { return py::len(obj[py::str(key)]); }); +} + +template +inline T castObject(const py::object & obj) +{ + return execWithGIL([&]() { return obj.cast(); }); +} + +inline bool hasAttribute(const py::object & obj, const char * attr_name) +{ + return execWithGIL([&]() { return py::hasattr(obj, attr_name); }); +} + +inline std::string getStringAttribute(const py::object & obj, const char * attr_name) +{ + return execWithGIL([&]() { return obj.attr(attr_name).cast(); }); +} + +template +inline bool isInstanceOf(const py::object & obj) +{ + return execWithGIL([&]() { return 
py::isinstance(obj); }); +} + +inline size_t getPythonObjectLength(const py::object & obj) +{ + return execWithGIL([&]() { return py::len(obj); }); +} + +inline py::object getAttribute(const py::object & obj, const char * name) +{ + return execWithGIL([&]() { return obj.attr(name); }); +} + +inline py::object callMethod(const py::object & obj, const char * method_name) +{ + return execWithGIL([&]() { return obj.attr(method_name)(); }); +} + +inline std::vector readData(const py::object & data_source, const std::vector & names, size_t cursor, size_t count) +{ + return execWithGIL([&]() { return data_source.attr("read")(names, cursor, count).cast>(); }); +} + +const void * tryGetPyArray(const py::object & obj, py::handle & result, std::string & type_name, size_t & row_count); + +} // namespace DB diff --git a/src/Core/TypeId.h b/src/Core/TypeId.h index 9c634d2321c..e8b2223e190 100644 --- a/src/Core/TypeId.h +++ b/src/Core/TypeId.h @@ -49,6 +49,7 @@ enum class TypeIndex IPv4, IPv6, JSONPaths, + PyObject, }; /** diff --git a/src/DataTypes/Utils.cpp b/src/DataTypes/Utils.cpp index e58331a8bcb..e637a8aa379 100644 --- a/src/DataTypes/Utils.cpp +++ b/src/DataTypes/Utils.cpp @@ -224,6 +224,11 @@ bool canBeSafelyCasted(const DataTypePtr & from_type, const DataTypePtr & to_typ case TypeIndex::Nothing: case TypeIndex::JSONPaths: return false; + case TypeIndex::PyObject: { + if (to_which_type.isString()) + return true; + return false; + } } return true; diff --git a/src/Processors/Sources/PythonSource.cpp b/src/Processors/Sources/PythonSource.cpp new file mode 100644 index 00000000000..6fe9e3eff12 --- /dev/null +++ b/src/Processors/Sources/PythonSource.cpp @@ -0,0 +1,432 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + 
+namespace py = pybind11; + +namespace ErrorCodes +{ +extern const int PY_OBJECT_NOT_FOUND; +extern const int PY_EXCEPTION_OCCURED; +} + +PythonSource::PythonSource( + py::object & data_source_, + const Block & sample_block_, + PyColumnVecPtr column_cache, + size_t data_source_row_count, + size_t max_block_size_, + size_t stream_index, + size_t num_streams) + : ISource(sample_block_.cloneEmpty()) + , data_source(data_source_) + , sample_block(sample_block_) + , column_cache(column_cache) + , data_source_row_count(data_source_row_count) + , max_block_size(max_block_size_) + , stream_index(stream_index) + , num_streams(num_streams) + , cursor(0) +{ + description.init(sample_block_); +} + +template +void PythonSource::insert_from_list(const py::list & obj, const MutableColumnPtr & column) +{ + py::gil_scoped_acquire acquire; + for (auto && item : obj) + column->insert(item.cast()); +} + +void PythonSource::insert_string_from_array(const py::handle obj, const MutableColumnPtr & column) +{ + auto array = castToPyHandleVector(obj); + for (auto && item : array) + { + size_t str_len; + const char * ptr = GetPyUtf8StrData(item.ptr(), str_len); + column->insertData(ptr, str_len); + } +} + +void PythonSource::insert_string_from_array_raw( + PyObject ** buf, const MutableColumnPtr & column, const size_t offset, const size_t row_count) +{ + column->reserve(row_count); + for (size_t i = offset; i < offset + row_count; ++i) + { + size_t str_len; + const char * ptr = GetPyUtf8StrData(buf[i], str_len); + column->insertData(ptr, str_len); + } +} + +void PythonSource::convert_string_array_to_block( + PyObject ** buf, const MutableColumnPtr & column, const size_t offset, const size_t row_count) +{ + ColumnString * string_column = typeid_cast(column.get()); + if (string_column == nullptr) + throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Column is not a string column"); + ColumnString::Chars & data = string_column->getChars(); + ColumnString::Offsets & offsets = 
string_column->getOffsets(); + offsets.reserve(row_count); + for (size_t i = offset; i < offset + row_count; ++i) + { + FillColumnString(buf[i], string_column); + // Try to help reserve memory for the string column data every 100 rows to avoid frequent reallocations + // Check the avg size of the string column data and reserve memory accordingly + if ((i - offset) % 100 == 99) + { + size_t data_size = data.size(); + size_t counter = i - offset + 1; + size_t avg_size = data_size / counter; + size_t reserve_size = avg_size * row_count; + if (reserve_size > data.capacity()) + { + LOG_DEBUG(logger, "Reserving memory for string column data: {} bytes", reserve_size); + data.reserve(reserve_size); + } + } + } +} + +template +void PythonSource::insert_from_ptr(const void * ptr, const MutableColumnPtr & column, const size_t offset, const size_t row_count) +{ + column->reserve(row_count); + // get the raw data from the array and memcpy it into the column + ColumnVectorHelper * helper = static_cast(column.get()); + const char * start = static_cast(ptr) + offset * sizeof(T); + helper->appendRawData(start, row_count); +} + + +template +ColumnPtr PythonSource::convert_and_insert(const py::object & obj, UInt32 scale) +{ + MutableColumnPtr column; + if constexpr (std::is_same_v || std::is_same_v || std::is_same_v) + column = ColumnDecimal::create(0, scale); + else if constexpr (std::is_same_v) + column = ColumnString::create(); + else + column = ColumnVector::create(); + + std::string type_name; + size_t row_count; + py::handle py_array; + const void * data = tryGetPyArray(obj, py_array, type_name, row_count); + if (!py_array.is_none()) + { + if constexpr (std::is_same_v) + insert_string_from_array(py_array, column); + else + insert_from_ptr(data, column, 0, row_count); + return column; + } + + if (type_name == "list") + { + //reserve the size of the column + column->reserve(row_count); + insert_from_list(obj, column); + return column; + } + + throw 
Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unsupported type {} for value {}", getPyType(obj), castToStr(obj)); +} + + +template +ColumnPtr PythonSource::convert_and_insert_array(const ColumnWrapper & col_wrap, size_t & cursor, const size_t count, UInt32 scale) +{ + MutableColumnPtr column; + if constexpr (std::is_same_v || std::is_same_v || std::is_same_v) + column = ColumnDecimal::create(0, scale); + else if constexpr (std::is_same_v) + column = ColumnString::create(); + else + column = ColumnVector::create(); + + if (col_wrap.data.is_none()) + throw Exception(ErrorCodes::PY_EXCEPTION_OCCURED, "Column data is None"); + + if constexpr (std::is_same_v) + convert_string_array_to_block(static_cast(col_wrap.buf), column, cursor, count); + else + insert_from_ptr(col_wrap.buf, column, cursor, count); + + return column; +} + +void PythonSource::destory(PyObjectVecPtr & data) +{ + // manually destory PyObjectVec and trigger the py::object dec_ref with GIL holded + py::gil_scoped_acquire acquire; + data->clear(); + data.reset(); +} + +Chunk PythonSource::genChunk(size_t & num_rows, PyObjectVecPtr data) +{ + Columns columns(description.sample_block.columns()); + for (size_t i = 0; i < data->size(); ++i) + { + if (i == 0) + num_rows = getObjectLength((*data)[i]); + const auto & column = (*data)[i]; + const auto & type = description.sample_block.getByPosition(i).type; + WhichDataType which(type); + + try + { + // Dispatch to the appropriate conversion function based on data type + if (which.isUInt8()) + columns[i] = convert_and_insert(column); + else if (which.isUInt16()) + columns[i] = convert_and_insert(column); + else if (which.isUInt32()) + columns[i] = convert_and_insert(column); + else if (which.isUInt64()) + columns[i] = convert_and_insert(column); + else if (which.isUInt128()) + columns[i] = convert_and_insert(column); + else if (which.isUInt256()) + columns[i] = convert_and_insert(column); + else if (which.isInt8()) + columns[i] = convert_and_insert(column); + else if 
(which.isInt16()) + columns[i] = convert_and_insert(column); + else if (which.isInt32()) + columns[i] = convert_and_insert(column); + else if (which.isInt64()) + columns[i] = convert_and_insert(column); + else if (which.isInt128()) + columns[i] = convert_and_insert(column); + else if (which.isInt256()) + columns[i] = convert_and_insert(column); + else if (which.isFloat32()) + columns[i] = convert_and_insert(column); + else if (which.isFloat64()) + columns[i] = convert_and_insert(column); + else if (which.isDecimal128()) + { + const auto & dtype = typeid_cast *>(type.get()); + columns[i] = convert_and_insert(column, dtype->getScale()); + } + else if (which.isDecimal256()) + { + const auto & dtype = typeid_cast *>(type.get()); + columns[i] = convert_and_insert(column, dtype->getScale()); + } + else if (which.isDateTime()) + columns[i] = convert_and_insert(column); + else if (which.isDateTime64()) + columns[i] = convert_and_insert(column); + else if (which.isString()) + columns[i] = convert_and_insert(column); + else + throw Exception( + ErrorCodes::BAD_TYPE_OF_FIELD, + "Unsupported type {} for column {}", + type->getName(), + description.sample_block.getByPosition(i).name); + } + catch (const Exception & e) + { + destory(data); + LOG_ERROR(logger, "Error processing column {}: {}", i, e.what()); + throw; + } + } + + destory(data); + + if (num_rows == 0) + return {}; + + return Chunk(std::move(columns), num_rows); +} + +std::shared_ptr +PythonSource::scanData(const py::object & data, const std::vector & col_names, size_t & cursor, size_t count) +{ + py::gil_scoped_acquire acquire; + auto block = std::make_shared(); + // Access columns directly by name and slice + for (const auto & col : col_names) + { + py::object col_data = data[py::str(col)]; // Use dictionary-style access + block->push_back(col_data.attr("__getitem__")(py::slice(cursor, cursor + count, 1))); + } + + if (!block->empty()) + cursor += py::len((*block)[0]); // Update cursor based on the length of the 
first column slice + + return std::move(block); +} + + + +Chunk PythonSource::scanDataToChunk() +{ + auto names = description.sample_block.getNames(); + if (names.empty()) + return {}; + + // 1. Try to get the column data from the data source by column name with GIL + // 2. Get the raw data from the array to bypass GIL + // 3. Insert the raw data into the column with given cursor and count + // a. If the column is a string column, convert it to UTF-8 + // b. If the column is a numeric column, directly insert the raw data + Columns columns(description.sample_block.columns()); + if (names.size() != columns.size()) + throw Exception(ErrorCodes::PY_EXCEPTION_OCCURED, "Column cache size mismatch"); + + auto rows_per_stream = data_source_row_count / num_streams; + auto start = stream_index * rows_per_stream; + auto end = (stream_index + 1) * rows_per_stream; + if (stream_index == num_streams - 1) + end = data_source_row_count; + if (cursor == 0) + cursor = start; + auto count = std::min(max_block_size, end - cursor); + if (count == 0) + return {}; + LOG_DEBUG(logger, "Stream index {} Reading {} rows from {}", stream_index, count, cursor); + + for (size_t i = 0; i < columns.size(); ++i) + { + const auto & col = (*column_cache)[i]; + const auto & type = description.sample_block.getByPosition(i).type; + + WhichDataType which(type); + try + { + // Dispatch to the appropriate conversion function based on data type + if (which.isUInt8()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isUInt16()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isUInt32()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isUInt64()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isUInt128()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isUInt256()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isInt8()) + 
columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isInt16()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isInt32()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isInt64()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isInt128()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isInt256()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isFloat32()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isFloat64()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isDecimal128()) + { + const auto & dtype = typeid_cast *>(type.get()); + columns[i] = convert_and_insert_array(col, cursor, count, dtype->getScale()); + } + else if (which.isDecimal256()) + { + const auto & dtype = typeid_cast *>(type.get()); + columns[i] = convert_and_insert_array(col, cursor, count, dtype->getScale()); + } + else if (which.isDateTime()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isDateTime64()) + columns[i] = convert_and_insert_array(col, cursor, count); + else if (which.isString()) + columns[i] = convert_and_insert_array(col, cursor, count); + else + throw Exception(ErrorCodes::BAD_TYPE_OF_FIELD, "Unsupported type {} for column {}", type->getName(), col.name); + + if (logger->debug()) + { + // log first 10 rows of the column + std::stringstream ss; + LOG_DEBUG(logger, "Column {} structure: {}", col.name, columns[i]->dumpStructure()); + for (size_t j = 0; j < std::min(count, static_cast(10)); ++j) + { + Field value; + columns[i]->get(j, value); + ss << toString(value) << ", "; + } + LOG_DEBUG(logger, "Column {} data: {}", col.name, ss.str()); + } + } + catch (const Exception & e) + { + LOG_ERROR(logger, "Error processing column {}: {}", i, e.what()); + throw; + } + } + cursor += count; + + return 
Chunk(std::move(columns), count); +} + + +Chunk PythonSource::generate() +{ + size_t num_rows = 0; + auto names = description.sample_block.getNames(); + if (names.empty()) + return {}; + + if (isInheritsFromPyReader(data_source)) + { + PyObjectVecPtr data; + py::gil_scoped_acquire acquire; + data = std::move(castToSharedPtrVector(data_source.attr("read")(names, max_block_size))); + if (data->empty()) + return {}; + + return std::move(genChunk(num_rows, data)); + } + else + { + return std::move(scanDataToChunk()); + } +} +} diff --git a/src/Processors/Sources/PythonSource.h b/src/Processors/Sources/PythonSource.h new file mode 100644 index 00000000000..5fe1b12f817 --- /dev/null +++ b/src/Processors/Sources/PythonSource.h @@ -0,0 +1,77 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace py = pybind11; + +class PyReader; + + +class PythonSource : public ISource +{ +public: + PythonSource( + py::object & data_source_, + const Block & sample_block_, + PyColumnVecPtr column_cache, + size_t data_source_row_count, + size_t max_block_size_, + size_t stream_index, + size_t num_streams); + + ~PythonSource() override = default; + + String getName() const override { return "Python"; } + Chunk genChunk(size_t & num_rows, PyObjectVecPtr data); + Chunk generate() override; + + +private: + py::object & data_source; // Do not own the reference + + Block sample_block; + PyColumnVecPtr column_cache; + size_t data_source_row_count; + const UInt64 max_block_size; + // Caller will only pass stream index and total stream count + // to the constructor, we need to calculate the start offset and end offset. 
+ const size_t stream_index; + const size_t num_streams; + size_t cursor; + + Poco::Logger * logger = &Poco::Logger::get("TableFunctionPython"); + ExternalResultDescription description; + + PyObjectVecPtr scanData(const py::object & data, const std::vector & col_names, size_t & cursor, size_t count); + template + ColumnPtr convert_and_insert_array(const ColumnWrapper & col_wrap, size_t & cursor, size_t count, UInt32 scale = 0); + template + ColumnPtr convert_and_insert(const py::object & obj, UInt32 scale = 0); + template + void insert_from_ptr(const void * ptr, const MutableColumnPtr & column, size_t offset, size_t row_count); + + void convert_string_array_to_block(PyObject ** buf, const MutableColumnPtr & column, size_t offset, size_t row_count); + + + template + void insert_from_list(const py::list & obj, const MutableColumnPtr & column); + + void insert_string_from_array(py::handle obj, const MutableColumnPtr & column); + + void insert_string_from_array_raw(PyObject ** buf, const MutableColumnPtr & column, size_t offset, size_t row_count); + void prepareColumnCache(Names & names, Columns & columns); + Chunk scanDataToChunk(); + void destory(PyObjectVecPtr & data); +}; +} diff --git a/src/Storages/StoragePython.cpp b/src/Storages/StoragePython.cpp new file mode 100644 index 00000000000..01ed814f9e0 --- /dev/null +++ b/src/Storages/StoragePython.cpp @@ -0,0 +1,344 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int BAD_ARGUMENTS; +extern const int LOGICAL_ERROR; +extern const int BAD_TYPE_OF_FIELD; +} + + +StoragePython::StoragePython( + const StorageID & table_id_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + py::object reader_, + ContextPtr 
context_) + : IStorage(table_id_), data_source(reader_), WithContext(context_->getGlobalContext()) +{ + StorageInMemoryMetadata storage_metadata; + storage_metadata.setColumns(columns_); + storage_metadata.setConstraints(constraints_); + setInMemoryMetadata(storage_metadata); +} + +Pipe StoragePython::read( + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & /*query_info*/, + ContextPtr /*context_*/, + QueryProcessingStage::Enum /*processed_stage*/, + size_t max_block_size, + size_t num_streams) +{ + storage_snapshot->check(column_names); + + Block sample_block = prepareSampleBlock(column_names, storage_snapshot); + + // num_streams = 3; // for chdb testing + + prepareColumnCache(column_names, sample_block.getColumns(), sample_block); + + if (isInheritsFromPyReader(data_source)) + return Pipe(std::make_shared(data_source, sample_block, column_cache, data_source_row_count, max_block_size, 0, 1)); + + Pipes pipes; + for (size_t stream = 0; stream < num_streams; ++stream) + pipes.emplace_back(std::make_shared( + data_source, sample_block, column_cache, data_source_row_count, max_block_size, stream, num_streams)); + return Pipe::unitePipes(std::move(pipes)); +} + +Block StoragePython::prepareSampleBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) +{ + Block sample_block; + for (const String & column_name : column_names) + { + auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name); + sample_block.insert({column_data.type, column_data.name}); + } + return sample_block; +} + +void StoragePython::prepareColumnCache(const Names & names, const Columns & columns, const Block & sample_block) +{ + // check column cache with GIL holded + py::gil_scoped_acquire acquire; + if (column_cache == nullptr) + { + // fill in the cache + column_cache = std::make_shared(columns.size()); + for (size_t i = 0; i < columns.size(); ++i) + { + const auto & col_name = names[i]; + auto & col = 
(*column_cache)[i]; + col.name = col_name; + try + { + py::object col_data = data_source[py::str(col_name)]; + col.buf = const_cast(tryGetPyArray(col_data, col.data, col.py_type, col.row_count)); + if (col.buf == nullptr) + throw Exception( + ErrorCodes::PY_EXCEPTION_OCCURED, "Convert to array failed for column {} type {}", col_name, col.py_type); + col.dest_type = sample_block.getByPosition(i).type; + data_source_row_count = col.row_count; + } + catch (const Exception & e) + { + LOG_ERROR(logger, "Error processing column {}: {}", col_name, e.what()); + throw; + } + } + } +} + +ColumnsDescription StoragePython::getTableStructureFromData(py::object data_source) +{ + if (!data_source) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Python reader not initialized"); + py::gil_scoped_acquire acquire; + std::vector> schema; + if (isInheritsFromPyReader(data_source)) + schema = data_source.attr("get_schema")().cast>>(); + else + schema = PyReader::getSchemaFromPyObj(data_source); + + auto * logger = &Poco::Logger::get("StoragePython"); + if (logger->debug()) + { + LOG_DEBUG(logger, "Schema content:"); + for (const auto & item : schema) + LOG_DEBUG(logger, "Column: {}, Type: {}", String(item.first), String(item.second)); + } + + NamesAndTypesList names_and_types; + + // Define regular expressions for different data types + RE2 pattern_int(R"(\bint(\d+))"); + RE2 pattern_generic_int(R"(\bint\b|)"); // Matches generic 'int' + RE2 pattern_uint(R"(\buint(\d+))"); + RE2 pattern_float(R"(\b(float|double)(\d+))"); + RE2 pattern_decimal128(R"(decimal128\((\d+),\s*(\d+)\))"); + RE2 pattern_decimal256(R"(decimal256\((\d+),\s*(\d+)\))"); + RE2 pattern_date32(R"(\bdate32\b)"); + RE2 pattern_date64(R"(\bdate64\b)"); + RE2 pattern_time32(R"(\btime32\b)"); + RE2 pattern_time64_us(R"(\btime64\[us\]\b)"); + RE2 pattern_time64_ns(R"(\btime64\[ns\]\b||str|DataType\(string\)|DataType\(binary\)|binary\[pyarrow\]|dtype\[object_\]|dtype\('S|dtype\('O'\))"); + + // Iterate through each pair of name 
and type string in the schema + for (const auto & [name, typeStr] : schema) + { + std::shared_ptr data_type; + + std::string bits, precision, scale; + if (RE2::PartialMatch(typeStr, pattern_int, &bits)) + { + if (bits == "8") + data_type = std::make_shared(); + else if (bits == "16") + data_type = std::make_shared(); + else if (bits == "32") + data_type = std::make_shared(); + else if (bits == "64") + data_type = std::make_shared(); + else if (bits == "128") + data_type = std::make_shared(); + else if (bits == "256") + data_type = std::make_shared(); + } + else if (RE2::PartialMatch(typeStr, pattern_uint, &bits)) + { + if (bits == "8") + data_type = std::make_shared(); + else if (bits == "16") + data_type = std::make_shared(); + else if (bits == "32") + data_type = std::make_shared(); + else if (bits == "64") + data_type = std::make_shared(); + else if (bits == "128") + data_type = std::make_shared(); + else if (bits == "256") + data_type = std::make_shared(); + } + else if (RE2::PartialMatch(typeStr, pattern_generic_int)) + { + data_type = std::make_shared(); // Default to 64-bit integers for generic 'int' + } + else if (RE2::PartialMatch(typeStr, pattern_float, &bits)) + { + if (bits == "32") + data_type = std::make_shared(); + else if (bits == "64") + data_type = std::make_shared(); + } + else if (RE2::PartialMatch(typeStr, pattern_decimal128, &precision, &scale)) + { + data_type = std::make_shared(std::stoi(precision), std::stoi(scale)); + } + else if (RE2::PartialMatch(typeStr, pattern_decimal256, &precision, &scale)) + { + data_type = std::make_shared(std::stoi(precision), std::stoi(scale)); + } + else if (RE2::PartialMatch(typeStr, pattern_date32)) + { + data_type = std::make_shared(); + } + else if (RE2::PartialMatch(typeStr, pattern_date64)) + { + data_type = std::make_shared(3); // date64 corresponds to DateTime64(3) + } + else if (RE2::PartialMatch(typeStr, pattern_time32)) + { + data_type = std::make_shared(); + } + else if (RE2::PartialMatch(typeStr, 
pattern_time64_us)) + { + data_type = std::make_shared(6); // time64[us] corresponds to DateTime64(6) + } + else if (RE2::PartialMatch(typeStr, pattern_time64_ns)) + { + data_type = std::make_shared(9); // time64[ns] corresponds to DateTime64(9) + } + else if (RE2::PartialMatch(typeStr, pattern_string_binary)) + { + data_type = std::make_shared(); + } + else + { + throw Exception(ErrorCodes::TYPE_MISMATCH, "Unrecognized data type: {}", typeStr); + } + + names_and_types.push_back({name, data_type}); + } + + return ColumnsDescription(names_and_types); +} + +std::vector> PyReader::getSchemaFromPyObj(const py::object data) +{ + std::vector> schema; + if (!py::hasattr(data, "__class__")) + { + throw Exception( + ErrorCodes::UNKNOWN_FORMAT, "Unknown data type for schema inference. Consider inheriting PyReader and overriding getSchema()."); + } + + auto type_name = data.attr("__class__").attr("__name__").cast(); + + if (py::isinstance(data)) + { + // If the data is a Python dictionary + for (auto item : data.cast()) + { + std::string key = py::str(item.first).cast(); + py::list values = py::cast(item.second); + std::string dtype = py::str(values[0].attr("__class__").attr("__name__")).cast(); + if (!values.empty()) + schema.emplace_back(key, dtype); + } + return schema; + } + + if (py::hasattr(data, "dtypes")) + { + // If the data is a Pandas DataFrame + py::object dtypes = data.attr("dtypes"); + py::list columns = data.attr("columns"); + for (size_t i = 0; i < py::len(columns); ++i) + { + std::string name = py::str(columns[i]).cast(); + std::string dtype = py::str(py::repr(dtypes[columns[i]])).cast(); + schema.emplace_back(name, dtype); + } + return schema; + } + + if (py::hasattr(data, "schema")) + { + // If the data is a Pyarrow Table + py::object tbl_schema = data.attr("schema"); + auto names = tbl_schema.attr("names").cast(); + auto types = tbl_schema.attr("types").cast(); + for (size_t i = 0; i < py::len(names); ++i) + { + std::string name = py::str(names[i]).cast(); 
+ std::string dtype = py::str(types[i]).cast(); + schema.emplace_back(name, dtype); + } + return schema; + } + + if (type_name == "recarray") + { + // if it's numpy.recarray + py::object dtype = data.attr("dtype"); + py::list fields = dtype.attr("fields"); + py::dict fields_dict = fields.cast(); + // fields_dict looks like: + // {'TIME': (dtype('int64'), 0), + // 'FX' : (dtype('int64'), 8), + // 'FY' : (dtype('int64'), 16), + // 'FZ' : (dtype('S68'), 24)} + for (auto field : fields_dict) + { + std::string name = field.first.cast(); + std::string dtype_str = py::str(field.second).cast(); + schema.emplace_back(name, dtype_str); + } + return schema; + } + + throw Exception( + ErrorCodes::UNKNOWN_FORMAT, + "Unknown data type {} for schema inference. Consider inheriting PyReader and overriding getSchema().", + py::str(data.attr("__class__")).cast()); +} + +void registerStoragePython(StorageFactory & factory) +{ + factory.registerStorage( + "Python", + [](const StorageFactory::Arguments & args) -> StoragePtr + { + if (args.engine_args.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Python engine requires 1 argument: PyReader object"); + + py::object reader = std::any_cast(args.engine_args[0]); + return std::make_shared(args.table_id, args.columns, args.constraints, reader, args.getLocalContext()); + }, + {.supports_settings = true, .supports_parallel_insert = false}); +} +} diff --git a/src/Storages/StoragePython.h b/src/Storages/StoragePython.h new file mode 100644 index 00000000000..219171fddd1 --- /dev/null +++ b/src/Storages/StoragePython.h @@ -0,0 +1,183 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace py = pybind11; + +namespace ErrorCodes +{ +extern const int UNKNOWN_FORMAT; +extern const int NOT_IMPLEMENTED; +extern const int PY_EXCEPTION_OCCURED; +} +class PyReader +{ +public: + 
explicit PyReader(const py::object & data) : data(data) { } + ~PyReader() + { + py::gil_scoped_acquire acquire; + if (data.is_none()) + return; + data.release(); + } + + // Read `count` rows from the data, and return a list of columns + // chdb todo: maybe return py::list is better, but this is just a shallow copy + std::vector read(const std::vector & /*col_names*/, int /*count*/) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "read() method is not implemented"); + } + + // // readDataPtr is similar to readData but return pointer to py::object + // static std::shared_ptr>> + // readDataPtr(const py::object & data, const std::vector & col_names, size_t & cursor, size_t count) + // { + // py::gil_scoped_acquire acquire; + // auto block = std::make_shared>>(); + // // Access columns directly by name and slice + // for (const auto & col : col_names) + // { + // py::object col_data = data[py::str(col)]; // Use dictionary-style access + // auto col_block = std::make_shared>(); + // for (size_t i = cursor; i < cursor + count; i++) + // col_block->push_back(col_data.attr("__getitem__")(i).ptr()); + // block->push_back(col_block); + // } + + // if (!block->empty()) + // cursor += py::len((*block)[0]); // Update cursor based on the length of the first column slice + + // return block; + // } + + // Return a vector of column names and their types, as a list of pairs. + // The order is important, and should match the order of the data. + // This is the default implementation, which trys to infer the schema from the every first row + // of this.data column. + // The logic is: + // 1. If the data is a map with column names as keys and column data as values, then we use + // the key and type of every first element in the value list. + // eg: + // d = {'a': [1, 2, 3], 'b': ['x', 'y', 'z'], 'c': [1.0, 1e10, 1.2e100]} + // schema = {name: repr(type(value[0])) for name, value in d.items()} + // out: + // schema = {'a': "", 'b': "", 'c': ""} + // 2. 
If the data is a Pandas DataFrame, then we use the column names and dtypes. + // We use the repr of the dtype, which is a string representation of the dtype. + // eg: + // df = pd.DataFrame(d) + // schema = {name: repr(dtype) for name, dtype in zip(df.columns, df.dtypes)} + // out: + // schema = {'a': "dtype('int64')", 'b': "dtype('O')", 'c': "dtype('float64')"} + // Note: + // 1. dtype('O') means object type, which is a catch-all for any types. we just treat it as string. + // 2. the dtype of a Pandas DataFrame is a numpy.dtype object, which is not a Python type object. + // + // When using Pandas >= 2.0, we can use the pyarrow as dtype_backend: + // eg: + // df_arr = pd.read_json('{"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [1.0, 1.111, 2.222]}', dtype_backend="pyarrow") + // schema = {name: repr(dtype) for name, dtype in zip(df_arr.columns, df_arr.dtypes)} + // out: + // schema = {'a': 'int64[pyarrow]', 'b': 'string[pyarrow]', 'c': 'double[pyarrow]'} + // 3. if the data is a Pyarrow Table, then we use the column names and types. + // eg: + // tbl = pa.Table.from_pandas(df) + // schema = {field.name: repr(field.type) for field in tbl.schema} + // out: + // schema = {'a': 'DataType(int64)', 'b': 'DataType(string)', 'c': 'DataType(double)'} + // 4. User can override this function to provide a more accurate schema. 
+ // eg: "DataTypeUInt8", "DataTypeUInt16", "DataTypeUInt32", "DataTypeUInt64", "DataTypeUInt128", "DataTypeUInt256", + // "DataTypeInt8", "DataTypeInt16", "DataTypeInt32", "DataTypeInt64", "DataTypeInt128", "DataTypeInt256", + // "DataTypeFloat32", "DataTypeFloat64", "DataTypeString", + + static std::vector> getSchemaFromPyObj(py::object data); + + std::vector> getSchema() { return getSchemaFromPyObj(data); } + +protected: + py::object data; +}; + +// // Trampoline class +// // see: https://pybind11.readthedocs.io/en/stable/advanced/classes.html#trampolines +// class PyReaderTrampoline : public PyReader +// { +// public: +// using PyReader::PyReader; // Inherit constructors + +// // Just forward the virtual function call to Python +// std::vector read(const std::vector & col_names, int count) override +// { +// PYBIND11_OVERRIDE_PURE( +// std::vector, // Return type List[object] +// PyReader, // Parent class +// read, // Name of the function in C++ (must match Python name) +// col_names, // Argument(s) +// count); +// } +// }; + +class StoragePython : public IStorage, public WithContext +{ +public: + StoragePython( + const StorageID & table_id_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + py::object reader_, + ContextPtr context_); + + ~StoragePython() override + { + // Destroy the reader with the GIL + py::gil_scoped_acquire acquire; + data_source.dec_ref(); + data_source.release(); + } + + std::string getName() const override { return "Python"; } + + Pipe read( + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & query_info, + ContextPtr context_, + QueryProcessingStage::Enum processed_stage, + size_t max_block_size, + size_t num_streams) override; + + Block prepareSampleBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot); + + static ColumnsDescription getTableStructureFromData(py::object data_source); + +private: + void prepareColumnCache(const 
Names & names, const Columns & columns, const Block & sample_block); + py::object data_source; + PyColumnVecPtr column_cache; + size_t data_source_row_count; + Poco::Logger * logger = &Poco::Logger::get("StoragePython"); +}; + +void registerStoragePython(StorageFactory & factory); + + +} diff --git a/src/Storages/registerStorages.cpp b/src/Storages/registerStorages.cpp index b971e9b623a..453bf4bab53 100644 --- a/src/Storages/registerStorages.cpp +++ b/src/Storages/registerStorages.cpp @@ -25,6 +25,8 @@ void registerStorageLiveView(StorageFactory & factory); void registerStorageGenerateRandom(StorageFactory & factory); void registerStorageExecutable(StorageFactory & factory); void registerStorageWindowView(StorageFactory & factory); +//chdb todo: add a #if USE_PYTHON here +void registerStoragePython(StorageFactory & factory); #if USE_AWS_S3 void registerStorageS3(StorageFactory & factory); @@ -123,8 +125,9 @@ void registerStorages() registerStorageGenerateRandom(factory); registerStorageExecutable(factory); registerStorageWindowView(factory); + registerStoragePython(factory); - #if USE_AWS_S3 +#if USE_AWS_S3 registerStorageS3(factory); registerStorageCOS(factory); registerStorageOSS(factory); diff --git a/src/TableFunctions/CMakeLists.txt b/src/TableFunctions/CMakeLists.txt index b02a0e79f9c..37ce96a8244 100644 --- a/src/TableFunctions/CMakeLists.txt +++ b/src/TableFunctions/CMakeLists.txt @@ -17,6 +17,41 @@ extract_into_parent_list(clickhouse_table_functions_headers dbms_headers TableFunctionFactory.h ) +# Include path from shell cmd "python3 -m pybind11 --includes" +execute_process(COMMAND python3 -m pybind11 --includes + OUTPUT_VARIABLE PYBIND11_INCLUDES + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Extract and set include directories specifically for source using pybind11 +string(REGEX MATCHALL "-I([^ ]+)" INCLUDE_DIRS_MATCHES ${PYBIND11_INCLUDES}) +set(PYTHON_INCLUDE_DIRS "") +foreach(INCLUDE_DIR_MATCH ${INCLUDE_DIRS_MATCHES}) + string(REGEX REPLACE "-I" "" 
INCLUDE_DIR_MATCH ${INCLUDE_DIR_MATCH}) + # Accumulate all include directories + set(PYTHON_INCLUDE_DIRS "${PYTHON_INCLUDE_DIRS};${INCLUDE_DIR_MATCH}") +endforeach() + +# Add include directories for pybind11 +set_source_files_properties(TableFunctionPython.cpp PROPERTIES INCLUDE_DIRECTORIES "${PYTHON_INCLUDE_DIRS}") + +# remove all warning, because pybind11 will generate a lot of warning +if (OS_LINUX) + if (PYTHON_VERSION STREQUAL "python3.6" OR PYTHON_VERSION STREQUAL "python3.7" OR PYTHON_VERSION STREQUAL "python3.8") + set_source_files_properties(TableFunctionPython.cpp PROPERTIES COMPILE_FLAGS + "-w -idirafter /usr/include -include crypt.h" + ) + else() + set_source_files_properties(TableFunctionPython.cpp PROPERTIES COMPILE_FLAGS + "-w" + ) + endif() +elseif (OS_DARWIN) + set_source_files_properties(TableFunctionPython.cpp PROPERTIES COMPILE_FLAGS + "-w" + ) +endif() + add_library(clickhouse_table_functions ${clickhouse_table_functions_headers} ${clickhouse_table_functions_sources}) target_link_libraries(clickhouse_table_functions PRIVATE clickhouse_parsers clickhouse_storages_system dbms) diff --git a/src/TableFunctions/TableFunctionPython.cpp b/src/TableFunctions/TableFunctionPython.cpp new file mode 100644 index 00000000000..5a7ef58078a --- /dev/null +++ b/src/TableFunctions/TableFunctionPython.cpp @@ -0,0 +1,139 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +extern const int PY_OBJECT_NOT_FOUND; +extern const int PY_EXCEPTION_OCCURED; +} + +// Function to find instance of PyReader, pandas DataFrame, or PyArrow Table, filtered by variable name +py::object find_instances_of_pyreader(const std::string & var_name) +{ + py::module inspect = py::module_::import("inspect"); + py::object current_frame = inspect.attr("currentframe")(); + + while 
(!current_frame.is_none()) + { + auto local_dict = py::reinterpret_borrow(current_frame.attr("f_locals")); + auto global_dict = py::reinterpret_borrow(current_frame.attr("f_globals")); + + for (const auto & dict : {local_dict, global_dict}) + { + if (dict.contains(var_name)) + { + py::object obj = dict[var_name.data()]; + if (isInheritsFromPyReader(obj) || isPandasDf(obj) || isPyarrowTable(obj)) + return obj; + } + } + + current_frame = current_frame.attr("f_back"); + } + + // not found + return py::none(); +} + +void TableFunctionPython::parseArguments(const ASTPtr & ast_function, ContextPtr context) +{ + py::gil_scoped_acquire acquire; + const auto & func_args = ast_function->as(); + + if (!func_args.arguments) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function 'python' must have arguments."); + + ASTs & args = func_args.arguments->children; + + if (args.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Python table requires 1 argument: PyReader object"); + + auto py_reader_arg = evaluateConstantExpressionOrIdentifierAsLiteral(args[0], context); + + try + { + // get the py_reader_arg without quotes + auto py_reader_arg_str = py_reader_arg->as().value.safeGet(); + LOG_DEBUG(logger, "Python object name: {}", py_reader_arg_str); + + // strip all quotes like '"` if any. eg. 
'PyReader' -> PyReader, "PyReader" -> PyReader + py_reader_arg_str.erase( + std::remove_if(py_reader_arg_str.begin(), py_reader_arg_str.end(), [](char c) { return c == '\'' || c == '\"' || c == '`'; }), + py_reader_arg_str.end()); + + auto instance = find_instances_of_pyreader(py_reader_arg_str); + if (instance.is_none()) + throw Exception( + ErrorCodes::PY_OBJECT_NOT_FOUND, + "Python object not found in the Python environment\n" + "Ensure that the object is type of PyReader, pandas DataFrame, or PyArrow Table and is in the global or local scope"); + + LOG_DEBUG( + logger, + "Python object found in Python environment with name: {} type: {}", + py_reader_arg_str, + py::str(instance.attr("__class__")).cast()); + + reader = instance; + } + catch (py::error_already_set & e) + { + throw Exception(ErrorCodes::PY_EXCEPTION_OCCURED, e.what()); + } +} + +StoragePtr TableFunctionPython::executeImpl( + const ASTPtr & /*ast_function*/, + ContextPtr context, + const String & table_name, + ColumnsDescription /*cached_columns*/, + bool is_insert_query) const +{ + if (!reader) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Python data source not initialized"); + + auto columns = getActualTableStructure(context, is_insert_query); + + auto storage + = std::make_shared(StorageID(getDatabaseName(), table_name), columns, ConstraintsDescription{}, reader, context); + storage->startup(); + return storage; +} + +ColumnsDescription TableFunctionPython::getActualTableStructure(ContextPtr /*context*/, bool /*is_insert_query*/) const +{ + return StoragePython::getTableStructureFromData(reader); +} + +void registerTableFunctionPython(TableFunctionFactory & factory) +{ + factory.registerFunction( + {.documentation + = {.description = R"( +Passing Pandas DataFrame or Pyarrow Table to ClickHouse engine. +For any other data structure, you can also create a table interface to a Python data source and reads data +from a PyReader object. 
+This table function requires a single argument which is a PyReader object used to read data from Python. +)", + .examples = {{"1", "SELECT * FROM Python(PyReader)", ""}}}}, + TableFunctionFactory::CaseInsensitive); +} + +} diff --git a/src/TableFunctions/TableFunctionPython.h b/src/TableFunctions/TableFunctionPython.h new file mode 100644 index 00000000000..6297a1dd2ed --- /dev/null +++ b/src/TableFunctions/TableFunctionPython.h @@ -0,0 +1,41 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace DB +{ + /** Table function whose getName() is "python" and whose storage type name is "Python". It owns a py::object `reader` that serves as its Python data source; parseArguments(), executeImpl() and getActualTableStructure() are only declared here. */ +class TableFunctionPython : public ITableFunction +{ +public: + static constexpr auto name = "python"; + std::string getName() const override { return name; } + ~TableFunctionPython() override + { + // Acquire the GIL before destroying the reader object + py::gil_scoped_acquire acquire; + reader.dec_ref(); + reader.release(); /* dec_ref() drops this object's reference while the GIL guard above is alive; release() then clears the handle. NOTE(review): presumably this keeps py::object's own destructor, which runs after this body, from touching the refcount without the GIL - confirm against pybind11. */ + } + +private: + Poco::Logger * logger = &Poco::Logger::get("TableFunctionPython"); + StoragePtr executeImpl( + const ASTPtr & ast_function, + ContextPtr context, + const std::string & table_name, + ColumnsDescription cached_columns, + bool is_insert_query) const override; + const char * getStorageTypeName() const override { return "Python"; } + + void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; + + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; + py::object reader; /* Python object used as the data source; its reference is dropped explicitly in the destructor. */ +}; + +} diff --git a/src/TableFunctions/registerTableFunctions.cpp b/src/TableFunctions/registerTableFunctions.cpp index de29c8074b1..966020f6408 100644 --- a/src/TableFunctions/registerTableFunctions.cpp +++ b/src/TableFunctions/registerTableFunctions.cpp @@ -22,6 +22,8 @@ void registerTableFunctions() registerTableFunctionGenerate(factory); registerTableFunctionMongoDB(factory); registerTableFunctionRedis(factory); + //chdb todo: add a #if USE_PYTHON here + registerTableFunctionPython(factory); #if USE_AWS_S3 registerTableFunctionS3(factory); diff --git
a/src/TableFunctions/registerTableFunctions.h b/src/TableFunctions/registerTableFunctions.h index 4e39324aba6..106c2062242 100644 --- a/src/TableFunctions/registerTableFunctions.h +++ b/src/TableFunctions/registerTableFunctions.h @@ -19,6 +19,8 @@ void registerTableFunctionInput(TableFunctionFactory & factory); void registerTableFunctionGenerate(TableFunctionFactory & factory); void registerTableFunctionMongoDB(TableFunctionFactory & factory); void registerTableFunctionRedis(TableFunctionFactory & factory); +//chdb todo: add a #if USE_PYTHON here +void registerTableFunctionPython(TableFunctionFactory & factory); #if USE_AWS_S3 void registerTableFunctionS3(TableFunctionFactory & factory); diff --git a/tests/pd_zerocopy.ipynb b/tests/pd_zerocopy.ipynb new file mode 100644 index 00000000000..97d518cb4e7 --- /dev/null +++ b/tests/pd_zerocopy.ipynb @@ -0,0 +1,1409 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Read parquet file into memory. Time cost: 1.2989869117736816 s\n", + "Parquet file size: 1395695970 bytes\n", + "Read parquet file as old pandas dataframe. 
Time cost: 13.909124374389648 s\n", + "Dataframe(numpy) size: 4700000128 bytes\n" + ] + } + ], + "source": [ + "#!python3\n", + "\n", + "import os\n", + "import time\n", + "import chdb\n", + "import chdb.dataframe as cdf\n", + "import pandas as pd\n", + "import numpy as np\n", + "import pyarrow as pa\n", + "import pyarrow.parquet as pq\n", + "import duckdb\n", + "\n", + "# from pyarrow.interchange import from_dataframe\n", + "from utils import current_dir\n", + "\n", + "# # if hits_0.parquet is not available, download it:\n", + "# # https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_0.parquet\n", + "# if not os.path.exists(os.path.join(current_dir, \"hits_0.parquet\")):\n", + "# opener = urllib.request.URLopener()\n", + "# opener.addheader(\"User-Agent\", \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36\")\n", + "# opener.retrieve(\"https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_0.parquet\",\n", + "# os.path.join(current_dir, \"hits_0.parquet\"))\n", + "\n", + "# 122MB parquet file\n", + "# hits_0 = os.path.join(current_dir, \"hits_0.parquet\")\n", + "\n", + "# 14GB parquet file\n", + "# hits_0 = os.path.join(current_dir, \"hits.parquet\")\n", + "\n", + "# 6.3GB parquet file\n", + "# hits_0 = os.path.join(current_dir, \"hits_50m.parquet\")\n", + "\n", + "# 1.3G parquet file\n", + "hits_0 = os.path.join(current_dir, \"hits1.parquet\")\n", + "\n", + "sql = \"\"\"SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID)\n", + " FROM __table__ GROUP BY RegionID ORDER BY c DESC LIMIT 10\"\"\"\n", + "\n", + "\n", + "t = time.time()\n", + "# read parquet file into memory\n", + "with open(hits_0, \"rb\") as f:\n", + " data = f.read()\n", + "print(\"Read parquet file into memory. 
Time cost:\", time.time() - t, \"s\")\n", + "print(\"Parquet file size:\", len(data), \"bytes\")\n", + "del data\n", + "\n", + "# read parquet file as old pandas dataframe\n", + "t = time.time()\n", + "hits = pd.read_parquet(hits_0)\n", + "print(\"Read parquet file as old pandas dataframe. Time cost:\", time.time() - t, \"s\")\n", + "print(\"Dataframe(numpy) size:\", hits.memory_usage().sum(), \"bytes\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 1373850796\n", + "1 1373894390\n", + "2 1373894393\n", + "3 1373894395\n", + "4 1373894426\n", + "5 1373894428\n", + "6 1373894431\n", + "7 1373839520\n", + "8 1373839671\n", + "9 1373839673\n", + "Name: EventTime, dtype: int64\n", + "0 2013-07-15 01:13:16\n", + "1 2013-07-15 13:19:50\n", + "2 2013-07-15 13:19:53\n", + "3 2013-07-15 13:19:55\n", + "4 2013-07-15 13:20:26\n", + "5 2013-07-15 13:20:28\n", + "6 2013-07-15 13:20:31\n", + "7 2013-07-14 22:05:20\n", + "8 2013-07-14 22:07:51\n", + "9 2013-07-14 22:07:53\n", + "Name: EventTime, dtype: datetime64[ns]\n", + "0 2013-07-15\n", + "1 2013-07-15\n", + "2 2013-07-15\n", + "3 2013-07-15\n", + "4 2013-07-15\n", + "5 2013-07-15\n", + "6 2013-07-15\n", + "7 2013-07-15\n", + "8 2013-07-15\n", + "9 2013-07-15\n", + "Name: EventDate, dtype: datetime64[ns]\n" + ] + }, + { + "data": { + "text/plain": [ + "WatchID int64\n", + "JavaEnable int16\n", + "Title object\n", + "GoodEvent int16\n", + "EventTime datetime64[ns]\n", + " ... 
\n", + "FromTag object\n", + "HasGCLID int16\n", + "RefererHash int64\n", + "URLHash int64\n", + "CLID int32\n", + "Length: 105, dtype: object" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# fix some types\n", + "print(hits[\"EventTime\"][0:10])\n", + "hits[\"EventTime\"] = pd.to_datetime(hits[\"EventTime\"], unit=\"s\")\n", + "print(hits[\"EventTime\"][0:10])\n", + "\n", + "hits[\"EventDate\"] = pd.to_datetime(hits[\"EventDate\"], unit=\"D\")\n", + "print(hits[\"EventDate\"][0:10])\n", + "\n", + "# fix all object columns to string\n", + "# for col in hits.columns:\n", + "# if hits[col].dtype == \"O\":\n", + "# hits[col] = hits[col].astype(str)\n", + "\n", + "hits.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Convert old dataframe to numpy array. Time cost: 8.034706115722656e-05 s\n" + ] + } + ], + "source": [ + "# convert dataframe to numpy array\n", + "t = time.time()\n", + "df_npy = hits[\"RegionID\"].to_numpy()\n", + "print(\"Convert old dataframe to numpy array. 
Time cost:\", time.time() - t, \"s\")\n", + "del df_npy" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "class myReader(chdb.PyReader):\n", + " def __init__(self, data):\n", + " self.data = data\n", + " self.cursor = 0\n", + " super().__init__(data)\n", + "\n", + " def read(self, col_names, count):\n", + " # print(\"read\", col_names, count)\n", + " # get the columns from the data with col_names\n", + " block = [self.data[col] for col in col_names]\n", + " # print(\"columns and rows\", len(block), len(block[0]))\n", + " # get the data from the cursor to cursor + count\n", + " block = [col[self.cursor : self.cursor + count] for col in block]\n", + " # print(\"columns and rows\", len(block), len(block[0]))\n", + " # move the cursor\n", + " self.cursor += block[0].shape[0]\n", + " return block" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "queries = [\n", + "\"SELECT COUNT(*) FROM hits;\",\n", + "\"SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;\",\n", + "\"SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;\",\n", + "\"SELECT AVG(UserID) FROM hits;\",\n", + "\"SELECT COUNT(DISTINCT UserID) FROM hits;\",\n", + "\"SELECT COUNT(DISTINCT SearchPhrase) FROM hits;\",\n", + "\"SELECT MIN(EventDate), MAX(EventDate) FROM hits;\",\n", + "\"SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;\",\n", + "\"SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;\",\n", + "\"SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;\",\n", + "\"SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;\",\n", + "\"SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT 
UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;\",\n", + "\"SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;\",\n", + "\"SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;\",\n", + "\"SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;\",\n", + "\"SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;\",\n", + "\"SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;\",\n", + "\"SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;\",\n", + "\"SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;\",\n", + "\"SELECT UserID FROM hits WHERE UserID = 435090932899640449;\",\n", + "\"SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';\",\n", + "\"SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;\",\n", + "\"SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;\",\n", + "\"SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;\",\n", + "\"SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;\",\n", + "\"SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;\",\n", + "\"SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;\",\n", + "\"SELECT CounterID, 
AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;\",\n", + "r\"SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;\",\n", + "\"SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59),
SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;\",\n", + "\"SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;\",\n", + "\"SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;\",\n", + "\"SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;\",\n", + "\"SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;\",\n", + "\"SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;\",\n", + "\"SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;\",\n", + "\"SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;\",\n", + "\"SELECT Title, COUNT(*) AS PageViews FROM hits 
WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;\",\n", + "\"SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;\",\n", + "\"SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;\",\n", + "\"SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;\",\n", + "\"SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;\",\n", + "\"SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "counter = 0\n", + "def bench(sql):\n", + " global counter\n", + " con = 
duckdb.connect()\n", + " df_reader = myReader(hits)\n", + " duckdb_time = 0\n", + " chdb_time = 0\n", + " print(\"Q\"+str(counter)+\":\", sql)\n", + " t = time.time()\n", + " try:\n", + " ret = con.execute(sql).fetch_df()\n", + " duckdb_time = time.time() - t\n", + " print(\"DuckDB return:\", ret)\n", + " except Exception as e:\n", + " print(\"DuckDB error:\", e)\n", + " # replace 'hits' with 'Python(df_reader)'\n", + " sql = sql.replace(\"hits\", \"Python(df_reader)\")\n", + " sql = sql.replace(\"STRLEN\", \"length\")\n", + " t = time.time()\n", + " try:\n", + " ret = chdb.query(sql, \"Dataframe\")\n", + " chdb_time = time.time() - t\n", + " print(\"chDB return:\", ret)\n", + " except Exception as e:\n", + " print(\"chDB error:\", e)\n", + " counter += 1\n", + " return duckdb_time, chdb_time" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Q0: SELECT COUNT(*) FROM hits;\n", + "DuckDB return: count_star()\n", + "0 10000000\n", + "chDB return: count()\n", + "0 10000000\n", + "Q1: SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;\n", + "DuckDB return: count_star()\n", + "0 257266\n", + "chDB return: count()\n", + "0 257266\n", + "Q2: SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;\n", + "DuckDB return: sum(AdvEngineID) count_star() avg(ResolutionWidth)\n", + "0 5276263.0 10000000 1506.781497\n", + "chDB return: sum(AdvEngineID) count() avg(ResolutionWidth)\n", + "0 5276263 10000000 1506.781497\n", + "Q3: SELECT AVG(UserID) FROM hits;\n", + "DuckDB return: avg(UserID)\n", + "0 2.302915e+18\n", + "chDB return: avg(UserID)\n", + "0 -1.522547e+11\n", + "Q4: SELECT COUNT(DISTINCT UserID) FROM hits;\n", + "DuckDB return: count(DISTINCT UserID)\n", + "0 1620177\n", + "chDB return: uniqExact(UserID)\n", + "0 1620177\n", + "Q5: SELECT COUNT(DISTINCT SearchPhrase) FROM hits;\n", + "DuckDB return: count(DISTINCT SearchPhrase)\n", + "0 873731\n", + 
"chDB return: uniqExact(SearchPhrase)\n", + "0 873731\n", + "Q6: SELECT MIN(EventDate), MAX(EventDate) FROM hits;\n", + "DuckDB return: min(EventDate) max(EventDate)\n", + "0 2013-07-02 2013-07-31\n", + "chDB return: min(EventDate) max(EventDate)\n", + "0 2013-07-02 08:00:00+08:00 2013-07-31 08:00:00+08:00\n", + "Q7: SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;\n", + "DuckDB return: AdvEngineID count_star()\n", + "0 27 107474\n", + "1 2 94688\n", + "2 45 38390\n", + "3 13 8763\n", + "4 44 7479\n", + "5 25 341\n", + "6 50 80\n", + "7 52 34\n", + "8 3 9\n", + "9 28 8\n", + "chDB return: AdvEngineID count()\n", + "0 27 107474\n", + "1 2 94688\n", + "2 45 38390\n", + "3 13 8763\n", + "4 44 7479\n", + "5 25 341\n", + "6 50 80\n", + "7 52 34\n", + "8 3 9\n", + "9 28 8\n", + "Q8: SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;\n", + "DuckDB return: RegionID u\n", + "0 229 289257\n", + "1 2 114971\n", + "2 208 77428\n", + "3 158 41988\n", + "4 169 37128\n", + "5 34 33622\n", + "6 55 28894\n", + "7 107 26996\n", + "8 42 26944\n", + "9 32 26577\n", + "chDB return: RegionID u\n", + "0 229 289257\n", + "1 2 114971\n", + "2 208 77428\n", + "3 158 41988\n", + "4 169 37128\n", + "5 34 33622\n", + "6 55 28894\n", + "7 107 26996\n", + "8 42 26944\n", + "9 32 26577\n", + "Q9: SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;\n", + "DuckDB return: RegionID sum(AdvEngineID) c avg(ResolutionWidth) \\\n", + "0 229 1626324.0 2031299 1553.786671 \n", + "1 2 313589.0 877397 1423.540215 \n", + "2 208 193458.0 468731 1357.893244 \n", + "3 32 53121.0 357921 1545.596458 \n", + "4 42 83542.0 206186 1586.465808 \n", + "5 55 74805.0 194788 1420.300629 \n", + "6 158 25099.0 182178 947.637969 \n", + "7 34 95038.0 175820 1568.273206 \n", + "8 226 47675.0 145891 1586.239096 \n", + "9 36 
53042.0 141420 1588.640758 \n", + "\n", + " count(DISTINCT UserID) \n", + "0 289257 \n", + "1 114971 \n", + "2 77428 \n", + "3 26577 \n", + "4 26944 \n", + "5 28894 \n", + "6 41988 \n", + "7 33622 \n", + "8 17202 \n", + "9 20111 \n", + "chDB return: RegionID sum(AdvEngineID) c avg(ResolutionWidth) \\\n", + "0 229 1626324 2031299 1553.786671 \n", + "1 2 313589 877397 1423.540215 \n", + "2 208 193458 468731 1357.893244 \n", + "3 32 53121 357921 1545.596458 \n", + "4 42 83542 206186 1586.465808 \n", + "5 55 74805 194788 1420.300629 \n", + "6 158 25099 182178 947.637969 \n", + "7 34 95038 175820 1568.273206 \n", + "8 226 47675 145891 1586.239096 \n", + "9 36 53042 141420 1588.640758 \n", + "\n", + " uniqExact(UserID) \n", + "0 289257 \n", + "1 114971 \n", + "2 77428 \n", + "3 26577 \n", + "4 26944 \n", + "5 28894 \n", + "6 41988 \n", + "7 33622 \n", + "8 17202 \n", + "9 20111 \n", + "Q10: SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;\n", + "DuckDB return: MobilePhoneModel u\n", + "0 [105, 80, 97, 100] 80774\n", + "1 [105, 80, 104, 111, 110, 101] 3568\n", + "2 [65, 53, 48, 48] 1396\n", + "3 [78, 56, 45, 48, 48] 446\n", + "4 [79, 78, 69, 32, 84, 79, 85, 67, 72, 32, 54, 4... 
273\n", + "5 [105, 80, 104, 111] 196\n", + "6 [51, 49, 49, 48, 48, 48, 48] 144\n", + "7 [71, 84, 45, 80, 55, 51, 48, 48, 66] 139\n", + "8 [71, 84, 45, 73, 57, 53, 48, 48] 131\n", + "9 [101, 97, 103, 108, 101, 55, 53] 131\n", + "chDB return: MobilePhoneModel u\n", + "0 iPad 80774\n", + "1 iPhone 3568\n", + "2 A500 1396\n", + "3 N8-00 446\n", + "4 ONE TOUCH 6030A 273\n", + "5 iPho 196\n", + "6 3110000 144\n", + "7 GT-P7300B 139\n", + "8 eagle75 131\n", + "9 GT-I9500 131\n", + "Q11: SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;\n", + "DuckDB return: MobilePhone MobilePhoneModel u\n", + "0 1 [105, 80, 97, 100] 68519\n", + "1 5 [105, 80, 97, 100] 3788\n", + "2 6 [105, 80, 97, 100] 2210\n", + "3 7 [105, 80, 97, 100] 1980\n", + "4 118 [65, 53, 48, 48] 1394\n", + "5 26 [105, 80, 104, 111, 110, 101] 1058\n", + "6 6 [105, 80, 104, 111, 110, 101] 1039\n", + "7 10 [105, 80, 97, 100] 965\n", + "8 13 [105, 80, 97, 100] 770\n", + "9 32 [105, 80, 97, 100] 746\n", + "chDB return: MobilePhone MobilePhoneModel u\n", + "0 1 iPad 68519\n", + "1 5 iPad 3788\n", + "2 6 iPad 2210\n", + "3 7 iPad 1980\n", + "4 118 A500 1394\n", + "5 26 iPhone 1058\n", + "6 6 iPhone 1039\n", + "7 10 iPad 965\n", + "8 13 iPad 770\n", + "9 32 iPad 746\n", + "Q12: SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;\n", + "DuckDB return: SearchPhrase c\n", + "0 [208, 178, 208, 181, 208, 180, 208, 190, 208, ... 4947\n", + "1 [209, 129, 208, 188, 208, 190, 209, 130, 209, ... 3338\n", + "2 [209, 129, 208, 188, 208, 190, 209, 130, 209, ... 2553\n", + "3 [208, 178, 208, 181, 208, 180, 208, 190, 208, ... 2473\n", + "4 [208, 178, 208, 181, 208, 180, 208, 190, 208, ... 2032\n", + "5 [208, 178, 208, 181, 208, 180, 208, 190, 208, ... 1686\n", + "6 [208, 187, 209, 142, 208, 186, 209, 129, 32, 5... 
1559\n", + "7 [208, 190, 209, 130, 208, 180, 209, 139, 209, ... 1272\n", + "8 [209, 130, 208, 176, 209, 135, 208, 186, 208, ... 1248\n", + "9 [209, 128, 208, 181, 209, 134, 208, 181, 208, ... 1244\n", + "chDB return: SearchPhrase c\n", + "0 ведомосквы вместу 4947\n", + "1 смотреть онлайн бесплатно 3338\n", + "2 смотреть онлайн 2553\n", + "3 ведомосквы вы из 2473\n", + "4 ведомосквиталия страции 2032\n", + "5 ведомосковский 1686\n", + "6 люкс 20 иномаровск 1559\n", + "7 отдых в кино 1272\n", + "8 тачки рецепт собстве 1248\n", + "9 рецепты сбербан 1244\n", + "Q13: SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;\n", + "DuckDB return: SearchPhrase u\n", + "0 [209, 129, 208, 188, 208, 190, 209, 130, 209, ... 2717\n", + "1 [209, 129, 208, 188, 208, 190, 209, 130, 209, ... 2085\n", + "2 [208, 178, 208, 181, 208, 180, 208, 190, 208, ... 1385\n", + "3 [208, 187, 209, 142, 208, 186, 209, 129, 32, 5... 1190\n", + "4 [209, 129, 208, 188, 208, 190, 209, 130, 209, ... 1031\n", + "5 [208, 181, 208, 177, 209, 131, 209, 130, 209, ... 1007\n", + "6 [208, 181, 208, 177, 209, 131, 209, 130, 209, ... 978\n", + "7 [209, 129, 208, 188, 208, 190, 209, 130, 209, ... 953\n", + "8 [209, 128, 208, 181, 209, 134, 208, 181, 208, ... 909\n", + "9 [209, 132, 45, 49] 894\n", + "chDB return: SearchPhrase u\n", + "0 смотреть онлайн бесплатно 2717\n", + "1 смотреть онлайн 2085\n", + "2 ведомосквы вместу 1385\n", + "3 люкс 20 иномаровск 1190\n", + "4 смотреть 1031\n", + "5 ебутсы арениксандройд полнечный 1007\n", + "6 ебутсы для 978\n", + "7 смотреть онлайн бесплатно в хорошем 953\n", + "8 рецепты сбербан 909\n", + "9 ф-1 894\n", + "Q14: SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;\n", + "DuckDB return: SearchEngineID SearchPhrase c\n", + "0 2 [208, 178, 208, 181, 208, 180, 208, 190, 208, ... 
3480\n", + "1 2 [209, 129, 208, 188, 208, 190, 209, 130, 209, ... 2194\n", + "2 2 [208, 178, 208, 181, 208, 180, 208, 190, 208, ... 1859\n", + "3 2 [208, 178, 208, 181, 208, 180, 208, 190, 208, ... 1682\n", + "4 2 [209, 129, 208, 188, 208, 190, 209, 130, 209, ... 1540\n", + "5 2 [208, 178, 208, 181, 208, 180, 208, 190, 208, ... 1440\n", + "6 95 [208, 190, 209, 130, 208, 180, 209, 139, 209, ... 1261\n", + "7 2 [208, 187, 209, 142, 208, 186, 209, 129, 32, 5... 1257\n", + "8 2 [209, 128, 208, 181, 209, 134, 208, 181, 208, ... 1172\n", + "9 4 [208, 191, 208, 190, 208, 186, 208, 181, 209, ... 959\n", + "chDB return: SearchEngineID SearchPhrase c\n", + "0 2 ведомосквы вместу 3480\n", + "1 2 смотреть онлайн бесплатно 2194\n", + "2 2 ведомосквы вы из 1859\n", + "3 2 ведомосковский 1682\n", + "4 2 смотреть онлайн 1540\n", + "5 2 ведомосквиталия страции 1440\n", + "6 95 отдых в кино 1261\n", + "7 2 люкс 20 иномаровск 1257\n", + "8 2 рецепты сбербан 1172\n", + "9 4 покеты рецепт засня 959\n", + "Q15: SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;\n", + "DuckDB return: UserID count_star()\n", + "0 1313338681122956954 29097\n", + "1 1907779576417363396 16854\n", + "2 2305303682471783379 10588\n", + "3 6103038218306105832 2994\n", + "4 3631826469396741283 2828\n", + "5 6949028786848070043 2496\n", + "6 2035345969173555084 2261\n", + "7 517714522250745823 2119\n", + "8 6762020047108358913 2051\n", + "9 6718662516719813769 1678\n", + "chDB return: UserID count()\n", + "0 1313338681122956954 29097\n", + "1 1907779576417363396 16854\n", + "2 2305303682471783379 10588\n", + "3 6103038218306105832 2994\n", + "4 3631826469396741283 2828\n", + "5 6949028786848070043 2496\n", + "6 2035345969173555084 2261\n", + "7 517714522250745823 2119\n", + "8 6762020047108358913 2051\n", + "9 6718662516719813769 1678\n", + "Q16: SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;\n", + "DuckDB return: 
UserID SearchPhrase count_star()\n", + "0 1313338681122956954 [] 29097\n", + "1 1907779576417363396 [] 16854\n", + "2 2305303682471783379 [] 10588\n", + "3 6103038218306105832 [] 2994\n", + "4 3631826469396741283 [] 2827\n", + "5 6949028786848070043 [] 2496\n", + "6 2035345969173555084 [] 2259\n", + "7 517714522250745823 [] 2119\n", + "8 6762020047108358913 [] 2051\n", + "9 6718662516719813769 [] 1651\n", + "chDB return: UserID SearchPhrase count()\n", + "0 1313338681122956954 29097\n", + "1 1907779576417363396 16854\n", + "2 2305303682471783379 10588\n", + "3 6103038218306105832 2994\n", + "4 3631826469396741283 2827\n", + "5 6949028786848070043 2496\n", + "6 2035345969173555084 2259\n", + "7 517714522250745823 2119\n", + "8 6762020047108358913 2051\n", + "9 6718662516719813769 1651\n", + "Q17: SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;\n", + "DuckDB return: UserID SearchPhrase \\\n", + "0 -5627973928080731456 [208, 188, 208, 190, 208, 179, 209, 131, 209, ... \n", + "1 -5559699448450058766 [] \n", + "2 -5528876654789979294 [] \n", + "3 -5470057502286193376 [] \n", + "4 -5375069617527156279 [] \n", + "5 -5330171504170722833 [] \n", + "6 -5317883671594541868 [] \n", + "7 -5306916608111263156 [209, 128, 208, 181, 209, 134, 208, 181, 208, ... 
\n", + "8 -5300394725647264452 [] \n", + "9 -5255231155009379631 [] \n", + "\n", + " count_star() \n", + "0 1 \n", + "1 7 \n", + "2 51 \n", + "3 6 \n", + "4 2 \n", + "5 2 \n", + "6 24 \n", + "7 1 \n", + "8 2 \n", + "9 2 \n", + "chDB return: UserID SearchPhrase \\\n", + "0 1666914814759040438 ареньера фабриколести нижний вволге \n", + "1 5319950495870793034 \n", + "2 5742717625414611048 \n", + "3 896421534586754490 \n", + "4 4581954397323777304 \n", + "5 4858662943892668247 ники из как манипу \n", + "6 5347912626652471260 порядом 2 2013 ворождения на двернушка 8 месяц \n", + "7 4117481054795982924 \n", + "8 2305483527112189819 маленькина leifheitdsq \n", + "9 3552892264426602525 \n", + "\n", + " count() \n", + "0 1 \n", + "1 17 \n", + "2 8 \n", + "3 1 \n", + "4 2 \n", + "5 1 \n", + "6 1 \n", + "7 40 \n", + "8 1 \n", + "9 1 \n", + "Q18: SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;\n", + "DuckDB return: UserID m SearchPhrase count_star()\n", + "0 1313338681122956954 31 [] 589\n", + "1 1313338681122956954 28 [] 578\n", + "2 1313338681122956954 29 [] 572\n", + "3 1313338681122956954 33 [] 567\n", + "4 1313338681122956954 27 [] 557\n", + "5 1313338681122956954 32 [] 554\n", + "6 1313338681122956954 30 [] 552\n", + "7 1313338681122956954 34 [] 546\n", + "8 1313338681122956954 26 [] 540\n", + "9 1313338681122956954 10 [] 539\n", + "chDB return: UserID m SearchPhrase count()\n", + "0 1313338681122956954 31 589\n", + "1 1313338681122956954 28 578\n", + "2 1313338681122956954 29 572\n", + "3 1313338681122956954 33 567\n", + "4 1313338681122956954 27 557\n", + "5 1313338681122956954 32 554\n", + "6 1313338681122956954 30 552\n", + "7 1313338681122956954 34 546\n", + "8 1313338681122956954 26 540\n", + "9 1313338681122956954 10 539\n", + "Q19: SELECT UserID FROM hits WHERE UserID = 435090932899640449;\n", + "DuckDB return: Empty DataFrame\n", + "Columns: [UserID]\n", + 
"Index: []\n", + "chDB return: Empty DataFrame\n", + "Columns: [UserID]\n", + "Index: []\n", + "Q20: SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';\n", + "DuckDB error: Binder Error: No function matches the given name and argument types '~~(BLOB, STRING_LITERAL)'. You might need to add explicit type casts.\n", + "\tCandidate functions:\n", + "\t~~(VARCHAR, VARCHAR) -> BOOLEAN\n", + "\n", + "LINE 1: SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';\n", + " ^\n", + "chDB return: count()\n", + "0 621\n", + "Q21: SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;\n", + "DuckDB error: Binder Error: No function matches the given name and argument types '~~(BLOB, STRING_LITERAL)'. You might need to add explicit type casts.\n", + "\tCandidate functions:\n", + "\t~~(VARCHAR, VARCHAR) -> BOOLEAN\n", + "\n", + "LINE 1: ...RL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' ...\n", + " ^\n", + "chDB return: SearchPhrase \\\n", + "0 зачать онлайн бесплатно \n", + "1 ани пух ходу \n", + "2 комбактерина кабачки в крополь интерном сад тю... \n", + "3 строитель верси джейкоциты вычета \n", + "4 один инструктура птахани нюши смотреть краси \n", + "5 как миксетин инструкция общая \n", + "6 в август 247 грустимошка на кристрат \n", + "7 славлять породится отели 2013 смотреть \n", + "8 лога в змеиновосибирске в хорошем качестве вне... \n", + "9 михайловар для андроизводские новок \n", + "\n", + " min(URL) c \n", + "0 http://tienskaia-moda-brietielkakh-2%2F%2Fwww.... 2 \n", + "1 http://interinburg/detail.google,yandex.aspx#l... 2 \n", + "2 http://samara.irr.ru/catalog_googleTBR%26ad%3D... 2 \n", + "3 http://ru.tv/smsarhiv/num-9/nf-3/csrf-39818/go... 2 \n", + "4 http://bdsm_position/2624217,2013-07-01:2013/f... 2 \n", + "5 http://samara.irr.ru/catalog_googleMBR%26ad%3D... 2 \n", + "6 http://tienskaia-moda-briuki/google.ru/~apok.r... 
1 \n", + "7 http:%2F%2Fvk.com.ua/google-jarkovskaya-Lipeckd 1 \n", + "8 http://tienskaia-moda-briez%2F&sr=http://voron... 1 \n", + "9 http://psyche.html?1=1&cid=577&oki=1&option=Ju... 1 \n", + "Q22: SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;\n", + "DuckDB error: Binder Error: No function matches the given name and argument types '~~(BLOB, STRING_LITERAL)'. You might need to add explicit type casts.\n", + "\tCandidate functions:\n", + "\t~~(VARCHAR, VARCHAR) -> BOOLEAN\n", + "\n", + "LINE 1: ...DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.goo...\n", + " ^\n", + "chDB return: SearchPhrase \\\n", + "0 коптимиквиды юриста с роуз рая \n", + "1 ведомосквы вместу \n", + "2 коптимиквиды юрий жд ворожные моем \n", + "3 заделать магнездо \n", + "4 вспомидоры,отека обучение стека \n", + "5 авторы для jimm f/4-5.6 dc union arkham текст \n", + "6 ведомосквиталия страции \n", + "7 вспышки нижний эльдар \n", + "8 создать+новосибируюсь песни летние \n", + "9 коптимизаностиницы \n", + "\n", + " min(URL) \\\n", + "0 https://produkty%2Fpulove.ru/booklyattion-war-... \n", + "1 http://mysw.info/newsru.ru/compatible \n", + "2 https://produkty%2Fpulove.ru/booklyattion-war-... \n", + "3 http://auto.ria.ua/search/ab_district=1&cid=57... \n", + "4 https://produkty%2Fpulove.ru/booklyattion-war-... \n", + "5 http://nn.jobinmoscow.ru/real-estate/rent/Sroc... \n", + "6 https://produkty%2Fpulove.ru/booklyattion-war-... \n", + "7 http://mysw.info/newsru.ru/compatible \n", + "8 http://auto.ria.ua/search/ab_district=1&cid=57... \n", + "9 https://produkty%2Fpulove.ru/booklyattion-war-... \n", + "\n", + " min(Title) c uniqExact(UserID) \n", + "0 Легко на участные участников., Цены - Стильная... 45 12 \n", + "1 Convent-менеджер с Google Players 1.3 кв. м.- ... 
17 11 \n", + "2 Легко на участные участников., Цены - Стильная... 16 6 \n", + "3 AUTO.ria.ua: продажа | Востов-на-Дону, чашечка... 13 13 \n", + "4 Легко на участные участников., Цены - Стильная... 10 1 \n", + "5 Google Papa Rapalace Rescu - модной тканика Ас... 9 9 \n", + "6 Легко на участные участников., Цены - Стильная... 8 3 \n", + "7 Convent-менеджер с Google Players 1.3 кв. м.- ... 8 6 \n", + "8 AUTO.ria.ua: продажа | Востов-на-Дону, чашечка... 8 1 \n", + "9 Легко на участные участников., Цены - Стильная... 8 2 \n", + "Q23: SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;\n", + "DuckDB error: Binder Error: No function matches the given name and argument types '~~(BLOB, STRING_LITERAL)'. You might need to add explicit type casts.\n", + "\tCandidate functions:\n", + "\t~~(VARCHAR, VARCHAR) -> BOOLEAN\n", + "\n", + "LINE 1: SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMI...\n", + " ^\n", + "chDB return: WatchID JavaEnable \\\n", + "0 7316105502961799889 1 \n", + "1 5289360038140010777 1 \n", + "2 8187290215265952247 1 \n", + "3 7067335108757864491 1 \n", + "4 9031598395811274817 1 \n", + "5 8603313135134757044 1 \n", + "6 8850598978691021476 1 \n", + "7 8139397706041785641 1 \n", + "8 7270306648984929955 1 \n", + "9 6405590155111045434 1 \n", + "\n", + " Title GoodEvent \\\n", + "0 Аренда 2 игры для женщин в интернет-магазин - ... 1 \n", + "1 Инвеста.Информленны - bonprix collection - Кош... 1 \n", + "2 Инвеста.Информленны - bonprix collection - Кош... 1 \n", + "3 Прогноз поселка - продаже Жена для руб.- Профи... 1 \n", + "4 Инвеста.Информленны - bonprix collection - Кош... 1 \n", + "5 Инвеста.Информленны - bonprix collection - Кош... 1 \n", + "6 Инвеста.Информленны - bonprix collection - Кош... 1 \n", + "7 Инвеста.Информленны - bonprix collection - Кош... 1 \n", + "8 Инвеста.Информленны - bonprix collection - Кош... 1 \n", + "9 Инвеста.Информленны - bonprix collection - Кош... 
1 \n", + "\n", + " EventTime EventDate CounterID ClientIP \\\n", + "0 2013-07-02 05:27:24+08:00 2013-07-02 08:00:00+08:00 7525 1419090217 \n", + "1 2013-07-02 07:02:43+08:00 2013-07-02 08:00:00+08:00 7525 -1260511522 \n", + "2 2013-07-02 07:04:18+08:00 2013-07-02 08:00:00+08:00 7525 -1260511522 \n", + "3 2013-07-02 07:04:26+08:00 2013-07-02 08:00:00+08:00 5822 959273659 \n", + "4 2013-07-02 07:05:21+08:00 2013-07-02 08:00:00+08:00 7525 -1260511522 \n", + "5 2013-07-02 07:05:27+08:00 2013-07-02 08:00:00+08:00 7525 -1260511522 \n", + "6 2013-07-02 07:05:56+08:00 2013-07-02 08:00:00+08:00 7525 -1260511522 \n", + "7 2013-07-02 07:06:41+08:00 2013-07-02 08:00:00+08:00 7525 -1260511522 \n", + "8 2013-07-02 07:07:23+08:00 2013-07-02 08:00:00+08:00 7525 -1260511522 \n", + "9 2013-07-02 07:07:33+08:00 2013-07-02 08:00:00+08:00 7525 -1260511522 \n", + "\n", + " RegionID UserID ... UTMSource UTMMedium UTMCampaign \\\n", + "0 229 3033510353420765788 ... \n", + "1 41 3813931635822850500 ... \n", + "2 41 3813931635822850500 ... \n", + "3 32 736458148605978079 ... \n", + "4 41 3813931635822850500 ... \n", + "5 41 3813931635822850500 ... \n", + "6 41 3813931635822850500 ... \n", + "7 41 3813931635822850500 ... \n", + "8 41 3813931635822850500 ... \n", + "9 41 3813931635822850500 ... 
\n", + "\n", + " UTMContent UTMTerm FromTag HasGCLID RefererHash \\\n", + "0 0 -7095314016616002272 \n", + "1 0 8622994845783504296 \n", + "2 0 8622994845783504296 \n", + "3 0 -7429996293906404352 \n", + "4 0 8622994845783504296 \n", + "5 0 524931272629027392 \n", + "6 0 524931272629027392 \n", + "7 0 524931272629027392 \n", + "8 0 524931272629027392 \n", + "9 0 662346848875253897 \n", + "\n", + " URLHash CLID \n", + "0 -2039922795398915081 0 \n", + "1 441678500069920832 0 \n", + "2 441678500069920832 0 \n", + "3 -4158922421105595558 0 \n", + "4 441678500069920832 0 \n", + "5 775047382916449082 0 \n", + "6 775047382916449082 0 \n", + "7 775047382916449082 0 \n", + "8 775047382916449082 0 \n", + "9 -5547551342880266035 0 \n", + "\n", + "[10 rows x 105 columns]\n", + "Q24: SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;\n", + "DuckDB return: SearchPhrase\n", + "0 [209, 129, 208, 184, 208, 188, 208, 191, 209, ...\n", + "1 [208, 189, 208, 190, 209, 135, 208, 189, 208, ...\n", + "2 [208, 190, 209, 130, 208, 180, 209, 139, 209, ...\n", + "3 [209, 129, 208, 186, 208, 176, 209, 135, 208, ...\n", + "4 [208, 188, 208, 176, 209, 128, 209, 136, 208, ...\n", + "5 [208, 186, 209, 131, 208, 191, 208, 184, 209, ...\n", + "6 [208, 178, 208, 176, 208, 186, 208, 176, 208, ...\n", + "7 [208, 178, 208, 181, 208, 189, 208, 179, 209, ...\n", + "8 [48, 208, 177, 49, 32, 208, 186, 209, 131, 208...\n", + "9 [209, 129, 208, 176, 208, 189, 208, 176, 208, ...\n", + "chDB return: SearchPhrase\n", + "0 ночно китая женщины\n", + "1 симптомы регистратов\n", + "2 скачать читалию в духовке\n", + "3 отдыха чем прокат\n", + "4 маршава нибудь в омске главнованные автобаза ф...\n", + "5 купить ваз 2121099 инжира 1 сезон смотреть онл...\n", + "6 вакансионал 28 неделю вытяжного печь бабка бу ...\n", + "7 венгридический якутии видео ни\n", + "8 санандроид малининец фармарин\n", + "9 0б1 купить в парня смотреть онлайн\n", + "Q25: SELECT SearchPhrase FROM hits WHERE 
SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;\n", + "DuckDB return: SearchPhrase\n", + "0 [32, 209, 129, 208, 178, 208, 181, 209, 130, 2...\n", + "1 [33, 32, 104, 101, 107, 116, 100, 102, 32, 103...\n", + "2 [36, 95, 103, 101, 116, 32, 97, 109, 50, 32, 2...\n", + "3 [36, 95, 103, 101, 116, 32, 105, 116, 32, 111,...\n", + "4 [36, 95, 103, 101, 116, 32, 108, 117, 99, 107,...\n", + "5 [36, 95, 112, 111, 115, 108, 97, 110, 100, 111...\n", + "6 [36, 95, 112, 111, 115, 116, 32, 114, 106, 107...\n", + "7 [36, 95, 112, 111, 115, 116, 97, 114, 115, 104...\n", + "8 [36, 100, 32, 208, 191, 209, 128, 208, 184, 20...\n", + "9 [36, 100, 32, 208, 191, 209, 128, 208, 184, 20...\n", + "chDB return: SearchPhrase\n", + "0 светы женске 2 сезон\n", + "1 ! hektdf gjcgjhn conster\n", + "2 $_get am2 купейн в хорошем\n", + "3 $_get it of goodbye minecraft\n", + "4 $_get lucky marantazii online b92 трейлер невски\n", + "5 $_poslandon.ru/moscow 2 торговлю\n", + "6 $_post rjktcfhtdcr\n", + "7 $_postarshippuden paris stan\n", + "8 $d причина\n", + "9 $d причина\n", + "Q26: SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;\n", + "DuckDB return: SearchPhrase\n", + "0 [208, 189, 208, 190, 209, 135, 208, 189, 208, ...\n", + "1 [209, 129, 208, 184, 208, 188, 208, 191, 209, ...\n", + "2 [208, 190, 209, 130, 208, 180, 209, 139, 209, ...\n", + "3 [209, 129, 208, 186, 208, 176, 209, 135, 208, ...\n", + "4 [208, 186, 209, 131, 208, 191, 208, 184, 209, ...\n", + "5 [208, 188, 208, 176, 209, 128, 209, 136, 208, ...\n", + "6 [208, 178, 208, 176, 208, 186, 208, 176, 208, ...\n", + "7 [208, 178, 208, 181, 208, 189, 208, 179, 209, ...\n", + "8 [48, 208, 177, 49, 32, 208, 186, 209, 131, 208...\n", + "9 [48, 208, 177, 49, 32, 208, 186, 209, 131, 208...\n", + "chDB return: SearchPhrase\n", + "0 ночно китая женщины\n", + "1 симптомы регистратов\n", + "2 отдыха чем прокат\n", + "3 скачать читалию в духовке\n", + "4 купить ваз 2121099 инжира 1 сезон смотреть 
онл...\n", + "5 маршава нибудь в омске главнованные автобаза ф...\n", + "6 вакансионал 28 неделю вытяжного печь бабка бу ...\n", + "7 венгридический якутии видео ни\n", + "8 0б1 купить без програма\n", + "9 0б1 купить в парня смотреть онлайн\n", + "Q27: SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;\n", + "DuckDB error: Binder Error: No function matches the given name and argument types 'strlen(BLOB)'. You might need to add explicit type casts.\n", + "\tCandidate functions:\n", + "\tstrlen(VARCHAR) -> BIGINT\n", + "\n", + "LINE 1: SELECT CounterID, AVG(STRLEN(URL)) AS l, COUNT(*) AS c FROM h...\n", + " ^\n", + "chDB return: CounterID l c\n", + "0 1634 198.148049 315442\n", + "1 786 186.750714 120528\n", + "2 515 126.359674 102793\n", + "3 62 93.217962 613474\n", + "4 3922 87.880246 3861827\n", + "5 38 76.436656 507770\n", + "6 1483 71.266113 869128\n", + "7 2264 67.700580 278338\n", + "8 40367 67.641345 218299\n", + "9 1095 65.021542 363337\n", + "10 1830 64.919784 113980\n", + "11 40206 63.381008 217355\n", + "12 5822 62.768687 383161\n", + "13 1060 61.041178 252489\n", + "14 7525 58.612668 584968\n", + "Q28: SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\u0001') AS k, AVG(STRLEN(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;\n", + "DuckDB error: Binder Error: No function matches the given name and argument types 'regexp_replace(BLOB, STRING_LITERAL, STRING_LITERAL)'. 
You might need to add explicit type casts.\n", + "\tCandidate functions:\n", + "\tregexp_replace(VARCHAR, VARCHAR, VARCHAR) -> VARCHAR\n", + "\tregexp_replace(VARCHAR, VARCHAR, VARCHAR, VARCHAR) -> VARCHAR\n", + "\n", + "LINE 1: SELECT REGEXP_REPLACE(Referer, '^https?://(?:w...\n", + " ^\n", + "chDB return: k l c min(Referer)\n", + "0 \u0001 99.401568 7697804 http://%26ad%3D1%260.html&ei=9e71d2f0b6590/3/w...\n", + "Q29: SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), 
SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;\n", + "DuckDB return: sum(ResolutionWidth) sum((ResolutionWidth + 1)) \\\n", + "0 1.506781e+10 1.507781e+10 \n", + "\n", + " sum((ResolutionWidth + 2)) sum((ResolutionWidth + 3)) \\\n", + "0 1.508781e+10 1.509781e+10 \n", + "\n", + " sum((ResolutionWidth + 4)) sum((ResolutionWidth + 5)) \\\n", + "0 1.510781e+10 1.511781e+10 \n", + "\n", + " sum((ResolutionWidth + 6)) sum((ResolutionWidth + 7)) \\\n", + "0 1.512781e+10 1.513781e+10 \n", + "\n", + " sum((ResolutionWidth + 8)) sum((ResolutionWidth + 9)) ... \\\n", + "0 1.514781e+10 1.515781e+10 ... 
\n", + "\n", + " sum((ResolutionWidth + 80)) sum((ResolutionWidth + 81)) \\\n", + "0 1.586781e+10 1.587781e+10 \n", + "\n", + " sum((ResolutionWidth + 82)) sum((ResolutionWidth + 83)) \\\n", + "0 1.588781e+10 1.589781e+10 \n", + "\n", + " sum((ResolutionWidth + 84)) sum((ResolutionWidth + 85)) \\\n", + "0 1.590781e+10 1.591781e+10 \n", + "\n", + " sum((ResolutionWidth + 86)) sum((ResolutionWidth + 87)) \\\n", + "0 1.592781e+10 1.593781e+10 \n", + "\n", + " sum((ResolutionWidth + 88)) sum((ResolutionWidth + 89)) \n", + "0 1.594781e+10 1.595781e+10 \n", + "\n", + "[1 rows x 90 columns]\n", + "chDB return: sum(ResolutionWidth) sum(plus(ResolutionWidth, 1)) \\\n", + "0 15067814968 15077814968 \n", + "\n", + " sum(plus(ResolutionWidth, 2)) sum(plus(ResolutionWidth, 3)) \\\n", + "0 15087814968 15097814968 \n", + "\n", + " sum(plus(ResolutionWidth, 4)) sum(plus(ResolutionWidth, 5)) \\\n", + "0 15107814968 15117814968 \n", + "\n", + " sum(plus(ResolutionWidth, 6)) sum(plus(ResolutionWidth, 7)) \\\n", + "0 15127814968 15137814968 \n", + "\n", + " sum(plus(ResolutionWidth, 8)) sum(plus(ResolutionWidth, 9)) ... \\\n", + "0 15147814968 15157814968 ... 
\n", + "\n", + " sum(plus(ResolutionWidth, 80)) sum(plus(ResolutionWidth, 81)) \\\n", + "0 15867814968 15877814968 \n", + "\n", + " sum(plus(ResolutionWidth, 82)) sum(plus(ResolutionWidth, 83)) \\\n", + "0 15887814968 15897814968 \n", + "\n", + " sum(plus(ResolutionWidth, 84)) sum(plus(ResolutionWidth, 85)) \\\n", + "0 15907814968 15917814968 \n", + "\n", + " sum(plus(ResolutionWidth, 86)) sum(plus(ResolutionWidth, 87)) \\\n", + "0 15927814968 15937814968 \n", + "\n", + " sum(plus(ResolutionWidth, 88)) sum(plus(ResolutionWidth, 89)) \n", + "0 15947814968 15957814968 \n", + "\n", + "[1 rows x 90 columns]\n", + "Q30: SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;\n", + "DuckDB return: SearchEngineID ClientIP c sum(IsRefresh) avg(ResolutionWidth)\n", + "0 2 -1262139876 189 14.0 1560.063492\n", + "1 2 -927025522 187 26.0 1621.368984\n", + "2 2 -19034471 184 29.0 1734.782609\n", + "3 2 1124827693 182 90.0 1730.005495\n", + "4 95 993936935 176 0.0 1828.000000\n", + "5 2 2128431738 155 26.0 1591.477419\n", + "6 2 2145233773 151 25.0 1578.662252\n", + "7 2 -792059583 148 10.0 1683.074324\n", + "8 2 -1993532306 145 6.0 1625.655172\n", + "9 95 2031325834 138 1.0 1368.000000\n", + "chDB return: SearchEngineID ClientIP c sum(IsRefresh) avg(ResolutionWidth)\n", + "0 2 -1262139876 189 14 1560.063492\n", + "1 2 -927025522 187 26 1621.368984\n", + "2 2 -19034471 184 29 1734.782609\n", + "3 2 1124827693 182 90 1730.005495\n", + "4 95 993936935 176 0 1828.000000\n", + "5 2 2128431738 155 26 1591.477419\n", + "6 2 2145233773 151 25 1578.662252\n", + "7 2 -792059583 148 10 1683.074324\n", + "8 2 -1993532306 145 6 1625.655172\n", + "9 95 2031325834 138 1 1368.000000\n", + "Q31: SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;\n", + "DuckDB 
return: WatchID ClientIP c sum(IsRefresh) avg(ResolutionWidth)\n", + "0 7668931906021847229 -1341721422 1 0.0 1368.0\n", + "1 7165456824243544940 -497906719 1 1.0 1368.0\n", + "2 7274491739174625216 -415569899 1 0.0 1368.0\n", + "3 7406021713284022826 1717291897 1 0.0 1368.0\n", + "4 7299732091516868359 -1664378212 1 1.0 1917.0\n", + "5 4637080173602452460 1171287087 1 0.0 1368.0\n", + "6 5623647906101507338 1282438732 1 1.0 1368.0\n", + "7 4935551290596596970 1597930024 1 0.0 508.0\n", + "8 5356841580292029681 1895254206 1 1.0 1638.0\n", + "9 7508876853831984449 355770987 1 0.0 1368.0\n", + "chDB return: WatchID ClientIP c sum(IsRefresh) avg(ResolutionWidth)\n", + "0 6427115150554230793 736252994 1 0 1996.0\n", + "1 4965054029390764634 -1206595968 1 0 166.0\n", + "2 6030703977865133751 434911724 1 0 1996.0\n", + "3 6691203620596311846 2003800917 1 0 1087.0\n", + "4 5786133618012580033 1390766629 1 0 1368.0\n", + "5 5985454501189037066 1832002778 1 0 1638.0\n", + "6 5494909287200572026 1492278923 1 0 1828.0\n", + "7 4698453950679016700 -1916962470 1 0 1750.0\n", + "8 8745161824300249528 1528045946 1 1 1638.0\n", + "9 7352065519984549840 1557735347 1 0 1638.0\n", + "Q32: SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;\n", + "DuckDB return: WatchID ClientIP c sum(IsRefresh) avg(ResolutionWidth)\n", + "0 5669147813168388202 409164014 1 0.0 1087.0\n", + "1 5187536155584756374 -259055783 1 0.0 1638.0\n", + "2 7850753339231053767 1546731662 1 0.0 888.0\n", + "3 8164199526683261115 -1836087206 1 0.0 1996.0\n", + "4 6959364879658339258 -864817513 1 0.0 1087.0\n", + "5 8064063979301589313 -864817513 1 0.0 1087.0\n", + "6 6501263592117931177 1175231813 1 0.0 1996.0\n", + "7 5081318513238856894 -1675522215 1 0.0 2038.0\n", + "8 7513405411154202290 877381272 1 0.0 1368.0\n", + "9 8207609364521618888 877381272 1 0.0 1368.0\n", + "chDB return: WatchID ClientIP c sum(IsRefresh) 
avg(ResolutionWidth)\n", + "0 7045311802744285412 -1341502114 1 0 1996.0\n", + "1 7997911216135529594 -1050444826 1 0 1750.0\n", + "2 8844035097706011452 1902611968 1 0 0.0\n", + "3 5053190322681433435 -1147935011 1 0 1368.0\n", + "4 6157344501559484646 1722727351 1 0 1638.0\n", + "5 5256342968841438052 749361268 1 0 1638.0\n", + "6 5074356965705409073 1539704498 1 0 508.0\n", + "7 7713773151322457084 53805758 1 0 1087.0\n", + "8 4836369074268702547 2053634497 1 0 1750.0\n", + "9 4848806411334622685 2132338069 1 0 1638.0\n", + "Q33: SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;\n", + "DuckDB return: URL c\n", + "0 [104, 116, 116, 112, 58, 47, 47, 115, 112, 45,... 100821\n", + "1 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 90604\n", + "2 [104, 116, 116, 112, 58, 37, 50, 70, 37, 50, 7... 46281\n", + "3 [104, 116, 116, 112, 58, 47, 47, 107, 111, 109... 43455\n", + "4 [104, 116, 116, 112, 58, 47, 47, 97, 102, 105,... 35161\n", + "5 [104, 116, 116, 112, 58, 47, 47, 115, 112, 45,... 31018\n", + "6 [104, 116, 116, 112, 58, 37, 50, 70, 37, 50, 7... 28878\n", + "7 [104, 116, 116, 112, 58, 47, 47, 97, 102, 105,... 26520\n", + "8 [104, 116, 116, 112, 58, 47, 47, 115, 105, 98,... 25242\n", + "9 [104, 116, 116, 112, 58, 47, 47, 115, 112, 45,... 17068\n", + "chDB return: URL c\n", + "0 http://sp-money.yandex.ru/comme%2F27.0.1453.11... 100821\n", + "1 http://irr.ru/index.php?showalbum/login-leniya... 90604\n", + "2 http:%2F%2Fdlia-zhienskaia-moda-tunika 46281\n", + "3 http://komme%2F27.0.1453.116 43455\n", + "4 http://afisha.yandex.ru/region/vacancies 35161\n", + "5 http://sp-money.yandex.ru%26target 31018\n", + "6 http:%2F%2Fwwww.bonprix.ru/mosclinindzya 28878\n", + "7 http://afisha.yandex.ru/region-ware-ne-niz%2F%... 
26520\n", + "8 http://sib1.adriver 25242\n", + "9 http://sp-money.yandex.ua/search&event=little 17068\n", + "Q34: SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;\n", + "DuckDB return: 1 URL c\n", + "0 1 [104, 116, 116, 112, 58, 47, 47, 115, 112, 45,... 100821\n", + "1 1 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 90604\n", + "2 1 [104, 116, 116, 112, 58, 37, 50, 70, 37, 50, 7... 46281\n", + "3 1 [104, 116, 116, 112, 58, 47, 47, 107, 111, 109... 43455\n", + "4 1 [104, 116, 116, 112, 58, 47, 47, 97, 102, 105,... 35161\n", + "5 1 [104, 116, 116, 112, 58, 47, 47, 115, 112, 45,... 31018\n", + "6 1 [104, 116, 116, 112, 58, 37, 50, 70, 37, 50, 7... 28878\n", + "7 1 [104, 116, 116, 112, 58, 47, 47, 97, 102, 105,... 26520\n", + "8 1 [104, 116, 116, 112, 58, 47, 47, 115, 105, 98,... 25242\n", + "9 1 [104, 116, 116, 112, 58, 47, 47, 115, 112, 45,... 17068\n", + "chDB return: 1 URL c\n", + "0 1 http://sp-money.yandex.ru/comme%2F27.0.1453.11... 100821\n", + "1 1 http://irr.ru/index.php?showalbum/login-leniya... 90604\n", + "2 1 http:%2F%2Fdlia-zhienskaia-moda-tunika 46281\n", + "3 1 http://komme%2F27.0.1453.116 43455\n", + "4 1 http://afisha.yandex.ru/region/vacancies 35161\n", + "5 1 http://sp-money.yandex.ru%26target 31018\n", + "6 1 http:%2F%2Fwwww.bonprix.ru/mosclinindzya 28878\n", + "7 1 http://afisha.yandex.ru/region-ware-ne-niz%2F%... 
26520\n", + "8 1 http://sib1.adriver 25242\n", + "9 1 http://sp-money.yandex.ua/search&event=little 17068\n", + "Q35: SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;\n", + "DuckDB return: ClientIP (ClientIP - 1) (ClientIP - 2) (ClientIP - 3) c\n", + "0 -1698104457 -1698104458 -1698104459 -1698104460 29119\n", + "1 -1175819552 -1175819553 -1175819554 -1175819555 16854\n", + "2 -1206311089 -1206311090 -1206311091 -1206311092 6087\n", + "3 720685641 720685640 720685639 720685638 5420\n", + "4 1515409054 1515409053 1515409052 1515409051 4254\n", + "5 1928873128 1928873127 1928873126 1928873125 3290\n", + "6 -1323047292 -1323047293 -1323047294 -1323047295 2998\n", + "7 -1313501018 -1313501019 -1313501020 -1313501021 2746\n", + "8 1151807695 1151807694 1151807693 1151807692 2702\n", + "9 -267589304 -267589305 -267589306 -267589307 2526\n", + "chDB return: ClientIP minus(ClientIP, 1) minus(ClientIP, 2) minus(ClientIP, 3) \\\n", + "0 -1698104457 -1698104458 -1698104459 -1698104460 \n", + "1 -1175819552 -1175819553 -1175819554 -1175819555 \n", + "2 -1206311089 -1206311090 -1206311091 -1206311092 \n", + "3 720685641 720685640 720685639 720685638 \n", + "4 1515409054 1515409053 1515409052 1515409051 \n", + "5 1928873128 1928873127 1928873126 1928873125 \n", + "6 -1323047292 -1323047293 -1323047294 -1323047295 \n", + "7 -1313501018 -1313501019 -1313501020 -1313501021 \n", + "8 1151807695 1151807694 1151807693 1151807692 \n", + "9 -267589304 -267589305 -267589306 -267589307 \n", + "\n", + " c \n", + "0 29119 \n", + "1 16854 \n", + "2 6087 \n", + "3 5420 \n", + "4 4254 \n", + "5 3290 \n", + "6 2998 \n", + "7 2746 \n", + "8 2702 \n", + "9 2526 \n", + "Q36: SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY 
PageViews DESC LIMIT 10;\n", + "DuckDB return: URL PageViews\n", + "0 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 85646\n", + "1 [104, 116, 116, 112, 58, 47, 47, 107, 111, 109... 42422\n", + "2 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 15165\n", + "3 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 13779\n", + "4 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 10559\n", + "5 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 8997\n", + "6 [104, 116, 116, 112, 58, 47, 47, 107, 111, 109... 6322\n", + "7 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 3633\n", + "8 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 3363\n", + "9 [104, 116, 116, 112, 58, 47, 47, 107, 111, 109... 2538\n", + "chDB return: URL PageViews\n", + "0 http://irr.ru/index.php?showalbum/login-leniya... 85646\n", + "1 http://komme%2F27.0.1453.116 42422\n", + "2 http://irr.ru/index.php?showalbum/login-kapust... 15165\n", + "3 http://irr.ru/index.php?showalbum/login-kapust... 13779\n", + "4 http://irr.ru/index.php 10559\n", + "5 http://irr.ru/index.php?showalbum/login 8997\n", + "6 http://komme%2F27.0.1453.116 Safari%2F5.0 (com... 6322\n", + "7 http://irr.ru/index.php?showalbum/login-kupalnik 3633\n", + "8 http://irr.ru/index.php?showalbum/login-kapust... 3363\n", + "9 http://komme%2F27.0.1453.116 Safari 2538\n", + "Q37: SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;\n", + "DuckDB return: Title PageViews\n", + "0 [208, 162, 208, 181, 209, 129, 209, 130, 32, 4... 102228\n", + "1 [208, 168, 208, 176, 209, 128, 208, 176, 209, ... 68968\n", + "2 [208, 159, 209, 128, 208, 184, 208, 188, 208, ... 67496\n", + "3 [208, 145, 209, 128, 209, 142, 208, 186, 208, ... 31750\n", + "4 [208, 162, 208, 181, 208, 191, 208, 187, 208, ... 19270\n", + "5 [68, 97, 118, 101, 32, 97, 110, 100, 32, 72, 1... 
11962\n", + "6 [208, 159, 209, 128, 208, 184, 208, 188, 208, ... 11618\n", + "7 [65, 85, 84, 79, 46, 114, 105, 97, 46, 117, 97... 11611\n", + "8 [79, 87, 65, 80, 114, 111, 102, 101, 115, 115,... 8965\n", + "9 [208, 162, 209, 128, 209, 131, 209, 129, 208, ... 8445\n", + "chDB return: Title PageViews\n", + "0 Тест (Россия) - Яндекс 102228\n", + "1 Шарарай), Выбрать! - обсуждаются на голд: Шоуб... 68968\n", + "2 Приморск - IRR.ru 67496\n", + "3 Брюки New Era H (Асус) 258 общая выплаток, гор... 31750\n", + "4 Теплоску на 19270\n", + "5 Dave and Hotpoint sport – самые вещие 11962\n", + "6 Приморск (Россия) - Яндекс.Видео 11618\n", + "7 AUTO.ria.ua ™ - Аппер 11611\n", + "8 OWAProfessign), продать 8965\n", + "9 Труси - Шоубиз 8445\n", + "Q38: SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;\n", + "DuckDB return: URL PageViews\n", + "0 [104, 116, 116, 112, 58, 47, 47, 115, 109, 101... 2\n", + "1 [104, 116, 116, 112, 58, 47, 47, 115, 116, 97,... 2\n", + "2 [104, 116, 116, 112, 58, 47, 47, 109, 97, 115,... 2\n", + "3 [104, 116, 116, 112, 58, 47, 47, 101, 120, 116... 2\n", + "4 [104, 116, 116, 112, 58, 47, 47, 103, 117, 105... 2\n", + "5 [104, 116, 116, 112, 58, 47, 47, 97, 102, 105,... 2\n", + "6 [104, 116, 116, 112, 58, 47, 47, 106, 111, 98,... 2\n", + "7 [104, 116, 116, 112, 37, 51, 65, 37, 50, 53, 5... 2\n", + "8 [104, 116, 116, 112, 58, 47, 47, 118, 105, 100... 2\n", + "9 [104, 116, 116, 112, 58, 47, 47, 97, 108, 112,... 2\n", + "chDB return: URL PageViews\n", + "0 http://pogoda.yandex.ru/places/premiery%2Fpage... 2\n", + "1 http://stalker-pub-20087898675494,960948/#page... 2\n", + "2 http://stalker-pub-20087898675494,960948/#page... 2\n", + "3 http://video.yandex.ru/price_do=¤cy 2\n", + "4 http://kemerovokuznetsi/#tabsinternet Explorer... 
2\n", + "5 http://afisha.yandex.ru/?favorite_off=FORID:10... 2\n", + "6 http://sravni.ru/search=0&city_id 2\n", + "7 http://komanipulya33497_36779014826580/price_o... 2\n", + "8 http://koolti-1t.html?sort=price=меньше 0/manu... 2\n", + "9 http://bonprix.ru/omsk/event=big 2\n", + "Q39: SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;\n", + "DuckDB return: TraficSourceID SearchEngineID AdvEngineID \\\n", + "0 -1 0 0 \n", + "1 0 0 0 \n", + "2 -1 0 0 \n", + "3 0 0 0 \n", + "4 1 0 0 \n", + "5 1 0 0 \n", + "6 3 2 0 \n", + "7 -1 0 0 \n", + "8 -1 0 0 \n", + "9 -1 0 0 \n", + "\n", + " Src \\\n", + "0 [104, 116, 116, 112, 58, 47, 47, 115, 116, 97,... \n", + "1 [] \n", + "2 [104, 116, 116, 112, 58, 47, 47, 115, 116, 97,... \n", + "3 [] \n", + "4 [104, 116, 116, 112, 58, 47, 47, 109, 121, 115... \n", + "5 [104, 116, 116, 112, 58, 47, 47, 103, 111, 111... \n", + "6 [] \n", + "7 [104, 116, 116, 112, 58, 47, 47, 107, 105, 110... \n", + "8 [104, 116, 116, 112, 58, 47, 47, 115, 116, 97,... \n", + "9 [104, 116, 116, 112, 58, 47, 47, 115, 116, 97,... \n", + "\n", + " Dst PageViews \n", + "0 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 13 \n", + "1 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 13 \n", + "2 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 13 \n", + "3 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 13 \n", + "4 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 13 \n", + "5 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 13 \n", + "6 [104, 116, 116, 112, 58, 47, 47, 107, 111, 109... 13 \n", + "7 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 13 \n", + "8 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 
13 \n", + "9 [104, 116, 116, 112, 58, 47, 47, 105, 114, 114... 13 \n", + "chDB return: TraficSourceID SearchEngineID AdvEngineID \\\n", + "0 -1 0 0 \n", + "1 -1 0 0 \n", + "2 5 0 0 \n", + "3 -1 0 0 \n", + "4 -1 0 0 \n", + "5 -1 0 0 \n", + "6 -1 0 0 \n", + "7 -1 0 0 \n", + "8 -1 0 0 \n", + "9 1 0 0 \n", + "\n", + " Src \\\n", + "0 http://state=19945206/foto-4/login-2491724/?bu... \n", + "1 http://state=19945206/foto-4/login-avanga_728x... \n", + "2 http://state=19945206/foto-4/login-2006/make=К... \n", + "3 http://state=19945206/foto-4/login-2491724/?bu... \n", + "4 http://state=19945206/foto-4/login-kupe_921675... \n", + "5 http://state=19945206/foto-4/login-2006/makumi... \n", + "6 http://state=19945206/foto-4/login-2006/makumi... \n", + "7 http://state=19945206/foto-4/login-2006/makumi... \n", + "8 http://state=19945206/foto-4/login-2491724/?bu... \n", + "9 http://acase.php?input_who1=2&input_who2=1 \n", + "\n", + " Dst PageViews \n", + "0 http://irr.ru/index.php?showalbum/login-kapust... 13 \n", + "1 http://irr.ru/index.php?showalbum/login-kapust... 13 \n", + "2 http://ekburg.irr.ru%2Fpuloveplanet 13 \n", + "3 http://irr.ru/index.php?showalbum/login-kapust... 13 \n", + "4 http://irr.ru/index.php?showalbum/login.html_p... 13 \n", + "5 http://irr.ru/index.php?showalbum/login-kupalj... 13 \n", + "6 http://irr.ru/index.php?showalbum/login-leniya... 13 \n", + "7 http://irr.ru/index.php?showalbum/login-leniya... 13 \n", + "8 http://irr.ru/index.php?showalbum/login-kapust... 13 \n", + "9 http://komme%2F27.0.1453.116 Safari%2F5.0 (com... 
13 \n", + "Q40: SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;\n", + "DuckDB return: URLHash EventDate PageViews\n", + "0 8436286387721556030 2013-07-15 23\n", + "1 -1285046671250476833 2013-07-15 23\n", + "2 -8435826299601811261 2013-07-15 23\n", + "3 7719727592795372103 2013-07-15 22\n", + "4 3756346524397046411 2013-07-15 22\n", + "5 -3172049944036544851 2013-07-15 22\n", + "6 -3950137591013798111 2013-07-15 22\n", + "7 2680587802399303961 2013-07-15 22\n", + "8 1387759335351574242 2013-07-15 22\n", + "9 -6314751298222231545 2013-07-15 21\n", + "chDB return: URLHash EventDate PageViews\n", + "0 8436286387721556030 2013-07-15 08:00:00+08:00 23\n", + "1 7516345568886640333 2013-07-15 08:00:00+08:00 23\n", + "2 -8435826299601811261 2013-07-15 08:00:00+08:00 23\n", + "3 -3950137591013798111 2013-07-15 08:00:00+08:00 22\n", + "4 7719727592795372103 2013-07-15 08:00:00+08:00 22\n", + "5 1387759335351574242 2013-07-15 08:00:00+08:00 22\n", + "6 3756346524397046411 2013-07-15 08:00:00+08:00 22\n", + "7 2680587802399303961 2013-07-15 08:00:00+08:00 22\n", + "8 -3172049944036544851 2013-07-15 08:00:00+08:00 22\n", + "9 -6841350226646068633 2013-07-15 08:00:00+08:00 21\n", + "Q41: SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;\n", + "DuckDB return: Empty DataFrame\n", + "Columns: [WindowClientWidth, WindowClientHeight, PageViews]\n", + "Index: []\n", + "chDB return: Empty DataFrame\n", + "Columns: [WindowClientWidth, WindowClientHeight, PageViews]\n", 
+ "Index: []\n", + "Q42: SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;\n", + "DuckDB return: M PageViews\n", + "0 2013-07-15 12:40:00 434\n", + "1 2013-07-15 12:41:00 378\n", + "2 2013-07-15 12:42:00 395\n", + "3 2013-07-15 12:43:00 391\n", + "4 2013-07-15 12:44:00 366\n", + "5 2013-07-15 12:45:00 406\n", + "6 2013-07-15 12:46:00 395\n", + "7 2013-07-15 12:47:00 381\n", + "8 2013-07-15 12:48:00 385\n", + "9 2013-07-15 12:49:00 415\n", + "chDB return: Empty DataFrame\n", + "Columns: [M, PageViews]\n", + "Index: []\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Benchmark results\n", + "duckdb_times = []\n", + "chdb_times = []\n", + "\n", + "counter = 0\n", + "for query in queries:\n", + " duckdb_time, chdb_time = bench(query)\n", + " duckdb_times.append(duckdb_time)\n", + " chdb_times.append(chdb_time)\n", + "\n", + "x = range(len(queries))\n", + "xlable = [f\"Q{num}\" for num in x]\n", + "width = 0.35\n", + "\n", + "fig, ax = plt.subplots(figsize=(10, 6), dpi=300)\n", + "\n", + "rects1 = ax.bar(x, duckdb_times, width, label=\"DuckDB\")\n", + "rects2 = ax.bar([i + width for i in x], chdb_times, width, label=\"chDB\")\n", + "\n", + "ax.set_ylabel(\"Time (s)\")\n", + "ax.set_title(\"Benchmark Results\")\n", + "ax.set_xticks([i + width / 2 for i in x])\n", + "ax.set_xticklabels(xlable, rotation=90)\n", + "ax.legend()\n", + "\n", + "# Add the value of each bar on top\n", + "for rect in rects1 + rects2:\n", + " height = rect.get_height()\n", + " ax.annotate(\n", + " f\"{height:.2f}\",\n", + " xy=(rect.get_x() + rect.get_width() / 2, height),\n", + " xytext=(0, 3),\n", + " textcoords=\"offset points\",\n", + " ha=\"center\",\n", + " va=\"bottom\",\n", + " rotation=90, # Rotate the text 90°\n", + " fontsize=5, # Set the font size to a smaller value\n", + " )\n", + "\n", + "\n", + "fig.tight_layout()\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}