From 5778260b51da73c8c07f376686af1148896d42e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hannes=20M=C3=BChleisen?= Date: Wed, 11 Oct 2023 12:30:53 +0200 Subject: [PATCH] update to duckdb 0.9.1 --- src/duckdb/extension/icu/icu-makedate.cpp | 2 +- src/duckdb/extension/icu/icu-strptime.cpp | 2 - src/duckdb/extension/icu/icu_extension.cpp | 1 - src/duckdb/extension/json/json_functions.cpp | 11 +- .../json/json_functions/json_create.cpp | 41 +++-- .../json/json_functions/json_transform.cpp | 40 +++-- .../extension/parquet/column_reader.cpp | 27 +++- .../extension/parquet/column_writer.cpp | 11 +- .../parquet/include/column_reader.hpp | 2 + .../parquet/include/parquet_bss_decoder.hpp | 49 ++++++ .../extension/parquet/parquet_extension.cpp | 7 +- .../extension/parquet/parquet_timestamp.cpp | 7 +- .../src/common/arrow/appender/list_data.cpp | 4 +- .../src/common/arrow/appender/map_data.cpp | 25 +-- .../src/common/arrow/appender/struct_data.cpp | 4 +- .../src/common/arrow/appender/union_data.cpp | 4 +- .../src/common/arrow/arrow_appender.cpp | 33 +++- src/duckdb/src/common/arrow/arrow_wrapper.cpp | 6 +- src/duckdb/src/common/enum_util.cpp | 2 +- src/duckdb/src/common/exception.cpp | 144 ++++++++---------- src/duckdb/src/common/preserved_error.cpp | 20 +++ .../common/serializer/binary_deserializer.cpp | 6 +- src/duckdb/src/common/types/data_chunk.cpp | 2 +- .../src/core_functions/scalar/map/map.cpp | 98 ++++++++---- .../expression_executor/execute_reference.cpp | 2 +- .../execution/expression_executor_state.cpp | 10 +- .../csv_scanner/buffered_csv_reader.cpp | 2 +- .../csv_scanner/csv_state_machine_cache.cpp | 89 +++++------ .../csv_scanner/parallel_csv_reader.cpp | 22 +-- .../csv_scanner/sniffer/csv_sniffer.cpp | 46 +++--- .../csv_scanner/sniffer/dialect_detection.cpp | 17 +-- .../csv_scanner/sniffer/header_detection.cpp | 11 +- .../csv_scanner/sniffer/type_detection.cpp | 14 +- .../csv_scanner/sniffer/type_refinement.cpp | 21 +-- .../csv_scanner/sniffer/type_replacement.cpp | 4 +- .../operator/helper/physical_reset.cpp | 5 +- .../operator/helper/physical_set.cpp | 6 +- .../execution/perfect_aggregate_hashtable.cpp | 10 +- .../execution/radix_partitioned_hashtable.cpp | 2 +- src/duckdb/src/function/function_binder.cpp | 2 +- .../src/function/table/arrow_conversion.cpp | 3 +- src/duckdb/src/function/table/read_csv.cpp | 2 +- .../function/table/version/pragma_version.cpp | 4 +- src/duckdb/src/include/duckdb.h | 10 +- .../common/arrow/appender/append_data.hpp | 4 + .../common/arrow/appender/enum_data.hpp | 4 +- .../duckdb/common/arrow/arrow_appender.hpp | 3 +- .../duckdb/common/arrow/arrow_wrapper.hpp | 3 + .../src/include/duckdb/common/exception.hpp | 1 + .../include/duckdb/common/preserved_error.hpp | 4 +- .../serializer/serialization_traits.hpp | 1 + .../execution/expression_executor_state.hpp | 2 +- .../operator/scan/csv/base_csv_reader.hpp | 4 - .../operator/scan/csv/csv_sniffer.hpp | 22 +-- .../execution/operator/scan/csv/csv_state.hpp | 28 ++++ .../operator/scan/csv/csv_state_machine.hpp | 23 ++- .../scan/csv/csv_state_machine_cache.hpp | 26 +++- .../operator/scan/csv/parallel_csv_reader.hpp | 2 +- .../duckdb/function/replacement_scan.hpp | 20 +++ src/duckdb/src/include/duckdb/main/config.hpp | 2 + .../duckdb/optimizer/filter_pushdown.hpp | 2 + .../src/include/duckdb/planner/binder.hpp | 1 + .../duckdb/planner/bound_parameter_map.hpp | 3 + .../duckdb/planner/expression_binder.hpp | 4 +- src/duckdb/src/main/capi/arrow-c.cpp | 11 +- src/duckdb/src/main/config.cpp | 14 ++ .../src/main/extension/extension_helper.cpp | 7 + .../src/main/extension/extension_install.cpp | 26 ++-- .../optimizer/common_aggregate_optimizer.cpp | 4 +- src/duckdb/src/optimizer/filter_pushdown.cpp | 1 + .../optimizer/pushdown/pushdown_distinct.cpp | 19 +++ .../transform/statement/transform_copy.cpp | 6 +- .../statement/transform_create_sequence.cpp | 15 +- .../expression/bind_between_expression.cpp | 12 +- .../expression/bind_collate_expression.cpp | 6 +- .../expression/bind_comparison_expression.cpp | 31 ++-- .../binder/query_node/bind_select_node.cpp | 17 +-- .../planner/binder/statement/bind_create.cpp | 16 +- .../planner/binder/tableref/plan_joinref.cpp | 3 + .../src/planner/bound_parameter_map.cpp | 21 ++- .../expression_binder/base_select_binder.cpp | 7 +- src/duckdb/src/planner/planner.cpp | 2 +- .../transaction/duck_transaction_manager.cpp | 22 +-- .../third_party/parquet/parquet_types.h | 3 +- src/duckdb/ub_src_optimizer_pushdown.cpp | 2 + 85 files changed, 763 insertions(+), 471 deletions(-) create mode 100644 src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp create mode 100644 src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state.hpp create mode 100644 src/duckdb/src/optimizer/pushdown/pushdown_distinct.cpp diff --git a/src/duckdb/extension/icu/icu-makedate.cpp b/src/duckdb/extension/icu/icu-makedate.cpp index c169da7e..bcc78e4b 100644 --- a/src/duckdb/extension/icu/icu-makedate.cpp +++ b/src/duckdb/extension/icu/icu-makedate.cpp @@ -23,7 +23,7 @@ struct ICUMakeDate : public ICUDateFunc { } // Extract the time zone parts - auto micros = SetTime(calendar, instant); + SetTime(calendar, instant); const auto era = ExtractField(calendar, UCAL_ERA); const auto year = ExtractField(calendar, UCAL_YEAR); const auto mm = ExtractField(calendar, UCAL_MONTH) + 1; diff --git a/src/duckdb/extension/icu/icu-strptime.cpp b/src/duckdb/extension/icu/icu-strptime.cpp index 68c351f8..138f89af 100644 --- a/src/duckdb/extension/icu/icu-strptime.cpp +++ b/src/duckdb/extension/icu/icu-strptime.cpp @@ -96,7 +96,6 @@ struct ICUStrptime : public ICUDateFunc { auto &info = func_expr.bind_info->Cast(); CalendarPtr calendar_ptr(info.calendar->clone()); auto calendar = calendar_ptr.get(); - auto &formats = info.formats; D_ASSERT(fmt_arg.GetVectorType() == VectorType::CONSTANT_VECTOR); @@ -126,7 +125,6 @@ struct ICUStrptime : public ICUDateFunc { auto &info = func_expr.bind_info->Cast(); CalendarPtr calendar_ptr(info.calendar->clone()); auto calendar = calendar_ptr.get(); - auto &formats = info.formats; D_ASSERT(fmt_arg.GetVectorType() == VectorType::CONSTANT_VECTOR); diff --git a/src/duckdb/extension/icu/icu_extension.cpp b/src/duckdb/extension/icu/icu_extension.cpp index bdd95144..2ac45079 100644 --- a/src/duckdb/extension/icu/icu_extension.cpp +++ b/src/duckdb/extension/icu/icu_extension.cpp @@ -223,7 +223,6 @@ static void SetICUCalendar(ClientContext &context, SetScope scope, Value ¶me void IcuExtension::Load(DuckDB &ddb) { auto &db = *ddb.instance; - auto &catalog = Catalog::GetSystemCatalog(db); // iterate over all the collations int32_t count; diff --git a/src/duckdb/extension/json/json_functions.cpp b/src/duckdb/extension/json/json_functions.cpp index 97a2cb4d..72246ab8 100644 --- a/src/duckdb/extension/json/json_functions.cpp +++ b/src/duckdb/extension/json/json_functions.cpp @@ -189,16 +189,7 @@ vector JSONFunctions::GetTableFunctions() { unique_ptr JSONFunctions::ReadJSONReplacement(ClientContext &context, const string &table_name, ReplacementScanData *data) { - auto lower_name = StringUtil::Lower(table_name); - // remove any compression - if (StringUtil::EndsWith(lower_name, ".gz")) { - lower_name = lower_name.substr(0, lower_name.size() - 3); - } else if (StringUtil::EndsWith(lower_name, ".zst")) { - lower_name = lower_name.substr(0, lower_name.size() - 4); - } - if (!StringUtil::EndsWith(lower_name, ".json") && !StringUtil::Contains(lower_name, ".json?") && - !StringUtil::EndsWith(lower_name, ".jsonl") && !StringUtil::Contains(lower_name, ".jsonl?") && - !StringUtil::EndsWith(lower_name, ".ndjson") && !StringUtil::Contains(lower_name, ".ndjson?")) { + if (!ReplacementScan::CanReplace(table_name, {"json", "jsonl", "ndjson"})) { return nullptr; } auto table_function = make_uniq(); diff --git a/src/duckdb/extension/json/json_functions/json_create.cpp b/src/duckdb/extension/json/json_functions/json_create.cpp index 12093577..a5f6c3c6 100644 --- a/src/duckdb/extension/json/json_functions/json_create.cpp +++ b/src/duckdb/extension/json/json_functions/json_create.cpp @@ -682,20 +682,33 @@ BoundCastInfo AnyToJSONCastBind(BindCastInput &input, const LogicalType &source, } void JSONFunctions::RegisterJSONCreateCastFunctions(CastFunctionSet &casts) { - auto json_to_any_cost = casts.ImplicitCastCost(LogicalType::ANY, JSONCommon::JSONType()); - casts.RegisterCastFunction(LogicalType::ANY, JSONCommon::JSONType(), AnyToJSONCastBind, json_to_any_cost); - - const auto struct_type = LogicalType::STRUCT({{"any", LogicalType::ANY}}); - auto struct_to_json_cost = casts.ImplicitCastCost(struct_type, LogicalType::VARCHAR) - 2; - casts.RegisterCastFunction(struct_type, JSONCommon::JSONType(), AnyToJSONCastBind, struct_to_json_cost); - - const auto list_type = LogicalType::LIST(LogicalType::ANY); - auto list_to_json_cost = casts.ImplicitCastCost(list_type, LogicalType::VARCHAR) - 2; - casts.RegisterCastFunction(list_type, JSONCommon::JSONType(), AnyToJSONCastBind, list_to_json_cost); - - const auto map_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY); - auto map_to_json_cost = casts.ImplicitCastCost(map_type, LogicalType::VARCHAR) - 2; - casts.RegisterCastFunction(map_type, JSONCommon::JSONType(), AnyToJSONCastBind, map_to_json_cost); + // Anything can be cast to JSON + for (const auto &type : LogicalType::AllTypes()) { + LogicalType source_type; + switch (type.id()) { + case LogicalTypeId::STRUCT: + source_type = LogicalType::STRUCT({{"any", LogicalType::ANY}}); + break; + case LogicalTypeId::LIST: + source_type = LogicalType::LIST(LogicalType::ANY); + break; + case LogicalTypeId::MAP: + source_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY); + break; + case LogicalTypeId::UNION: + source_type = LogicalType::UNION({{"any", LogicalType::ANY}}); + break; + case LogicalTypeId::VARCHAR: + // We skip this one here as it's handled in json_functions.cpp + continue; + default: + source_type = type; + } + // We prefer going to JSON over going to VARCHAR if a function can do either + const auto source_to_json_cost = + MaxValue(casts.ImplicitCastCost(source_type, LogicalType::VARCHAR) - 1, 0); + casts.RegisterCastFunction(source_type, JSONCommon::JSONType(), AnyToJSONCastBind, source_to_json_cost); + } } } // namespace duckdb diff --git a/src/duckdb/extension/json/json_functions/json_transform.cpp b/src/duckdb/extension/json/json_functions/json_transform.cpp index a9b3f15c..d0b5af4e 100644 --- a/src/duckdb/extension/json/json_functions/json_transform.cpp +++ b/src/duckdb/extension/json/json_functions/json_transform.cpp @@ -898,20 +898,32 @@ BoundCastInfo JSONToAnyCastBind(BindCastInput &input, const LogicalType &source, } void JSONFunctions::RegisterJSONTransformCastFunctions(CastFunctionSet &casts) { - auto json_to_any_cost = casts.ImplicitCastCost(JSONCommon::JSONType(), LogicalType::ANY); - casts.RegisterCastFunction(JSONCommon::JSONType(), LogicalType::ANY, JSONToAnyCastBind, json_to_any_cost); - - const auto struct_type = LogicalType::STRUCT({{"any", LogicalType::ANY}}); - auto json_to_struct_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, struct_type) - 2; - casts.RegisterCastFunction(JSONCommon::JSONType(), struct_type, JSONToAnyCastBind, json_to_struct_cost); - - const auto list_type = LogicalType::LIST(LogicalType::ANY); - auto json_to_list_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, list_type) - 2; - casts.RegisterCastFunction(JSONCommon::JSONType(), list_type, JSONToAnyCastBind, json_to_list_cost); - - const auto map_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY); - auto json_to_map_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, map_type) - 2; - casts.RegisterCastFunction(JSONCommon::JSONType(), map_type, JSONToAnyCastBind, json_to_map_cost); + // JSON can be cast to anything + for (const auto &type : LogicalType::AllTypes()) { + LogicalType target_type; + switch (type.id()) { + case LogicalTypeId::STRUCT: + target_type = LogicalType::STRUCT({{"any", LogicalType::ANY}}); + break; + case LogicalTypeId::LIST: + target_type = LogicalType::LIST(LogicalType::ANY); + break; + case LogicalTypeId::MAP: + target_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY); + break; + case LogicalTypeId::UNION: + target_type = LogicalType::UNION({{"any", LogicalType::ANY}}); + break; + case LogicalTypeId::VARCHAR: + // We skip this one here as it's handled in json_functions.cpp + continue; + default: + target_type = type; + } + // Going from JSON to another type has the same cost as going from VARCHAR to that type + const auto json_to_target_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, target_type); + casts.RegisterCastFunction(JSONCommon::JSONType(), target_type, JSONToAnyCastBind, json_to_target_cost); + } } } // namespace duckdb diff --git a/src/duckdb/extension/parquet/column_reader.cpp b/src/duckdb/extension/parquet/column_reader.cpp index 0a0510a8..2c13b5f4 100644 --- a/src/duckdb/extension/parquet/column_reader.cpp +++ b/src/duckdb/extension/parquet/column_reader.cpp @@ -243,6 +243,7 @@ void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector(block->ptr, block->len - 1); + block->inc(block->len); + break; + } case Encoding::PLAIN: // nothing to do here, will be read directly below break; @@ -488,7 +496,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr idx_t null_count = 0; - if ((dict_decoder || dbp_decoder || rle_decoder) && HasDefines()) { + if ((dict_decoder || dbp_decoder || rle_decoder || bss_decoder) && HasDefines()) { // we need the null count because the dictionary offsets have no entries for nulls for (idx_t i = 0; i < read_now; i++) { if (define_out[i + result_offset] != max_define) { @@ -534,6 +542,23 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr } else if (byte_array_data) { // DELTA_BYTE_ARRAY or DELTA_LENGTH_BYTE_ARRAY DeltaByteArray(define_out, read_now, filter, result_offset, result); + } else if (bss_decoder) { + auto read_buf = make_shared(); + + switch (schema.type) { + case duckdb_parquet::format::Type::FLOAT: + read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count)); + bss_decoder->GetBatch(read_buf->ptr, read_now - null_count); + break; + case duckdb_parquet::format::Type::DOUBLE: + read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count)); + bss_decoder->GetBatch(read_buf->ptr, read_now - null_count); + break; + default: + throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data"); + } + + Plain(read_buf, define_out, read_now, filter, result_offset, result); } else { PlainReference(block, result); Plain(block, define_out, read_now, filter, result_offset, result); diff --git a/src/duckdb/extension/parquet/column_writer.cpp b/src/duckdb/extension/parquet/column_writer.cpp index 83b1c1a7..e9ef6e88 100644 --- a/src/duckdb/extension/parquet/column_writer.cpp +++ b/src/duckdb/extension/parquet/column_writer.cpp @@ -796,6 +796,13 @@ struct ParquetTimestampSOperator : public BaseParquetOperator { } }; +struct ParquetTimeTZOperator : public BaseParquetOperator { + template + static TGT Operation(SRC input) { + return input.time().micros; + } +}; + struct ParquetHugeintOperator { template static TGT Operation(SRC input) { @@ -1975,12 +1982,14 @@ unique_ptr ColumnWriter::CreateWriterRecursive(vector>(writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls); + case LogicalTypeId::TIME_TZ: + return make_uniq>( + writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls); case LogicalTypeId::HUGEINT: return make_uniq>( writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls); diff --git a/src/duckdb/extension/parquet/include/column_reader.hpp b/src/duckdb/extension/parquet/include/column_reader.hpp index 029b1103..3d6491ce 100644 --- a/src/duckdb/extension/parquet/include/column_reader.hpp +++ b/src/duckdb/extension/parquet/include/column_reader.hpp @@ -9,6 +9,7 @@ #pragma once #include "duckdb.hpp" +#include "parquet_bss_decoder.hpp" #include "parquet_dbp_decoder.hpp" #include "parquet_rle_bp_decoder.hpp" #include "parquet_statistics.hpp" @@ -161,6 +162,7 @@ class ColumnReader { unique_ptr repeated_decoder; unique_ptr dbp_decoder; unique_ptr rle_decoder; + unique_ptr bss_decoder; // dummies for Skip() parquet_filter_t none_filter; diff --git a/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp b/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp new file mode 100644 index 00000000..b8cd8d11 --- /dev/null +++ b/src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// parquet_bss_decoder.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once +#include "parquet_types.h" +#include "resizable_buffer.hpp" + +namespace duckdb { + +/// Decoder for the Byte Stream Split encoding +class BssDecoder { +public: + /// Create a decoder object. buffer/buffer_len is the encoded data. + BssDecoder(data_ptr_t buffer, uint32_t buffer_len) : buffer_(buffer, buffer_len), value_offset_(0) { + } + +public: + template + void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) { + if (buffer_.len % sizeof(T) != 0) { + std::stringstream error; + error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len + << ") should be a multiple of the type size (" << sizeof(T) << ")"; + throw std::runtime_error(error.str()); + } + uint32_t num_buffer_values = buffer_.len / sizeof(T); + + buffer_.available((value_offset_ + batch_size) * sizeof(T)); + + for (uint32_t byte_offset = 0; byte_offset < sizeof(T); ++byte_offset) { + data_ptr_t input_bytes = buffer_.ptr + byte_offset * num_buffer_values + value_offset_; + for (uint32_t i = 0; i < batch_size; ++i) { + values_target_ptr[byte_offset + i * sizeof(T)] = *(input_bytes + i); + } + } + value_offset_ += batch_size; + } + +private: + ByteBuffer buffer_; + uint32_t value_offset_; +}; + +} // namespace duckdb diff --git a/src/duckdb/extension/parquet/parquet_extension.cpp b/src/duckdb/extension/parquet/parquet_extension.cpp index f2192bf1..e897645e 100644 --- a/src/duckdb/extension/parquet/parquet_extension.cpp +++ b/src/duckdb/extension/parquet/parquet_extension.cpp @@ -20,6 +20,8 @@ #include "duckdb/common/enums/file_compression_type.hpp" #include "duckdb/common/file_system.hpp" #include "duckdb/common/multi_file_reader.hpp" +#include "duckdb/common/serializer/deserializer.hpp" +#include "duckdb/common/serializer/serializer.hpp" #include "duckdb/common/types/chunk_collection.hpp" #include "duckdb/function/copy_function.hpp" #include "duckdb/function/table_function.hpp" @@ -34,8 +36,6 @@ #include "duckdb/planner/operator/logical_get.hpp" #include "duckdb/storage/statistics/base_statistics.hpp" #include "duckdb/storage/table/row_group.hpp" -#include "duckdb/common/serializer/serializer.hpp" -#include "duckdb/common/serializer/deserializer.hpp" #endif namespace duckdb { @@ -983,8 +983,7 @@ idx_t ParquetWriteDesiredBatchSize(ClientContext &context, FunctionData &bind_da //===--------------------------------------------------------------------===// unique_ptr ParquetScanReplacement(ClientContext &context, const string &table_name, ReplacementScanData *data) { - auto lower_name = StringUtil::Lower(table_name); - if (!StringUtil::EndsWith(lower_name, ".parquet") && !StringUtil::Contains(lower_name, ".parquet?")) { + if (!ReplacementScan::CanReplace(table_name, {"parquet"})) { return nullptr; } auto table_function = make_uniq(); diff --git a/src/duckdb/extension/parquet/parquet_timestamp.cpp b/src/duckdb/extension/parquet/parquet_timestamp.cpp index 08cf021a..ec3ee8b1 100644 --- a/src/duckdb/extension/parquet/parquet_timestamp.cpp +++ b/src/duckdb/extension/parquet/parquet_timestamp.cpp @@ -66,10 +66,9 @@ dtime_t ParquetIntToTimeNs(const int64_t &raw_time) { return Time::FromTimeNs(raw_time); } -dtime_tz_t ParquetIntToTimeTZ(const int64_t &raw_time) { - dtime_tz_t result; - result.bits = raw_time; - return result; +dtime_tz_t ParquetIntToTimeTZ(const int64_t &raw_micros) { + dtime_t t(raw_micros); + return dtime_tz_t(t, 0); } } // namespace duckdb diff --git a/src/duckdb/src/common/arrow/appender/list_data.cpp b/src/duckdb/src/common/arrow/appender/list_data.cpp index 57400fc7..50ff8068 100644 --- a/src/duckdb/src/common/arrow/appender/list_data.cpp +++ b/src/duckdb/src/common/arrow/appender/list_data.cpp @@ -69,10 +69,10 @@ void ArrowListData::Finalize(ArrowAppendData &append_data, const LogicalType &ty result->buffers[1] = append_data.main_buffer.data(); auto &child_type = ListType::GetChildType(type); - append_data.child_pointers.resize(1); + ArrowAppender::AddChildren(append_data, 1); result->children = append_data.child_pointers.data(); result->n_children = 1; - append_data.child_pointers[0] = ArrowAppender::FinalizeChild(child_type, *append_data.child_data[0]); + append_data.child_arrays[0] = *ArrowAppender::FinalizeChild(child_type, std::move(append_data.child_data[0])); } } // namespace duckdb diff --git a/src/duckdb/src/common/arrow/appender/map_data.cpp b/src/duckdb/src/common/arrow/appender/map_data.cpp index 90e99a9a..3bacf653 100644 --- a/src/duckdb/src/common/arrow/appender/map_data.cpp +++ b/src/duckdb/src/common/arrow/appender/map_data.cpp @@ -52,33 +52,38 @@ void ArrowMapData::Append(ArrowAppendData &append_data, Vector &input, idx_t fro void ArrowMapData::Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) { // set up the main map buffer + D_ASSERT(result); result->n_buffers = 2; result->buffers[1] = append_data.main_buffer.data(); // the main map buffer has a single child: a struct - append_data.child_pointers.resize(1); + ArrowAppender::AddChildren(append_data, 1); result->children = append_data.child_pointers.data(); result->n_children = 1; - append_data.child_pointers[0] = ArrowAppender::FinalizeChild(type, *append_data.child_data[0]); - // now that struct has two children: the key and the value type auto &struct_data = *append_data.child_data[0]; - auto &struct_result = append_data.child_pointers[0]; - struct_data.child_pointers.resize(2); + auto struct_result = ArrowAppender::FinalizeChild(type, std::move(append_data.child_data[0])); + + // Initialize the struct array data + const auto struct_child_count = 2; + ArrowAppender::AddChildren(struct_data, struct_child_count); + struct_result->children = struct_data.child_pointers.data(); struct_result->n_buffers = 1; - struct_result->n_children = 2; + struct_result->n_children = struct_child_count; struct_result->length = struct_data.child_data[0]->row_count; - struct_result->children = struct_data.child_pointers.data(); + + append_data.child_arrays[0] = *struct_result; D_ASSERT(struct_data.child_data[0]->row_count == struct_data.child_data[1]->row_count); auto &key_type = MapType::KeyType(type); auto &value_type = MapType::ValueType(type); - struct_data.child_pointers[0] = ArrowAppender::FinalizeChild(key_type, *struct_data.child_data[0]); - struct_data.child_pointers[1] = ArrowAppender::FinalizeChild(value_type, *struct_data.child_data[1]); + auto key_data = ArrowAppender::FinalizeChild(key_type, std::move(struct_data.child_data[0])); + struct_data.child_arrays[0] = *key_data; + struct_data.child_arrays[1] = *ArrowAppender::FinalizeChild(value_type, std::move(struct_data.child_data[1])); // keys cannot have null values - if (struct_data.child_pointers[0]->null_count > 0) { + if (key_data->null_count > 0) { throw std::runtime_error("Arrow doesn't accept NULL keys on Maps"); } } diff --git a/src/duckdb/src/common/arrow/appender/struct_data.cpp b/src/duckdb/src/common/arrow/appender/struct_data.cpp index b6c0972e..ce74a92a 100644 --- a/src/duckdb/src/common/arrow/appender/struct_data.cpp +++ b/src/duckdb/src/common/arrow/appender/struct_data.cpp @@ -33,12 +33,12 @@ void ArrowStructData::Finalize(ArrowAppendData &append_data, const LogicalType & result->n_buffers = 1; auto &child_types = StructType::GetChildTypes(type); - append_data.child_pointers.resize(child_types.size()); + ArrowAppender::AddChildren(append_data, child_types.size()); result->children = append_data.child_pointers.data(); result->n_children = child_types.size(); for (idx_t i = 0; i < child_types.size(); i++) { auto &child_type = child_types[i].second; - append_data.child_pointers[i] = ArrowAppender::FinalizeChild(child_type, *append_data.child_data[i]); + append_data.child_arrays[i] = *ArrowAppender::FinalizeChild(child_type, std::move(append_data.child_data[i])); } } diff --git a/src/duckdb/src/common/arrow/appender/union_data.cpp b/src/duckdb/src/common/arrow/appender/union_data.cpp index 0c52f80e..cfe54f89 100644 --- a/src/duckdb/src/common/arrow/appender/union_data.cpp +++ b/src/duckdb/src/common/arrow/appender/union_data.cpp @@ -58,12 +58,12 @@ void ArrowUnionData::Finalize(ArrowAppendData &append_data, const LogicalType &t result->buffers[1] = append_data.main_buffer.data(); auto &child_types = UnionType::CopyMemberTypes(type); - append_data.child_pointers.resize(child_types.size()); + ArrowAppender::AddChildren(append_data, child_types.size()); result->children = append_data.child_pointers.data(); result->n_children = child_types.size(); for (idx_t i = 0; i < child_types.size(); i++) { auto &child_type = child_types[i].second; - append_data.child_pointers[i] = ArrowAppender::FinalizeChild(child_type, *append_data.child_data[i]); + append_data.child_arrays[i] = *ArrowAppender::FinalizeChild(child_type, std::move(append_data.child_data[i])); } } diff --git a/src/duckdb/src/common/arrow/arrow_appender.cpp b/src/duckdb/src/common/arrow/arrow_appender.cpp index 18414f5b..10d1e39e 100644 --- a/src/duckdb/src/common/arrow/arrow_appender.cpp +++ b/src/duckdb/src/common/arrow/arrow_appender.cpp @@ -39,18 +39,31 @@ void ArrowAppender::ReleaseArray(ArrowArray *array) { if (!array || !array->release) { return; } - array->release = nullptr; auto holder = static_cast(array->private_data); + for (int64_t i = 0; i < array->n_children; i++) { + auto child = array->children[i]; + if (!child->release) { + // Child was moved out of the array + continue; + } + child->release(child); + D_ASSERT(!child->release); + } + if (array->dictionary && array->dictionary->release) { + array->dictionary->release(array->dictionary); + } + array->release = nullptr; delete holder; } //===--------------------------------------------------------------------===// // Finalize Arrow Child //===--------------------------------------------------------------------===// -ArrowArray *ArrowAppender::FinalizeChild(const LogicalType &type, ArrowAppendData &append_data) { +ArrowArray *ArrowAppender::FinalizeChild(const LogicalType &type, unique_ptr append_data_p) { auto result = make_uniq(); - result->private_data = nullptr; + auto &append_data = *append_data_p; + result->private_data = append_data_p.release(); result->release = ArrowAppender::ReleaseArray; result->n_children = 0; result->null_count = 0; @@ -75,7 +88,7 @@ ArrowArray ArrowAppender::Finalize() { auto root_holder = make_uniq(options); ArrowArray result; - root_holder->child_pointers.resize(types.size()); + AddChildren(*root_holder, types.size()); result.children = root_holder->child_pointers.data(); result.n_children = types.size(); @@ -88,10 +101,8 @@ ArrowArray ArrowAppender::Finalize() { result.dictionary = nullptr; root_holder->child_data = std::move(root_data); - // FIXME: this violates a property of the arrow format, if root owns all the child memory then consumers can't move - // child arrays https://arrow.apache.org/docs/format/CDataInterface.html#moving-child-arrays for (idx_t i = 0; i < root_holder->child_data.size(); i++) { - root_holder->child_pointers[i] = ArrowAppender::FinalizeChild(types[i], *root_holder->child_data[i]); + root_holder->child_arrays[i] = *ArrowAppender::FinalizeChild(types[i], std::move(root_holder->child_data[i])); } // Release ownership to caller @@ -238,4 +249,12 @@ unique_ptr ArrowAppender::InitializeChild(const LogicalType &ty return result; } +void ArrowAppender::AddChildren(ArrowAppendData &data, idx_t count) { + data.child_pointers.resize(count); + data.child_arrays.resize(count); + for (idx_t i = 0; i < count; i++) { + data.child_pointers[i] = &data.child_arrays[i]; + } +} + } // namespace duckdb diff --git a/src/duckdb/src/common/arrow/arrow_wrapper.cpp b/src/duckdb/src/common/arrow/arrow_wrapper.cpp index 68170c96..5a39f81b 100644 --- a/src/duckdb/src/common/arrow/arrow_wrapper.cpp +++ b/src/duckdb/src/common/arrow/arrow_wrapper.cpp @@ -16,21 +16,21 @@ namespace duckdb { ArrowSchemaWrapper::~ArrowSchemaWrapper() { if (arrow_schema.release) { arrow_schema.release(&arrow_schema); - arrow_schema.release = nullptr; + D_ASSERT(!arrow_schema.release); } } ArrowArrayWrapper::~ArrowArrayWrapper() { if (arrow_array.release) { arrow_array.release(&arrow_array); - arrow_array.release = nullptr; + D_ASSERT(!arrow_array.release); } } ArrowArrayStreamWrapper::~ArrowArrayStreamWrapper() { if (arrow_array_stream.release) { arrow_array_stream.release(&arrow_array_stream); - arrow_array_stream.release = nullptr; + D_ASSERT(!arrow_array_stream.release); } } diff --git a/src/duckdb/src/common/enum_util.cpp b/src/duckdb/src/common/enum_util.cpp index 305d09d5..2d397936 100644 --- a/src/duckdb/src/common/enum_util.cpp +++ b/src/duckdb/src/common/enum_util.cpp @@ -68,7 +68,7 @@ #include "duckdb/execution/index/art/node.hpp" #include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp" #include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp" -#include "duckdb/execution/operator/scan/csv/csv_state_machine.hpp" +#include "duckdb/execution/operator/scan/csv/csv_state.hpp" #include "duckdb/execution/operator/scan/csv/quote_rules.hpp" #include "duckdb/function/aggregate_state.hpp" #include "duckdb/function/function.hpp" diff --git a/src/duckdb/src/common/exception.cpp b/src/duckdb/src/common/exception.cpp index 68e761ea..882d47df 100644 --- a/src/duckdb/src/common/exception.cpp +++ b/src/duckdb/src/common/exception.cpp @@ -1,5 +1,4 @@ #include "duckdb/common/exception.hpp" - #include "duckdb/common/string_util.hpp" #include "duckdb/common/to_string.hpp" #include "duckdb/common/types.hpp" @@ -82,91 +81,68 @@ string Exception::ConstructMessageRecursive(const string &msg, std::vector= raw_message.size()) { + // Not enough characters afterward, bail out + return; + } + string err = raw_message.substr(0, position_semicolon); + string msg = raw_message.substr(position_semicolon + 2); + if (err.size() > 6 && err.substr(err.size() - 6) == " Error" && !msg.empty()) { + ExceptionType new_type = Exception::StringToExceptionType(err.substr(0, err.size() - 6)); + if (new_type != type) { + type = new_type; + raw_message = msg; + } + } } const string &PreservedError::Message() { diff --git a/src/duckdb/src/common/serializer/binary_deserializer.cpp b/src/duckdb/src/common/serializer/binary_deserializer.cpp index 86d9638b..6b31146c 100644 --- a/src/duckdb/src/common/serializer/binary_deserializer.cpp +++ b/src/duckdb/src/common/serializer/binary_deserializer.cpp @@ -8,7 +8,8 @@ namespace duckdb { void BinaryDeserializer::OnPropertyBegin(const field_id_t field_id, const char *) { auto field = NextField(); if (field != field_id) { - throw InternalException("Failed to deserialize: field id mismatch, expected: %d, got: %d", field_id, field); + throw SerializationException("Failed to deserialize: field id mismatch, expected: %d, got: %d", field_id, + field); } } @@ -34,7 +35,8 @@ void BinaryDeserializer::OnObjectBegin() { void BinaryDeserializer::OnObjectEnd() { auto next_field = NextField(); if (next_field != MESSAGE_TERMINATOR_FIELD_ID) { - throw InternalException("Failed to deserialize: expected end of object, but found field id: %d", next_field); + throw SerializationException("Failed to deserialize: expected end of object, but found field id: %d", + next_field); } nesting_level--; } diff --git a/src/duckdb/src/common/types/data_chunk.cpp b/src/duckdb/src/common/types/data_chunk.cpp index 4c7e16a6..a734f086 100644 --- a/src/duckdb/src/common/types/data_chunk.cpp +++ b/src/duckdb/src/common/types/data_chunk.cpp @@ -64,7 +64,7 @@ void DataChunk::InitializeEmpty(vector::const_iterator begin, vecto } void DataChunk::Reset() { - if (data.empty()) { + if (data.empty() || vector_caches.empty()) { return; } if (vector_caches.size() != data.size()) { diff --git a/src/duckdb/src/core_functions/scalar/map/map.cpp b/src/duckdb/src/core_functions/scalar/map/map.cpp index b4c5669a..c56d15f9 100644 --- a/src/duckdb/src/core_functions/scalar/map/map.cpp +++ b/src/duckdb/src/core_functions/scalar/map/map.cpp @@ -87,11 +87,24 @@ static bool ListEntriesEqual(Vector &keys, Vector &values, idx_t count) { return true; } +static list_entry_t *GetBiggestList(Vector &key, Vector &value, idx_t &size) { + auto key_size = ListVector::GetListSize(key); + auto value_size = ListVector::GetListSize(value); + if (key_size > value_size) { + size = key_size; + return ListVector::GetData(key); + } + size = value_size; + return ListVector::GetData(value); +} + static void MapFunction(DataChunk &args, ExpressionState &state, Vector &result) { D_ASSERT(result.GetType().id() == LogicalTypeId::MAP); - auto &key_vector = MapVector::GetKeys(result); - auto &value_vector = MapVector::GetValues(result); + auto count = args.size(); + + auto &map_key_vector = MapVector::GetKeys(result); + auto &map_value_vector = MapVector::GetValues(result); auto result_data = ListVector::GetData(result); result.SetVectorType(VectorType::CONSTANT_VECTOR); @@ -99,52 +112,73 @@ static void MapFunction(DataChunk &args, ExpressionState &state, Vector &result) ListVector::SetListSize(result, 0); result_data->offset = 0; result_data->length = 0; - result.Verify(args.size()); + result.Verify(count); return; } - bool keys_are_const = args.data[0].GetVectorType() == VectorType::CONSTANT_VECTOR; - bool values_are_const = args.data[1].GetVectorType() == VectorType::CONSTANT_VECTOR; - if (!keys_are_const || !values_are_const) { - result.SetVectorType(VectorType::FLAT_VECTOR); + D_ASSERT(args.ColumnCount() == 2); + auto &key_vector = args.data[0]; + auto &value_vector = args.data[1]; + + if (args.AllConstant()) { + auto key_data = ListVector::GetData(key_vector); + auto value_data = ListVector::GetData(value_vector); + auto key_entry = key_data[0]; + auto value_entry = value_data[0]; + if (key_entry != value_entry) { + throw BinderException("Key and value list sizes don't match"); + } + result_data[0] = key_entry; + ListVector::SetListSize(result, ListVector::GetListSize(key_vector)); + map_key_vector.Reference(ListVector::GetEntry(key_vector)); + map_value_vector.Reference(ListVector::GetEntry(value_vector)); + MapVector::MapConversionVerify(result, count); + result.Verify(count); + return; } - auto key_count = ListVector::GetListSize(args.data[0]); - auto value_count = ListVector::GetListSize(args.data[1]); - auto key_data = ListVector::GetData(args.data[0]); - auto value_data = ListVector::GetData(args.data[1]); - auto src_data = key_data; - - if (keys_are_const && !values_are_const) { - AlignVectorToReference(args.data[0], args.data[1], args.size(), key_vector); - src_data = value_data; - } else if (values_are_const && !keys_are_const) { - AlignVectorToReference(args.data[1], args.data[0], args.size(), value_vector); + result.SetVectorType(VectorType::FLAT_VECTOR); + + if (key_vector.GetVectorType() == VectorType::CONSTANT_VECTOR) { + D_ASSERT(value_vector.GetVectorType() != VectorType::CONSTANT_VECTOR); + Vector expanded_const(ListType::GetChildType(key_vector.GetType()), count); + AlignVectorToReference(key_vector, value_vector, count, expanded_const); + map_key_vector.Reference(expanded_const); + + value_vector.Flatten(count); + map_value_vector.Reference(ListVector::GetEntry(value_vector)); + } else if (value_vector.GetVectorType() == VectorType::CONSTANT_VECTOR) { + D_ASSERT(key_vector.GetVectorType() != VectorType::CONSTANT_VECTOR); + Vector expanded_const(ListType::GetChildType(value_vector.GetType()), count); + AlignVectorToReference(value_vector, key_vector, count, expanded_const); + map_value_vector.Reference(expanded_const); + + key_vector.Flatten(count); + map_key_vector.Reference(ListVector::GetEntry(key_vector)); } else { - if (!ListEntriesEqual(args.data[0], args.data[1], args.size())) { + key_vector.Flatten(count); + value_vector.Flatten(count); + + if (!ListEntriesEqual(key_vector, value_vector, count)) { throw InvalidInputException("Error in MAP creation: key list and value list do not align. i.e. different " "size or incompatible structure"); } + + map_value_vector.Reference(ListVector::GetEntry(value_vector)); + map_key_vector.Reference(ListVector::GetEntry(key_vector)); } - ListVector::SetListSize(result, MaxValue(key_count, value_count)); + idx_t list_size; + auto src_data = GetBiggestList(key_vector, value_vector, list_size); + ListVector::SetListSize(result, list_size); result_data = ListVector::GetData(result); - for (idx_t i = 0; i < args.size(); i++) { + for (idx_t i = 0; i < count; i++) { result_data[i] = src_data[i]; } - // check whether one of the vectors has already been referenced to an expanded vector in the case of const/non-const - // combination. If not, then referencing is still necessary - if (!(keys_are_const && !values_are_const)) { - key_vector.Reference(ListVector::GetEntry(args.data[0])); - } - if (!(values_are_const && !keys_are_const)) { - value_vector.Reference(ListVector::GetEntry(args.data[1])); - } - - MapVector::MapConversionVerify(result, args.size()); - result.Verify(args.size()); + MapVector::MapConversionVerify(result, count); + result.Verify(count); } static unique_ptr MapBind(ClientContext &context, ScalarFunction &bound_function, diff --git a/src/duckdb/src/execution/expression_executor/execute_reference.cpp b/src/duckdb/src/execution/expression_executor/execute_reference.cpp index 88fdfa63..4dac1539 100644 --- a/src/duckdb/src/execution/expression_executor/execute_reference.cpp +++ b/src/duckdb/src/execution/expression_executor/execute_reference.cpp @@ -6,7 +6,7 @@ namespace duckdb { unique_ptr ExpressionExecutor::InitializeState(const BoundReferenceExpression &expr, ExpressionExecutorState &root) { auto result = make_uniq(expr, root); - result->Finalize(); + result->Finalize(true); return result; } diff --git a/src/duckdb/src/execution/expression_executor_state.cpp b/src/duckdb/src/execution/expression_executor_state.cpp index 401e7615..da5acfa6 100644 --- a/src/duckdb/src/execution/expression_executor_state.cpp +++ b/src/duckdb/src/execution/expression_executor_state.cpp @@ -1,4 +1,5 @@ #include "duckdb/execution/expression_executor_state.hpp" + #include "duckdb/execution/expression_executor.hpp" #include "duckdb/planner/expression.hpp" #include "duckdb/planner/expression/bound_function_expression.hpp" @@ -10,8 +11,13 @@ void ExpressionState::AddChild(Expression *expr) { child_states.push_back(ExpressionExecutor::InitializeState(*expr, root)); } -void ExpressionState::Finalize() { - if (!types.empty()) { +void ExpressionState::Finalize(bool empty) { + if (types.empty()) { + return; + } + if (empty) { + intermediate_chunk.InitializeEmpty(types); + } else { intermediate_chunk.Initialize(GetAllocator(), types); } } diff --git a/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp b/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp index 55c9494c..19e02da1 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/buffered_csv_reader.cpp @@ -192,6 +192,7 @@ void BufferedCSVReader::ParseCSV(ParserMode mode) { } bool BufferedCSVReader::TryParseCSV(ParserMode parser_mode, DataChunk &insert_chunk, string &error_message) { + cached_buffers.clear(); mode = parser_mode; // used for parsing algorithm bool finished_chunk = false; @@ -427,7 +428,6 @@ add_row : { Flush(insert_chunk); } - end_of_file_reached = true; return true; } diff --git a/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp b/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp index 4cf52f2b..3129048e 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/csv_state_machine_cache.cpp @@ -3,8 +3,8 @@ namespace duckdb { -void InitializeTransitionArray(unsigned char *transition_array, const uint8_t state) { - for (uint32_t i = 0; i < NUM_TRANSITIONS; i++) { +void InitializeTransitionArray(CSVState *transition_array, const CSVState state) { + for (uint32_t i = 0; i < StateMachine::NUM_TRANSITIONS; i++) { transition_array[i] = state; } } @@ -13,72 +13,65 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op D_ASSERT(state_machine_cache.find(state_machine_options) == state_machine_cache.end()); // Initialize transition array with default values to the Standard option auto &transition_array = state_machine_cache[state_machine_options]; - const uint8_t standard_state = static_cast(CSVState::STANDARD); - const uint8_t field_separator_state = static_cast(CSVState::DELIMITER); - const uint8_t record_separator_state = static_cast(CSVState::RECORD_SEPARATOR); - const uint8_t carriage_return_state = static_cast(CSVState::CARRIAGE_RETURN); - const uint8_t quoted_state = static_cast(CSVState::QUOTED); - const uint8_t unquoted_state = static_cast(CSVState::UNQUOTED); - const uint8_t escape_state = static_cast(CSVState::ESCAPE); - const uint8_t empty_line_state = static_cast(CSVState::EMPTY_LINE); - const uint8_t invalid_state = static_cast(CSVState::INVALID); - for (uint32_t i = 0; i < NUM_STATES; i++) { - switch (i) { - case quoted_state: - InitializeTransitionArray(transition_array[i], quoted_state); + for (uint32_t i = 0; i < StateMachine::NUM_STATES; i++) { + CSVState cur_state = CSVState(i); + switch (cur_state) { + case CSVState::QUOTED: + InitializeTransitionArray(transition_array[cur_state], CSVState::QUOTED); break; - case unquoted_state: - case invalid_state: - case escape_state: - InitializeTransitionArray(transition_array[i], invalid_state); + case CSVState::UNQUOTED: + case CSVState::INVALID: + case CSVState::ESCAPE: + InitializeTransitionArray(transition_array[cur_state], CSVState::INVALID); break; default: - InitializeTransitionArray(transition_array[i], standard_state); + InitializeTransitionArray(transition_array[cur_state], CSVState::STANDARD); break; } } // Now set values depending on configuration // 1) Standard State - transition_array[standard_state][static_cast(state_machine_options.delimiter)] = field_separator_state; - transition_array[standard_state][static_cast('\n')] = record_separator_state; - transition_array[standard_state][static_cast('\r')] = carriage_return_state; - transition_array[standard_state][static_cast(state_machine_options.quote)] = quoted_state; + transition_array[CSVState::STANDARD][static_cast(state_machine_options.delimiter)] = CSVState::DELIMITER; + transition_array[CSVState::STANDARD][static_cast('\n')] = CSVState::RECORD_SEPARATOR; + transition_array[CSVState::STANDARD][static_cast('\r')] = CSVState::CARRIAGE_RETURN; + transition_array[CSVState::STANDARD][static_cast(state_machine_options.quote)] = CSVState::QUOTED; // 2) Field Separator State - transition_array[field_separator_state][static_cast(state_machine_options.delimiter)] = - field_separator_state; - transition_array[field_separator_state][static_cast('\n')] = record_separator_state; - transition_array[field_separator_state][static_cast('\r')] = carriage_return_state; - transition_array[field_separator_state][static_cast(state_machine_options.quote)] = quoted_state; + transition_array[CSVState::DELIMITER][static_cast(state_machine_options.delimiter)] = CSVState::DELIMITER; + transition_array[CSVState::DELIMITER][static_cast('\n')] = CSVState::RECORD_SEPARATOR; + transition_array[CSVState::DELIMITER][static_cast('\r')] = CSVState::CARRIAGE_RETURN; + transition_array[CSVState::DELIMITER][static_cast(state_machine_options.quote)] = CSVState::QUOTED; // 3) Record Separator State - transition_array[record_separator_state][static_cast(state_machine_options.delimiter)] = - field_separator_state; - transition_array[record_separator_state][static_cast('\n')] = empty_line_state; - transition_array[record_separator_state][static_cast('\r')] = empty_line_state; - transition_array[record_separator_state][static_cast(state_machine_options.quote)] = quoted_state; + transition_array[CSVState::RECORD_SEPARATOR][static_cast(state_machine_options.delimiter)] = + CSVState::DELIMITER; + transition_array[CSVState::RECORD_SEPARATOR][static_cast('\n')] = CSVState::EMPTY_LINE; + transition_array[CSVState::RECORD_SEPARATOR][static_cast('\r')] = CSVState::EMPTY_LINE; + transition_array[CSVState::RECORD_SEPARATOR][static_cast(state_machine_options.quote)] = CSVState::QUOTED; // 4) Carriage Return State - transition_array[carriage_return_state][static_cast('\n')] = record_separator_state; - transition_array[carriage_return_state][static_cast('\r')] = empty_line_state; - transition_array[carriage_return_state][static_cast(state_machine_options.escape)] = escape_state; + transition_array[CSVState::CARRIAGE_RETURN][static_cast('\n')] = CSVState::RECORD_SEPARATOR; + transition_array[CSVState::CARRIAGE_RETURN][static_cast('\r')] = CSVState::EMPTY_LINE; + transition_array[CSVState::CARRIAGE_RETURN][static_cast(state_machine_options.escape)] = CSVState::ESCAPE; // 5) Quoted State - transition_array[quoted_state][static_cast(state_machine_options.quote)] = unquoted_state; + transition_array[CSVState::QUOTED][static_cast(state_machine_options.quote)] = CSVState::UNQUOTED; if (state_machine_options.quote != state_machine_options.escape) { - transition_array[quoted_state][static_cast(state_machine_options.escape)] = escape_state; + transition_array[CSVState::QUOTED][static_cast(state_machine_options.escape)] = CSVState::ESCAPE; } // 6) Unquoted State - transition_array[unquoted_state][static_cast('\n')] = record_separator_state; - transition_array[unquoted_state][static_cast('\r')] = carriage_return_state; - transition_array[unquoted_state][static_cast(state_machine_options.delimiter)] = field_separator_state; + transition_array[CSVState::UNQUOTED][static_cast('\n')] = CSVState::RECORD_SEPARATOR; + transition_array[CSVState::UNQUOTED][static_cast('\r')] = CSVState::CARRIAGE_RETURN; + transition_array[CSVState::UNQUOTED][static_cast(state_machine_options.delimiter)] = CSVState::DELIMITER; if (state_machine_options.quote == state_machine_options.escape) { - transition_array[unquoted_state][static_cast(state_machine_options.escape)] = quoted_state; + transition_array[CSVState::UNQUOTED][static_cast(state_machine_options.escape)] = CSVState::QUOTED; } // 7) Escaped State - transition_array[escape_state][static_cast(state_machine_options.quote)] = quoted_state; - transition_array[escape_state][static_cast(state_machine_options.escape)] = quoted_state; + transition_array[CSVState::ESCAPE][static_cast(state_machine_options.quote)] = CSVState::QUOTED; + transition_array[CSVState::ESCAPE][static_cast(state_machine_options.escape)] = CSVState::QUOTED; // 8) Empty Line State - transition_array[empty_line_state][static_cast('\r')] = empty_line_state; - transition_array[empty_line_state][static_cast('\n')] = empty_line_state; + transition_array[CSVState::EMPTY_LINE][static_cast('\r')] = CSVState::EMPTY_LINE; + transition_array[CSVState::EMPTY_LINE][static_cast('\n')] = CSVState::EMPTY_LINE; + transition_array[CSVState::EMPTY_LINE][static_cast(state_machine_options.delimiter)] = CSVState::DELIMITER; + transition_array[CSVState::EMPTY_LINE][static_cast(state_machine_options.quote)] = CSVState::QUOTED; } CSVStateMachineCache::CSVStateMachineCache() { @@ -95,7 +88,7 @@ CSVStateMachineCache::CSVStateMachineCache() { } } -const state_machine_t &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) { +const StateMachine &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) { //! Custom State Machine, we need to create it and cache it first if (state_machine_cache.find(state_machine_options) == state_machine_cache.end()) { Insert(state_machine_options); diff --git a/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp b/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp index 73fe6726..88d90765 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/parallel_csv_reader.cpp @@ -49,11 +49,12 @@ bool ParallelCSVReader::NewLineDelimiter(bool carry, bool carry_followed_by_nl, return (carry && carry_followed_by_nl) || (!carry && first_char); } -void ParallelCSVReader::SkipEmptyLines() { +bool ParallelCSVReader::SkipEmptyLines() { + const idx_t initial_position_buffer = position_buffer; idx_t new_pos_buffer = position_buffer; if (parse_chunk.data.size() == 1) { // Empty lines are null data. - return; + return initial_position_buffer != position_buffer; } for (; new_pos_buffer < end_buffer; new_pos_buffer++) { if (StringUtil::CharacterIsNewline((*buffer)[new_pos_buffer])) { @@ -63,13 +64,14 @@ void ParallelCSVReader::SkipEmptyLines() { position_buffer++; } if (new_pos_buffer > end_buffer) { - return; + return initial_position_buffer != position_buffer; } position_buffer = new_pos_buffer; } else if ((*buffer)[new_pos_buffer] != ' ') { - return; + return initial_position_buffer != position_buffer; } } + return initial_position_buffer != position_buffer; } bool ParallelCSVReader::SetPosition() { @@ -185,7 +187,6 @@ bool ParallelCSVReader::SetPosition() { } // Ensure that parse_chunk has no gunk when trying to figure new line parse_chunk.Reset(); - verification_positions.end_of_last_line = position_buffer; finished = false; return successfully_read_first_line; @@ -288,7 +289,7 @@ bool ParallelCSVReader::TryParseSimpleCSV(DataChunk &insert_chunk, string &error idx_t column = 0; idx_t offset = 0; bool has_quotes = false; - + bool last_line_empty = false; vector escape_positions; if ((start_buffer == buffer->buffer_start || start_buffer == buffer->buffer_end) && !try_add_line) { // First time reading this buffer piece @@ -454,7 +455,10 @@ add_row : { if (!BufferRemainder()) { goto final_state; } - SkipEmptyLines(); + if (SkipEmptyLines() && reached_remainder_state) { + last_line_empty = true; + goto final_state; + } if (position_buffer - verification_positions.end_of_last_line > options.buffer_size) { error_message = "Line does not fit in one buffer. Increase the buffer size."; return false; @@ -583,8 +587,8 @@ final_state : { return true; } // If this is the last buffer, we have to read the last value - if (buffer->buffer->is_last_buffer || !buffer->next_buffer || - (buffer->next_buffer && buffer->next_buffer->is_last_buffer)) { + if (!last_line_empty && (buffer->buffer->is_last_buffer || !buffer->next_buffer || + (buffer->next_buffer && buffer->next_buffer->is_last_buffer))) { if (column > 0 || start_buffer != position_buffer || try_add_line || (insert_chunk.data.size() == 1 && start_buffer != position_buffer)) { // remaining values to be added to the chunk diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp index cc3fc947..59b68baf 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp @@ -22,30 +22,9 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr } } -SnifferResult CSVSniffer::SniffCSV() { - // 1. Dialect Detection - DetectDialect(); - if (explicit_set_columns) { - if (!candidates.empty()) { - options.dialect_options.state_machine_options = candidates[0]->dialect_options.state_machine_options; - options.dialect_options.new_line = candidates[0]->dialect_options.new_line; - } - // We do not need to run type and header detection as these were defined by the user - return SnifferResult(detected_types, names); - } - // 2. Type Detection - DetectTypes(); - // 3. Header Detection - DetectHeader(); - D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size()); - // 4. Type Replacement - ReplaceTypes(); - // 5. Type Refinement - RefineTypes(); - // We are done, construct and return the result. - - // Set the CSV Options in the reference +void CSVSniffer::SetResultOptions() { options.dialect_options = best_candidate->dialect_options; + options.dialect_options.new_line = best_candidate->dialect_options.new_line; options.has_header = best_candidate->dialect_options.header; options.skip_rows_set = options.dialect_options.skip_rows > 0; if (options.has_header) { @@ -53,8 +32,27 @@ SnifferResult CSVSniffer::SniffCSV() { } else { options.dialect_options.true_start = best_start_without_header; } +} - // Return the types and names +SnifferResult CSVSniffer::SniffCSV() { + // 1. Dialect Detection + DetectDialect(); + // 2. Type Detection + DetectTypes(); + // 3. Type Refinement + RefineTypes(); + // 4. Header Detection + DetectHeader(); + if (explicit_set_columns) { + SetResultOptions(); + // We do not need to run type refinement, since the types have been given by the user + return SnifferResult({}, {}); + } + // 5. Type Replacement + ReplaceTypes(); + D_ASSERT(best_sql_types_candidates_per_column_idx.size() == names.size()); + // We are done, Set the CSV Options in the reference. Construct and return the result. + SetResultOptions(); return SnifferResult(detected_types, names); } diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp index add96c2d..85234f06 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp @@ -5,9 +5,9 @@ namespace duckdb { struct SniffDialect { inline static void Initialize(CSVStateMachine &machine) { - machine.state = CSVState::STANDARD; - machine.previous_state = CSVState::STANDARD; - machine.pre_previous_state = CSVState::STANDARD; + machine.state = CSVState::EMPTY_LINE; + machine.previous_state = CSVState::EMPTY_LINE; + machine.pre_previous_state = CSVState::EMPTY_LINE; machine.cur_rows = 0; machine.column_count = 1; } @@ -21,17 +21,12 @@ struct SniffDialect { sniffed_column_counts.clear(); return true; } - machine.pre_previous_state = machine.previous_state; - machine.previous_state = machine.state; - - machine.state = static_cast( - machine.transition_array[static_cast(machine.state)][static_cast(current_char)]); + machine.Transition(current_char); bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN; machine.column_count += machine.previous_state == CSVState::DELIMITER; sniffed_column_counts[machine.cur_rows] = machine.column_count; - machine.cur_rows += - machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE; + machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR; machine.column_count -= (machine.column_count - 1) * (machine.previous_state == CSVState::RECORD_SEPARATOR); // It means our carriage return is actually a record separator @@ -304,7 +299,7 @@ void CSVSniffer::DetectDialect() { unordered_map> quote_candidates_map; // Candidates for the escape option unordered_map> escape_candidates_map; - escape_candidates_map[(uint8_t)QuoteRule::QUOTES_RFC] = {'\0', '\"', '\''}; + escape_candidates_map[(uint8_t)QuoteRule::QUOTES_RFC] = {'\"', '\'', '\0'}; escape_candidates_map[(uint8_t)QuoteRule::QUOTES_OTHER] = {'\\'}; escape_candidates_map[(uint8_t)QuoteRule::NO_QUOTES] = {'\0'}; // Number of rows read diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp index 152f9baf..a23af7f5 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/header_detection.cpp @@ -97,9 +97,14 @@ void CSVSniffer::DetectHeader() { bool first_row_consistent = true; // check if header row is all null and/or consistent with detected column data types bool first_row_nulls = true; - // This case will fail in dialect detection, so we assert here just for sanity - D_ASSERT(best_candidate->options.null_padding || - best_sql_types_candidates_per_column_idx.size() == best_header_row.size()); + // If null-padding is not allowed and there is a mismatch between our header candidate and the number of columns + // We can't detect the dialect/type options properly + if (!best_candidate->options.null_padding && + best_sql_types_candidates_per_column_idx.size() != best_header_row.size()) { + throw InvalidInputException( + "Error in file \"%s\": CSV options could not be auto-detected. Consider setting parser options manually.", + options.file_path); + } for (idx_t col = 0; col < best_header_row.size(); col++) { auto dummy_val = best_header_row[col]; if (!dummy_val.IsNull()) { diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp index c7a300cc..c22eb906 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp @@ -143,20 +143,17 @@ struct SniffValue { machine.rows_read++; } - if ((machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) || + if ((machine.previous_state == CSVState::RECORD_SEPARATOR) || (machine.state != CSVState::RECORD_SEPARATOR && machine.previous_state == CSVState::CARRIAGE_RETURN)) { sniffed_values[machine.cur_rows].position = machine.line_start_pos; sniffed_values[machine.cur_rows].set = true; machine.line_start_pos = current_pos; } - machine.pre_previous_state = machine.previous_state; - machine.previous_state = machine.state; - machine.state = static_cast( - machine.transition_array[static_cast(machine.state)][static_cast(current_char)]); + + machine.Transition(current_char); bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN; - if (machine.previous_state == CSVState::DELIMITER || - (machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) || + if (machine.previous_state == CSVState::DELIMITER || (machine.previous_state == CSVState::RECORD_SEPARATOR) || (machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) { // Started a new value // Check if it's UTF-8 @@ -175,8 +172,7 @@ struct SniffValue { (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) { machine.value += current_char; } - machine.cur_rows += - machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE; + machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR; // It means our carriage return is actually a record separator machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return; if (machine.cur_rows >= sniffed_values.size()) { diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp index 66f2547a..8500f68f 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_refinement.cpp @@ -3,9 +3,9 @@ namespace duckdb { struct Parse { inline static void Initialize(CSVStateMachine &machine) { - machine.state = CSVState::STANDARD; - machine.previous_state = CSVState::STANDARD; - machine.pre_previous_state = CSVState::STANDARD; + machine.state = CSVState::EMPTY_LINE; + machine.previous_state = CSVState::EMPTY_LINE; + machine.pre_previous_state = CSVState::EMPTY_LINE; machine.cur_rows = 0; machine.column_count = 0; @@ -14,22 +14,18 @@ struct Parse { inline static bool Process(CSVStateMachine &machine, DataChunk &parse_chunk, char current_char, idx_t current_pos) { - machine.pre_previous_state = machine.previous_state; - machine.previous_state = machine.state; - machine.state = static_cast( - machine.transition_array[static_cast(machine.state)][static_cast(current_char)]); + machine.Transition(current_char); bool carriage_return = machine.previous_state == CSVState::CARRIAGE_RETURN; - if (machine.previous_state == CSVState::DELIMITER || - (machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE) || + if (machine.previous_state == CSVState::DELIMITER || (machine.previous_state == CSVState::RECORD_SEPARATOR) || (machine.state != CSVState::RECORD_SEPARATOR && carriage_return)) { // Started a new value // Check if it's UTF-8 (Or not?) machine.VerifyUTF8(); auto &v = parse_chunk.data[machine.column_count++]; auto parse_data = FlatVector::GetData(v); - auto &validity_mask = FlatVector::Validity(v); if (machine.value.empty()) { + auto &validity_mask = FlatVector::Validity(v); validity_mask.SetInvalid(machine.cur_rows); } else { parse_data[machine.cur_rows] = StringVector::AddStringOrBlob(v, string_t(machine.value)); @@ -50,12 +46,11 @@ struct Parse { (machine.state == CSVState::QUOTED && machine.previous_state == CSVState::QUOTED)) { machine.value += current_char; } - machine.cur_rows += - machine.previous_state == CSVState::RECORD_SEPARATOR && machine.state != CSVState::EMPTY_LINE; + machine.cur_rows += machine.previous_state == CSVState::RECORD_SEPARATOR && machine.column_count > 0; machine.column_count -= machine.column_count * (machine.previous_state == CSVState::RECORD_SEPARATOR); // It means our carriage return is actually a record separator - machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return; + machine.cur_rows += machine.state != CSVState::RECORD_SEPARATOR && carriage_return && machine.column_count > 0; machine.column_count -= machine.column_count * (machine.state != CSVState::RECORD_SEPARATOR && carriage_return); if (machine.cur_rows >= STANDARD_VECTOR_SIZE) { diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp index 41988082..2d0685a3 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_replacement.cpp @@ -14,7 +14,7 @@ void CSVSniffer::ReplaceTypes() { for (idx_t i = 0; i < names.size(); i++) { auto it = best_candidate->options.sql_types_per_column.find(names[i]); if (it != best_candidate->options.sql_types_per_column.end()) { - best_sql_types_candidates_per_column_idx[i] = {best_candidate->options.sql_type_list[it->second]}; + detected_types[i] = best_candidate->options.sql_type_list[it->second]; found++; } } @@ -33,7 +33,7 @@ void CSVSniffer::ReplaceTypes() { best_candidate->options.sql_type_list.size(), names.size()); } for (idx_t i = 0; i < best_candidate->options.sql_type_list.size(); i++) { - best_sql_types_candidates_per_column_idx[i] = {best_candidate->options.sql_type_list[i]}; + detected_types[i] = best_candidate->options.sql_type_list[i]; } } } // namespace duckdb diff --git a/src/duckdb/src/execution/operator/helper/physical_reset.cpp b/src/duckdb/src/execution/operator/helper/physical_reset.cpp index 6fd3b9f3..916670cf 100644 --- a/src/duckdb/src/execution/operator/helper/physical_reset.cpp +++ b/src/duckdb/src/execution/operator/helper/physical_reset.cpp @@ -21,10 +21,7 @@ void PhysicalReset::ResetExtensionVariable(ExecutionContext &context, DBConfig & SourceResultType PhysicalReset::GetData(ExecutionContext &context, DataChunk &chunk, OperatorSourceInput &input) const { auto &config = DBConfig::GetConfig(context.client); - if (config.options.lock_configuration) { - throw InvalidInputException("Cannot reset configuration option \"%s\" - the configuration has been locked", - name); - } + config.CheckLock(name); auto option = DBConfig::GetOptionByName(name); if (!option) { // check if this is an extra extension variable diff --git a/src/duckdb/src/execution/operator/helper/physical_set.cpp b/src/duckdb/src/execution/operator/helper/physical_set.cpp index 8153ea54..4fe86880 100644 --- a/src/duckdb/src/execution/operator/helper/physical_set.cpp +++ b/src/duckdb/src/execution/operator/helper/physical_set.cpp @@ -24,10 +24,8 @@ void PhysicalSet::SetExtensionVariable(ClientContext &context, ExtensionOption & SourceResultType PhysicalSet::GetData(ExecutionContext &context, DataChunk &chunk, OperatorSourceInput &input) const { auto &config = DBConfig::GetConfig(context.client); - if (config.options.lock_configuration) { - throw InvalidInputException("Cannot change configuration option \"%s\" - the configuration has been locked", - name); - } + // check if we are allowed to change the configuration option + config.CheckLock(name); auto option = DBConfig::GetOptionByName(name); if (!option) { // check if this is an extra extension variable diff --git a/src/duckdb/src/execution/perfect_aggregate_hashtable.cpp b/src/duckdb/src/execution/perfect_aggregate_hashtable.cpp index 826b1d55..a3a6bc99 100644 --- a/src/duckdb/src/execution/perfect_aggregate_hashtable.cpp +++ b/src/duckdb/src/execution/perfect_aggregate_hashtable.cpp @@ -298,12 +298,10 @@ void PerfectAggregateHashTable::Destroy() { RowOperationsState row_state(*aggregate_allocator); data_ptr_t payload_ptr = data; for (idx_t i = 0; i < total_groups; i++) { - if (group_is_set[i]) { - data_pointers[count++] = payload_ptr; - if (count == STANDARD_VECTOR_SIZE) { - RowOperations::DestroyStates(row_state, layout, addresses, count); - count = 0; - } + data_pointers[count++] = payload_ptr; + if (count == STANDARD_VECTOR_SIZE) { + RowOperations::DestroyStates(row_state, layout, addresses, count); + count = 0; } payload_ptr += tuple_size; } diff --git a/src/duckdb/src/execution/radix_partitioned_hashtable.cpp b/src/duckdb/src/execution/radix_partitioned_hashtable.cpp index e6d3ad20..49d2257c 100644 --- a/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +++ b/src/duckdb/src/execution/radix_partitioned_hashtable.cpp @@ -261,7 +261,7 @@ idx_t RadixHTConfig::ExternalRadixBits(const idx_t &maximum_sink_radix_bits_p) { idx_t RadixHTConfig::SinkCapacity(ClientContext &context) { // Get active and maximum number of threads const idx_t active_threads = TaskScheduler::GetScheduler(context).NumberOfThreads(); - const auto max_threads = DBConfig::GetSystemMaxThreads(FileSystem::GetFileSystem(context)); + const auto max_threads = DBConfig::GetConfig(context).options.maximum_threads; // Compute cache size per active thread (assuming cache is shared) const auto total_shared_cache_size = max_threads * L3_CACHE_SIZE; diff --git a/src/duckdb/src/function/function_binder.cpp b/src/duckdb/src/function/function_binder.cpp index aba67bb1..68d7ef81 100644 --- a/src/duckdb/src/function/function_binder.cpp +++ b/src/duckdb/src/function/function_binder.cpp @@ -228,7 +228,7 @@ void FunctionBinder::CastToFunctionArguments(SimpleFunction &function, vectorreturn_type.id() == LogicalTypeId::LAMBDA) { continue; } diff --git a/src/duckdb/src/function/table/arrow_conversion.cpp b/src/duckdb/src/function/table/arrow_conversion.cpp index 597e6c33..ba7d011a 100644 --- a/src/duckdb/src/function/table/arrow_conversion.cpp +++ b/src/duckdb/src/function/table/arrow_conversion.cpp @@ -639,10 +639,11 @@ static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowScanLoca for (idx_t type_idx = 0; type_idx < static_cast(array.n_children); type_idx++) { Vector child(members[type_idx].second); auto arrow_array = array.children[type_idx]; + auto &child_type = arrow_type[type_idx]; SetValidityMask(child, *arrow_array, scan_state, size, nested_offset); - ColumnArrowToDuckDB(child, *arrow_array, scan_state, size, arrow_type, nested_offset, &validity_mask); + ColumnArrowToDuckDB(child, *arrow_array, scan_state, size, child_type, nested_offset, &validity_mask); children.push_back(std::move(child)); } diff --git a/src/duckdb/src/function/table/read_csv.cpp b/src/duckdb/src/function/table/read_csv.cpp index 5a21874a..bd5baea3 100644 --- a/src/duckdb/src/function/table/read_csv.cpp +++ b/src/duckdb/src/function/table/read_csv.cpp @@ -38,7 +38,7 @@ void ReadCSVData::FinalizeRead(ClientContext &context) { auto number_of_threads = TaskScheduler::GetScheduler(context).NumberOfThreads(); //! If we have many csv files, we run single-threaded on each file and parallelize on the number of files bool many_csv_files = files.size() > 1 && int64_t(files.size() * 2) >= number_of_threads; - if (options.parallel_mode != ParallelMode::PARALLEL && many_csv_files) { + if (options.parallel_mode != ParallelMode::PARALLEL && (many_csv_files || number_of_threads == 1)) { single_threaded = true; } if (options.parallel_mode == ParallelMode::SINGLE_THREADED || not_supported_options || diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index c1f391c6..a1a987ad 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,8 +1,8 @@ #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "0.9.0" +#define DUCKDB_VERSION "v0.9.1" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "0d84ccf478" +#define DUCKDB_SOURCE_ID "401c8061c6" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb.h b/src/duckdb/src/include/duckdb.h index adc5208a..8fa38f7d 100644 --- a/src/duckdb/src/include/duckdb.h +++ b/src/duckdb/src/include/duckdb.h @@ -317,7 +317,7 @@ typedef enum { //===--------------------------------------------------------------------===// /*! -Creates a new database or opens an existing database file stored at the the given path. +Creates a new database or opens an existing database file stored at the given path. If no path is given a new in-memory database is created instead. The instantiated database should be closed with 'duckdb_close' @@ -328,7 +328,7 @@ The instantiated database should be closed with 'duckdb_close' DUCKDB_API duckdb_state duckdb_open(const char *path, duckdb_database *out_database); /*! -Extended version of duckdb_open. Creates a new database or opens an existing database file stored at the the given path. +Extended version of duckdb_open. Creates a new database or opens an existing database file stored at the given path. * path: Path to the database file on disk, or `nullptr` or `:memory:` to open an in-memory database. * out_database: The result database object. @@ -1009,7 +1009,7 @@ Binds an int64_t value to the prepared statement at the specified index. DUCKDB_API duckdb_state duckdb_bind_int64(duckdb_prepared_statement prepared_statement, idx_t param_idx, int64_t val); /*! -Binds an duckdb_hugeint value to the prepared statement at the specified index. +Binds a duckdb_hugeint value to the prepared statement at the specified index. */ DUCKDB_API duckdb_state duckdb_bind_hugeint(duckdb_prepared_statement prepared_statement, idx_t param_idx, duckdb_hugeint val); @@ -1040,12 +1040,12 @@ Binds an uint64_t value to the prepared statement at the specified index. DUCKDB_API duckdb_state duckdb_bind_uint64(duckdb_prepared_statement prepared_statement, idx_t param_idx, uint64_t val); /*! -Binds an float value to the prepared statement at the specified index. +Binds a float value to the prepared statement at the specified index. */ DUCKDB_API duckdb_state duckdb_bind_float(duckdb_prepared_statement prepared_statement, idx_t param_idx, float val); /*! -Binds an double value to the prepared statement at the specified index. +Binds a double value to the prepared statement at the specified index. */ DUCKDB_API duckdb_state duckdb_bind_double(duckdb_prepared_statement prepared_statement, idx_t param_idx, double val); diff --git a/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp b/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp index 0961e595..8cbbf02c 100644 --- a/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp +++ b/src/duckdb/src/include/duckdb/common/arrow/appender/append_data.hpp @@ -27,6 +27,7 @@ typedef void (*finalize_t)(ArrowAppendData &append_data, const LogicalType &type // ArrowAppendState struct ArrowAppendData { explicit ArrowAppendData(ClientProperties &options_p) : options(options_p) { + dictionary.release = nullptr; } // the buffers of the arrow vector ArrowBuffer validity; @@ -48,6 +49,9 @@ struct ArrowAppendData { unique_ptr array; duckdb::array buffers = {{nullptr, nullptr, nullptr}}; vector child_pointers; + // Arrays so the children can be moved + vector child_arrays; + ArrowArray dictionary; ClientProperties options; }; diff --git a/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp b/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp index 54cfdc0d..ffcf729f 100644 --- a/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp +++ b/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp @@ -62,7 +62,9 @@ struct ArrowEnumData : public ArrowScalarBaseData { result->n_buffers = 2; result->buffers[1] = append_data.main_buffer.data(); // finalize the enum child data, and assign it to the dictionary - result->dictionary = ArrowAppender::FinalizeChild(LogicalType::VARCHAR, *append_data.child_data[0]); + result->dictionary = &append_data.dictionary; + append_data.dictionary = + *ArrowAppender::FinalizeChild(LogicalType::VARCHAR, std::move(append_data.child_data[0])); } }; diff --git a/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp b/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp index 0d3edf7e..46aa70a5 100644 --- a/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp +++ b/src/duckdb/src/include/duckdb/common/arrow/arrow_appender.hpp @@ -28,9 +28,10 @@ class ArrowAppender { public: static void ReleaseArray(ArrowArray *array); - static ArrowArray *FinalizeChild(const LogicalType &type, ArrowAppendData &append_data); + static ArrowArray *FinalizeChild(const LogicalType &type, unique_ptr append_data); static unique_ptr InitializeChild(const LogicalType &type, idx_t capacity, ClientProperties &options); + static void AddChildren(ArrowAppendData &data, idx_t count); private: //! The types of the chunks that will be appended in diff --git a/src/duckdb/src/include/duckdb/common/arrow/arrow_wrapper.hpp b/src/duckdb/src/include/duckdb/common/arrow/arrow_wrapper.hpp index d022c33a..0d3fc60d 100644 --- a/src/duckdb/src/include/duckdb/common/arrow/arrow_wrapper.hpp +++ b/src/duckdb/src/include/duckdb/common/arrow/arrow_wrapper.hpp @@ -35,6 +35,9 @@ class ArrowArrayWrapper { arrow_array.length = 0; arrow_array.release = nullptr; } + ArrowArrayWrapper(ArrowArrayWrapper &&other) : arrow_array(other.arrow_array) { + other.arrow_array.release = nullptr; + } ~ArrowArrayWrapper(); }; diff --git a/src/duckdb/src/include/duckdb/common/exception.hpp b/src/duckdb/src/include/duckdb/common/exception.hpp index cffde874..5e2ad4d6 100644 --- a/src/duckdb/src/include/duckdb/common/exception.hpp +++ b/src/duckdb/src/include/duckdb/common/exception.hpp @@ -97,6 +97,7 @@ class Exception : public std::exception { DUCKDB_API const string &RawMessage() const; DUCKDB_API static string ExceptionTypeToString(ExceptionType type); + DUCKDB_API static ExceptionType StringToExceptionType(const string &type); [[noreturn]] DUCKDB_API static void ThrowAsTypeWithMessage(ExceptionType type, const string &message, const std::shared_ptr &original); virtual std::shared_ptr Copy() const { diff --git a/src/duckdb/src/include/duckdb/common/preserved_error.hpp b/src/duckdb/src/include/duckdb/common/preserved_error.hpp index c95cb780..65bc4fae 100644 --- a/src/duckdb/src/include/duckdb/common/preserved_error.hpp +++ b/src/duckdb/src/include/duckdb/common/preserved_error.hpp @@ -18,9 +18,7 @@ class PreservedError { //! Not initialized, default constructor DUCKDB_API PreservedError(); //! From std::exception - PreservedError(const std::exception &ex) - : initialized(true), type(ExceptionType::INVALID), raw_message(SanitizeErrorMessage(ex.what())), - exception_instance(nullptr) { + PreservedError(const std::exception &ex) : PreservedError(ex.what()) { } //! From a raw string DUCKDB_API explicit PreservedError(const string &raw_message); diff --git a/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp b/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp index 4b079b63..2fbce818 100644 --- a/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp +++ b/src/duckdb/src/include/duckdb/common/serializer/serialization_traits.hpp @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include "duckdb/common/vector.hpp" #include "duckdb/common/unordered_map.hpp" diff --git a/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp b/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp index c40908ed..a03b7918 100644 --- a/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +++ b/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp @@ -33,7 +33,7 @@ struct ExpressionState { public: void AddChild(Expression *expr); - void Finalize(); + void Finalize(bool empty = false); Allocator &GetAllocator(); bool HasContext(); DUCKDB_API ClientContext &GetContext(); diff --git a/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp b/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp index ea214f8a..388a7813 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/scan/csv/base_csv_reader.hpp @@ -51,13 +51,9 @@ class BaseCSVReader { bool linenr_estimated = false; bool row_empty = false; - idx_t sample_chunk_idx = 0; - bool jumping_samples = false; - bool end_of_file_reached = false; bool bom_checked = false; idx_t bytes_in_chunk = 0; - double bytes_per_line_avg = 0; DataChunk parse_chunk; diff --git a/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp b/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp index 01ed8560..943c8a65 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_sniffer.hpp @@ -34,9 +34,9 @@ class CSVSniffer { //! CSV Sniffing consists of five steps: //! 1. Dialect Detection: Generate the CSV Options (delimiter, quote, escape, etc.) //! 2. Type Detection: Figures out the types of the columns (For one chunk) - //! 3. Header Detection: Figures out if the CSV file has a header and produces the names of the columns - //! 4. Type Replacement: Replaces the types of the columns if the user specified them - //! 5. Type Refinement: Refines the types of the columns for the remaining chunks + //! 3. Type Refinement: Refines the types of the columns for the remaining chunks + //! 4. Header Detection: Figures out if the CSV file has a header and produces the names of the columns + //! 5. Type Replacement: Replaces the types of the columns if the user specified them SnifferResult SniffCSV(); private: @@ -50,6 +50,8 @@ class CSVSniffer { CSVReaderOptions &options; //! Buffer being used on sniffer shared_ptr buffer_manager; + //! Sets the result options + void SetResultOptions(); //! ------------------------------------------------------// //! ----------------- Dialect Detection ----------------- // @@ -105,6 +107,13 @@ class CSVSniffer { idx_t best_start_without_header = 0; vector best_header_row; + //! ------------------------------------------------------// + //! ------------------ Type Refinement ------------------ // + //! ------------------------------------------------------// + void RefineTypes(); + bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type); + vector detected_types; + //! ------------------------------------------------------// //! ------------------ Header Detection ----------------- // //! ------------------------------------------------------// @@ -117,13 +126,6 @@ class CSVSniffer { //! ------------------ Type Replacement ----------------- // //! ------------------------------------------------------// void ReplaceTypes(); - - //! ------------------------------------------------------// - //! ------------------ Type Refinement ------------------ // - //! ------------------------------------------------------// - void RefineTypes(); - bool TryCastVector(Vector &parse_chunk_col, idx_t size, const LogicalType &sql_type); - vector detected_types; }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state.hpp b/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state.hpp new file mode 100644 index 00000000..02137eef --- /dev/null +++ b/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state.hpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb/execution/operator/scan/csv/csv_state.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace duckdb { + +//! All States of CSV Parsing +enum class CSVState : uint8_t { + STANDARD = 0, //! Regular unquoted field state + DELIMITER = 1, //! State after encountering a field separator (e.g., ;) + RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n) + CARRIAGE_RETURN = 3, //! State after encountering a carriage return(i.e., \r) + QUOTED = 4, //! State when inside a quoted field + UNQUOTED = 5, //! State when leaving a quoted field + ESCAPE = 6, //! State when encountering an escape character (e.g., \) + EMPTY_LINE = 7, //! State when encountering an empty line (i.e., \r\r \n\n, \n\r) + INVALID = 8 //! Got to an Invalid State, this should error. +}; + +} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp b/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp index b4d82c96..94bb525d 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine.hpp @@ -14,19 +14,6 @@ namespace duckdb { -//! All States of CSV Parsing -enum class CSVState : uint8_t { - STANDARD = 0, //! Regular unquoted field state - DELIMITER = 1, //! State after encountering a field separator (e.g., ;) - RECORD_SEPARATOR = 2, //! State after encountering a record separator (i.e., \n) - CARRIAGE_RETURN = 3, //! State after encountering a carriage return(i.e., \r) - QUOTED = 4, //! State when inside a quoted field - UNQUOTED = 5, //! State when leaving a quoted field - ESCAPE = 6, //! State when encountering an escape character (e.g., \) - EMPTY_LINE = 7, //! State when encountering an empty line (i.e., \r\r \n\n, \n\r) - INVALID = 8 //! Got to an Invalid State, this should error. -}; - //! The CSV State Machine comprises a state transition array (STA). //! The STA indicates the current state of parsing based on both the current and preceding characters. //! This reveals whether we are dealing with a Field, a New Line, a Delimiter, and so forth. @@ -38,6 +25,14 @@ class CSVStateMachine { explicit CSVStateMachine(CSVReaderOptions &options_p, const CSVStateMachineOptions &state_machine_options, shared_ptr buffer_manager_p, CSVStateMachineCache &csv_state_machine_cache_p); + + //! Transition all states to next state, that depends on the current char + inline void Transition(char current_char) { + pre_previous_state = previous_state; + previous_state = state; + state = transition_array[state][static_cast(current_char)]; + } + //! Resets the state machine, so it can be used again void Reset(); @@ -52,7 +47,7 @@ class CSVStateMachine { idx_t start_row = 0; //! The Transition Array is a Finite State Machine //! It holds the transitions of all states, on all 256 possible different characters - const state_machine_t &transition_array; + const StateMachine &transition_array; //! Both these variables are used for new line identifier detection bool single_record_separator = false; diff --git a/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp b/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp index f63024cb..0ec4e6f1 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/scan/csv/csv_state_machine_cache.hpp @@ -8,14 +8,28 @@ #pragma once -#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp" +#include "duckdb/execution/operator/scan/csv/csv_state.hpp" #include "duckdb/execution/operator/scan/csv/csv_buffer_manager.hpp" +#include "duckdb/execution/operator/scan/csv/csv_reader_options.hpp" #include "duckdb/execution/operator/scan/csv/quote_rules.hpp" namespace duckdb { -static constexpr uint32_t NUM_STATES = 9; -static constexpr uint32_t NUM_TRANSITIONS = 256; -typedef uint8_t state_machine_t[NUM_STATES][NUM_TRANSITIONS]; + +//! Class to wrap the state machine matrix +class StateMachine { +public: + static constexpr uint32_t NUM_STATES = 9; + static constexpr uint32_t NUM_TRANSITIONS = 256; + CSVState state_machine[NUM_STATES][NUM_TRANSITIONS]; + + const CSVState *operator[](CSVState state) const { + return state_machine[static_cast(state)]; + } + + CSVState *operator[](CSVState state) { + return state_machine[static_cast(state)]; + } +}; //! Hash function used in out state machine cache, it hashes and combines all options used to generate a state machine struct HashCSVStateMachineConfig { @@ -36,12 +50,12 @@ class CSVStateMachineCache { ~CSVStateMachineCache() {}; //! Gets a state machine from the cache, if it's not from one the default options //! It first caches it, then returns it. - const state_machine_t &Get(const CSVStateMachineOptions &state_machine_options); + const StateMachine &Get(const CSVStateMachineOptions &state_machine_options); private: void Insert(const CSVStateMachineOptions &state_machine_options); //! Cache on delimiter|quote|escape - unordered_map state_machine_cache; + unordered_map state_machine_cache; //! Default value for options used to intialize CSV State Machine Cache const vector default_delimiter = {',', '|', ';', '\t'}; const vector> default_quote = {{'\"'}, {'\"', '\''}, {'\0'}}; diff --git a/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp b/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp index 511df229..a3d14fef 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/scan/csv/parallel_csv_reader.hpp @@ -148,7 +148,7 @@ class ParallelCSVReader : public BaseCSVReader { //! Sets Position depending on the byte_start of this thread bool SetPosition(); //! Called when scanning the 1st buffer, skips empty lines - void SkipEmptyLines(); + bool SkipEmptyLines(); //! When a buffer finishes reading its piece, it still can try to scan up to the real end of the buffer //! Up to finding a new line. This function sets the buffer_end and marks a boolean variable //! when changing the buffer end the first time. diff --git a/src/duckdb/src/include/duckdb/function/replacement_scan.hpp b/src/duckdb/src/include/duckdb/function/replacement_scan.hpp index 3447b96d..455f8ae1 100644 --- a/src/duckdb/src/include/duckdb/function/replacement_scan.hpp +++ b/src/duckdb/src/include/duckdb/function/replacement_scan.hpp @@ -9,6 +9,7 @@ #pragma once #include "duckdb/common/common.hpp" +#include "duckdb/common/string_util.hpp" namespace duckdb { @@ -30,6 +31,25 @@ struct ReplacementScan { : function(function), data(std::move(data_p)) { } + static bool CanReplace(const string &table_name, const vector &extensions) { + auto lower_name = StringUtil::Lower(table_name); + + if (StringUtil::EndsWith(lower_name, ".gz")) { + lower_name = lower_name.substr(0, lower_name.size() - 3); + } else if (StringUtil::EndsWith(lower_name, ".zst")) { + lower_name = lower_name.substr(0, lower_name.size() - 4); + } + + for (auto &extension : extensions) { + if (StringUtil::EndsWith(lower_name, "." + extension) || + StringUtil::Contains(lower_name, "." + extension + "?")) { + return true; + } + } + + return false; + } + replacement_scan_t function; unique_ptr data; }; diff --git a/src/duckdb/src/include/duckdb/main/config.hpp b/src/duckdb/src/include/duckdb/main/config.hpp index d9cd3a20..654bc80c 100644 --- a/src/duckdb/src/include/duckdb/main/config.hpp +++ b/src/duckdb/src/include/duckdb/main/config.hpp @@ -240,6 +240,8 @@ struct DBConfig { DUCKDB_API void SetOption(const string &name, Value value); DUCKDB_API void ResetOption(const string &name); + DUCKDB_API void CheckLock(const string &name); + DUCKDB_API static idx_t ParseMemoryLimit(const string &arg); //! Return the list of possible compression functions for the specific physical type diff --git a/src/duckdb/src/include/duckdb/optimizer/filter_pushdown.hpp b/src/duckdb/src/include/duckdb/optimizer/filter_pushdown.hpp index 1b9a4482..ab6f04b0 100644 --- a/src/duckdb/src/include/duckdb/optimizer/filter_pushdown.hpp +++ b/src/duckdb/src/include/duckdb/optimizer/filter_pushdown.hpp @@ -43,6 +43,8 @@ class FilterPushdown { //! Push down a LogicalAggregate op unique_ptr PushdownAggregate(unique_ptr op); + //! Push down a distinct operator + unique_ptr PushdownDistinct(unique_ptr op); //! Push down a LogicalFilter op unique_ptr PushdownFilter(unique_ptr op); //! Push down a LogicalCrossProduct op diff --git a/src/duckdb/src/include/duckdb/planner/binder.hpp b/src/duckdb/src/include/duckdb/planner/binder.hpp index 90a7ae94..b26bc284 100644 --- a/src/duckdb/src/include/duckdb/planner/binder.hpp +++ b/src/duckdb/src/include/duckdb/planner/binder.hpp @@ -362,6 +362,7 @@ class Binder : public std::enable_shared_from_this { //! If only a schema name is provided (e.g. "a.b") then figure out if "a" is a schema or a catalog name void BindSchemaOrCatalog(string &catalog_name, string &schema_name); + const string BindCatalog(string &catalog_name); SchemaCatalogEntry &BindCreateSchema(CreateInfo &info); unique_ptr BindSelectNode(SelectNode &statement, unique_ptr from_table); diff --git a/src/duckdb/src/include/duckdb/planner/bound_parameter_map.hpp b/src/duckdb/src/include/duckdb/planner/bound_parameter_map.hpp index ab5ef410..33ce5cf0 100644 --- a/src/duckdb/src/include/duckdb/planner/bound_parameter_map.hpp +++ b/src/duckdb/src/include/duckdb/planner/bound_parameter_map.hpp @@ -35,6 +35,9 @@ struct BoundParameterMap { unique_ptr BindParameterExpression(ParameterExpression &expr); + //! Flag to indicate that we need to rebind this prepared statement before execution + bool rebind = false; + private: shared_ptr CreateOrGetData(const string &identifier); void CreateNewParameter(const string &id, const shared_ptr ¶m_data); diff --git a/src/duckdb/src/include/duckdb/planner/expression_binder.hpp b/src/duckdb/src/include/duckdb/planner/expression_binder.hpp index 11e5a882..6df014d1 100644 --- a/src/duckdb/src/include/duckdb/planner/expression_binder.hpp +++ b/src/duckdb/src/include/duckdb/planner/expression_binder.hpp @@ -90,8 +90,8 @@ class ExpressionBinder { void QualifyColumnNames(unique_ptr &expr); static void QualifyColumnNames(Binder &binder, unique_ptr &expr); - static unique_ptr PushCollation(ClientContext &context, unique_ptr source, - const string &collation, bool equality_only = false); + static bool PushCollation(ClientContext &context, unique_ptr &source, const LogicalType &sql_type, + bool equality_only = false); static void TestCollation(ClientContext &context, const string &collation); bool BindCorrelatedColumns(unique_ptr &expr); diff --git a/src/duckdb/src/main/capi/arrow-c.cpp b/src/duckdb/src/main/capi/arrow-c.cpp index 9dff8ed1..78dd3e69 100644 --- a/src/duckdb/src/main/capi/arrow-c.cpp +++ b/src/duckdb/src/main/capi/arrow-c.cpp @@ -64,7 +64,7 @@ duckdb_state duckdb_prepared_arrow_schema(duckdb_prepared_statement prepared, du if (result_schema->release) { // Need to release the existing schema before we overwrite it result_schema->release(result_schema); - result_schema->release = nullptr; + D_ASSERT(!result_schema->release); } ArrowConverter::ToArrowSchema(result_schema, prepared_types, prepared_names, properties); @@ -155,14 +155,17 @@ struct PrivateData { // LCOV_EXCL_START // This function is never called, but used to set ArrowSchema's release functions to a non-null NOOP. -void EmptySchemaRelease(ArrowSchema *) { +void EmptySchemaRelease(ArrowSchema *schema) { + schema->release = nullptr; } // LCOV_EXCL_STOP -void EmptyArrayRelease(ArrowArray *) { +void EmptyArrayRelease(ArrowArray *array) { + array->release = nullptr; } -void EmptyStreamRelease(ArrowArrayStream *) { +void EmptyStreamRelease(ArrowArrayStream *stream) { + stream->release = nullptr; } void FactoryGetSchema(uintptr_t stream_factory_ptr, duckdb::ArrowSchemaWrapper &schema) { diff --git a/src/duckdb/src/main/config.cpp b/src/duckdb/src/main/config.cpp index 5e934993..89d5e1ba 100644 --- a/src/duckdb/src/main/config.cpp +++ b/src/duckdb/src/main/config.cpp @@ -233,6 +233,20 @@ void DBConfig::SetDefaultMaxMemory() { } } +void DBConfig::CheckLock(const string &name) { + if (!options.lock_configuration) { + // not locked + return; + } + case_insensitive_set_t allowed_settings {"schema", "search_path"}; + if (allowed_settings.find(name) != allowed_settings.end()) { + // we are always allowed to change these settings + return; + } + // not allowed! + throw InvalidInputException("Cannot change configuration option \"%s\" - the configuration has been locked", name); +} + idx_t CGroupBandwidthQuota(idx_t physical_cores, FileSystem &fs) { static constexpr const char *CPU_MAX = "/sys/fs/cgroup/cpu.max"; static constexpr const char *CFS_QUOTA = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"; diff --git a/src/duckdb/src/main/extension/extension_helper.cpp b/src/duckdb/src/main/extension/extension_helper.cpp index f47e853a..9c39eed2 100644 --- a/src/duckdb/src/main/extension/extension_helper.cpp +++ b/src/duckdb/src/main/extension/extension_helper.cpp @@ -196,6 +196,9 @@ string ExtensionHelper::AddExtensionInstallHintToErrorMsg(ClientContext &context } bool ExtensionHelper::TryAutoLoadExtension(ClientContext &context, const string &extension_name) noexcept { + if (context.db->ExtensionIsLoaded(extension_name)) { + return true; + } auto &dbconfig = DBConfig::GetConfig(context); try { if (dbconfig.options.autoinstall_known_extensions) { @@ -211,6 +214,10 @@ bool ExtensionHelper::TryAutoLoadExtension(ClientContext &context, const string } void ExtensionHelper::AutoLoadExtension(ClientContext &context, const string &extension_name) { + if (context.db->ExtensionIsLoaded(extension_name)) { + // Avoid downloading again + return; + } auto &dbconfig = DBConfig::GetConfig(context); try { #ifndef DUCKDB_WASM diff --git a/src/duckdb/src/main/extension/extension_install.cpp b/src/duckdb/src/main/extension/extension_install.cpp index 231c38ee..95d0c2e4 100644 --- a/src/duckdb/src/main/extension/extension_install.cpp +++ b/src/duckdb/src/main/extension/extension_install.cpp @@ -208,18 +208,20 @@ void ExtensionHelper::InstallExtensionInternal(DBConfig &config, ClientConfig *c fs.RemoveFile(temp_path); } auto is_http_url = StringUtil::Contains(extension, "http://"); - if (fs.FileExists(extension)) { - idx_t file_size; - auto in_buffer = ReadExtensionFileFromDisk(fs, extension, file_size); - WriteExtensionFileToDisk(fs, temp_path, in_buffer.get(), file_size); - - if (fs.FileExists(local_extension_path) && force_install) { - fs.RemoveFile(local_extension_path); + if (ExtensionHelper::IsFullPath(extension)) { + if (fs.FileExists(extension)) { + idx_t file_size; + auto in_buffer = ReadExtensionFileFromDisk(fs, extension, file_size); + WriteExtensionFileToDisk(fs, temp_path, in_buffer.get(), file_size); + + if (fs.FileExists(local_extension_path) && force_install) { + fs.RemoveFile(local_extension_path); + } + fs.MoveFile(temp_path, local_extension_path); + return; + } else if (!is_http_url) { + throw IOException("Failed to read extension from \"%s\": no such file", extension); } - fs.MoveFile(temp_path, local_extension_path); - return; - } else if (StringUtil::Contains(extension, "/") && !is_http_url) { - throw IOException("Failed to read extension from \"%s\": no such file", extension); } #ifdef DISABLE_DUCKDB_REMOTE_INSTALL @@ -280,7 +282,7 @@ void ExtensionHelper::InstallExtensionInternal(DBConfig &config, ClientConfig *c // create suggestions string message; auto exact_match = ExtensionHelper::CreateSuggestions(extension_name, message); - if (exact_match) { + if (exact_match && !IsRelease(DuckDB::LibraryVersion())) { message += "\nAre you using a development build? In this case, extensions might not (yet) be uploaded."; } if (res.error() == duckdb_httplib::Error::Success) { diff --git a/src/duckdb/src/optimizer/common_aggregate_optimizer.cpp b/src/duckdb/src/optimizer/common_aggregate_optimizer.cpp index 9ce0394d..da13092c 100644 --- a/src/duckdb/src/optimizer/common_aggregate_optimizer.cpp +++ b/src/duckdb/src/optimizer/common_aggregate_optimizer.cpp @@ -38,8 +38,8 @@ void CommonAggregateOptimizer::ExtractCommonAggregates(LogicalAggregate &aggr) { // aggregate does not exist yet: add it to the map aggregate_remap[*aggr.expressions[i]] = i; if (i != original_index) { - // this aggregate is not erased, however an agregate BEFORE it has been erased - // so we need to remap this aggregaet + // this aggregate is not erased, however an aggregate BEFORE it has been erased + // so we need to remap this aggregate ColumnBinding original_binding(aggr.aggregate_index, original_index); ColumnBinding new_binding(aggr.aggregate_index, i); aggregate_map[original_binding] = new_binding; diff --git a/src/duckdb/src/optimizer/filter_pushdown.cpp b/src/duckdb/src/optimizer/filter_pushdown.cpp index a8ca617c..1a38ad2a 100644 --- a/src/duckdb/src/optimizer/filter_pushdown.cpp +++ b/src/duckdb/src/optimizer/filter_pushdown.cpp @@ -33,6 +33,7 @@ unique_ptr FilterPushdown::Rewrite(unique_ptr case LogicalOperatorType::LOGICAL_UNION: return PushdownSetOperation(std::move(op)); case LogicalOperatorType::LOGICAL_DISTINCT: + return PushdownDistinct(std::move(op)); case LogicalOperatorType::LOGICAL_ORDER_BY: { // we can just push directly through these operations without any rewriting op->children[0] = Rewrite(std::move(op->children[0])); diff --git a/src/duckdb/src/optimizer/pushdown/pushdown_distinct.cpp b/src/duckdb/src/optimizer/pushdown/pushdown_distinct.cpp new file mode 100644 index 00000000..030d1819 --- /dev/null +++ b/src/duckdb/src/optimizer/pushdown/pushdown_distinct.cpp @@ -0,0 +1,19 @@ +#include "duckdb/optimizer/filter_pushdown.hpp" +#include "duckdb/planner/expression_iterator.hpp" +#include "duckdb/planner/operator/logical_distinct.hpp" + +namespace duckdb { + +unique_ptr FilterPushdown::PushdownDistinct(unique_ptr op) { + D_ASSERT(op->type == LogicalOperatorType::LOGICAL_DISTINCT); + auto &distinct = op->Cast(); + if (!distinct.order_by) { + // regular DISTINCT - can just push down + op->children[0] = Rewrite(std::move(op->children[0])); + return op; + } + // no pushdown through DISTINCT ON (yet?) + return FinishPushdown(std::move(op)); +} + +} // namespace duckdb diff --git a/src/duckdb/src/parser/transform/statement/transform_copy.cpp b/src/duckdb/src/parser/transform/statement/transform_copy.cpp index 41c8bea8..d5094802 100644 --- a/src/duckdb/src/parser/transform/statement/transform_copy.cpp +++ b/src/duckdb/src/parser/transform/statement/transform_copy.cpp @@ -1,6 +1,7 @@ #include "duckdb/common/string_util.hpp" #include "duckdb/common/types/value.hpp" #include "duckdb/core_functions/scalar/struct_functions.hpp" +#include "duckdb/function/replacement_scan.hpp" #include "duckdb/parser/expression/constant_expression.hpp" #include "duckdb/parser/expression/function_expression.hpp" #include "duckdb/parser/statement/copy_statement.hpp" @@ -82,9 +83,10 @@ unique_ptr Transformer::TransformCopy(duckdb_libpgquery::PGCopySt // copy to a file info.file_path = stmt.filename; } - if (StringUtil::EndsWith(info.file_path, ".parquet")) { + + if (ReplacementScan::CanReplace(info.file_path, {"parquet"})) { info.format = "parquet"; - } else if (StringUtil::EndsWith(info.file_path, ".json") || StringUtil::EndsWith(info.file_path, ".ndjson")) { + } else if (ReplacementScan::CanReplace(info.file_path, {"json", "jsonl", "ndjson"})) { info.format = "json"; } else { info.format = "csv"; diff --git a/src/duckdb/src/parser/transform/statement/transform_create_sequence.cpp b/src/duckdb/src/parser/transform/statement/transform_create_sequence.cpp index cfc671f6..50e3c3f4 100644 --- a/src/duckdb/src/parser/transform/statement/transform_create_sequence.cpp +++ b/src/duckdb/src/parser/transform/statement/transform_create_sequence.cpp @@ -17,6 +17,8 @@ unique_ptr Transformer::TransformCreateSequence(duckdb_libpgque info->name = qname.name; if (stmt.options) { + int64_t default_start_value = info->start_value; + bool has_start_value = false; unordered_set used; duckdb_libpgquery::PGListCell *cell = nullptr; for_each_cell(cell, stmt.options->head) { @@ -51,10 +53,10 @@ unique_ptr Transformer::TransformCreateSequence(duckdb_libpgque throw ParserException("Increment must not be zero"); } if (info->increment < 0) { - info->start_value = info->max_value = -1; + default_start_value = info->max_value = -1; info->min_value = NumericLimits::Minimum(); } else { - info->start_value = info->min_value = 1; + default_start_value = info->min_value = 1; info->max_value = NumericLimits::Maximum(); } } else if (opt_name == "minvalue") { @@ -68,7 +70,7 @@ unique_ptr Transformer::TransformCreateSequence(duckdb_libpgque info->min_value = opt_value; if (info->increment > 0) { - info->start_value = info->min_value; + default_start_value = info->min_value; } } else if (opt_name == "maxvalue") { if (used.find(SequenceInfo::SEQ_MAX) != used.end()) { @@ -81,7 +83,7 @@ unique_ptr Transformer::TransformCreateSequence(duckdb_libpgque info->max_value = opt_value; if (info->increment < 0) { - info->start_value = info->max_value; + default_start_value = info->max_value; } } else if (opt_name == "start") { if (used.find(SequenceInfo::SEQ_START) != used.end()) { @@ -91,7 +93,7 @@ unique_ptr Transformer::TransformCreateSequence(duckdb_libpgque if (nodef) { continue; } - + has_start_value = true; info->start_value = opt_value; } else if (opt_name == "cycle") { if (used.find(SequenceInfo::SEQ_CYCLE) != used.end()) { @@ -107,6 +109,9 @@ unique_ptr Transformer::TransformCreateSequence(duckdb_libpgque throw ParserException("Unrecognized option \"%s\" for CREATE SEQUENCE", opt_name); } } + if (!has_start_value) { + info->start_value = default_start_value; + } } info->temporary = !stmt.sequence->relpersistence; info->on_conflict = TransformOnConflict(stmt.onconflict); diff --git a/src/duckdb/src/planner/binder/expression/bind_between_expression.cpp b/src/duckdb/src/planner/binder/expression/bind_between_expression.cpp index 1c82daef..a44e5d37 100644 --- a/src/duckdb/src/planner/binder/expression/bind_between_expression.cpp +++ b/src/duckdb/src/planner/binder/expression/bind_between_expression.cpp @@ -34,13 +34,11 @@ BindResult ExpressionBinder::BindExpression(BetweenExpression &expr, idx_t depth input = BoundCastExpression::AddCastToType(context, std::move(input), input_type); lower = BoundCastExpression::AddCastToType(context, std::move(lower), input_type); upper = BoundCastExpression::AddCastToType(context, std::move(upper), input_type); - if (input_type.id() == LogicalTypeId::VARCHAR) { - // handle collation - auto collation = StringType::GetCollation(input_type); - input = PushCollation(context, std::move(input), collation, false); - lower = PushCollation(context, std::move(lower), collation, false); - upper = PushCollation(context, std::move(upper), collation, false); - } + // handle collation + PushCollation(context, input, input_type, false); + PushCollation(context, lower, input_type, false); + PushCollation(context, upper, input_type, false); + if (!input->HasSideEffects() && !input->HasParameter() && !input->HasSubquery()) { // the expression does not have side effects and can be copied: create two comparisons // the reason we do this is that individual comparisons are easier to handle in optimizers diff --git a/src/duckdb/src/planner/binder/expression/bind_collate_expression.cpp b/src/duckdb/src/planner/binder/expression/bind_collate_expression.cpp index cbe51e7d..6682b5c9 100644 --- a/src/duckdb/src/planner/binder/expression/bind_collate_expression.cpp +++ b/src/duckdb/src/planner/binder/expression/bind_collate_expression.cpp @@ -18,8 +18,10 @@ BindResult ExpressionBinder::BindExpression(CollateExpression &expr, idx_t depth throw BinderException("collations are only supported for type varchar"); } // Validate the collation, but don't use it - PushCollation(context, child->Copy(), expr.collation, false); - child->return_type = LogicalType::VARCHAR_COLLATION(expr.collation); + auto child_copy = child->Copy(); + auto collation_type = LogicalType::VARCHAR_COLLATION(expr.collation); + PushCollation(context, child_copy, collation_type, false); + child->return_type = collation_type; return BindResult(std::move(child)); } diff --git a/src/duckdb/src/planner/binder/expression/bind_comparison_expression.cpp b/src/duckdb/src/planner/binder/expression/bind_comparison_expression.cpp index 0a452f71..0cff7da5 100644 --- a/src/duckdb/src/planner/binder/expression/bind_comparison_expression.cpp +++ b/src/duckdb/src/planner/binder/expression/bind_comparison_expression.cpp @@ -18,20 +18,25 @@ namespace duckdb { -unique_ptr ExpressionBinder::PushCollation(ClientContext &context, unique_ptr source, - const string &collation_p, bool equality_only) { +bool ExpressionBinder::PushCollation(ClientContext &context, unique_ptr &source, + const LogicalType &sql_type, bool equality_only) { + if (sql_type.id() != LogicalTypeId::VARCHAR) { + // only VARCHAR columns require collation + return false; + } // replace default collation with system collation + auto str_collation = StringType::GetCollation(sql_type); string collation; - if (collation_p.empty()) { + if (str_collation.empty()) { collation = DBConfig::GetConfig(context).options.collation; } else { - collation = collation_p; + collation = str_collation; } collation = StringUtil::Lower(collation); // bind the collation if (collation.empty() || collation == "binary" || collation == "c" || collation == "posix") { - // binary collation: just skip - return source; + // no collation or binary collation: skip + return false; } auto &catalog = Catalog::GetSystemCatalog(context); auto splits = StringUtil::Split(StringUtil::Lower(collation), "."); @@ -60,11 +65,12 @@ unique_ptr ExpressionBinder::PushCollation(ClientContext &context, u auto function = function_binder.BindScalarFunction(collation_entry.function, std::move(children)); source = std::move(function); } - return source; + return true; } void ExpressionBinder::TestCollation(ClientContext &context, const string &collation) { - PushCollation(context, make_uniq(Value("")), collation); + auto expr = make_uniq_base(Value("")); + PushCollation(context, expr, LogicalType::VARCHAR_COLLATION(collation)); } LogicalType BoundComparisonExpression::BindComparison(LogicalType left_type, LogicalType right_type) { @@ -134,12 +140,9 @@ BindResult ExpressionBinder::BindExpression(ComparisonExpression &expr, idx_t de right = BoundCastExpression::AddCastToType(context, std::move(right), input_type, input_type.id() == LogicalTypeId::ENUM); - if (input_type.id() == LogicalTypeId::VARCHAR) { - // handle collation - auto collation = StringType::GetCollation(input_type); - left = PushCollation(context, std::move(left), collation, expr.type == ExpressionType::COMPARE_EQUAL); - right = PushCollation(context, std::move(right), collation, expr.type == ExpressionType::COMPARE_EQUAL); - } + PushCollation(context, left, input_type, expr.type == ExpressionType::COMPARE_EQUAL); + PushCollation(context, right, input_type, expr.type == ExpressionType::COMPARE_EQUAL); + // now create the bound comparison expression return BindResult(make_uniq(expr.type, std::move(left), std::move(right))); } diff --git a/src/duckdb/src/planner/binder/query_node/bind_select_node.cpp b/src/duckdb/src/planner/binder/query_node/bind_select_node.cpp index 6a052a30..36d11109 100644 --- a/src/duckdb/src/planner/binder/query_node/bind_select_node.cpp +++ b/src/duckdb/src/planner/binder/query_node/bind_select_node.cpp @@ -222,10 +222,7 @@ void Binder::BindModifierTypes(BoundQueryNode &result, const vector for (auto &target_distinct : distinct.target_distincts) { auto &bound_colref = target_distinct->Cast(); const auto &sql_type = sql_types[bound_colref.binding.column_index]; - if (sql_type.id() == LogicalTypeId::VARCHAR) { - target_distinct = ExpressionBinder::PushCollation(context, std::move(target_distinct), - StringType::GetCollation(sql_type), true); - } + ExpressionBinder::PushCollation(context, target_distinct, sql_type, true); } break; } @@ -253,10 +250,7 @@ void Binder::BindModifierTypes(BoundQueryNode &result, const vector D_ASSERT(bound_colref.binding.column_index < sql_types.size()); const auto &sql_type = sql_types[bound_colref.binding.column_index]; bound_colref.return_type = sql_types[bound_colref.binding.column_index]; - if (sql_type.id() == LogicalTypeId::VARCHAR) { - order_node.expression = ExpressionBinder::PushCollation(context, std::move(order_node.expression), - StringType::GetCollation(sql_type)); - } + ExpressionBinder::PushCollation(context, order_node.expression, sql_type); } break; } @@ -389,9 +383,8 @@ unique_ptr Binder::BindSelectNode(SelectNode &statement, unique_ bool contains_subquery = bound_expr_ref.HasSubquery(); // push a potential collation, if necessary - auto collated_expr = ExpressionBinder::PushCollation(context, std::move(bound_expr), - StringType::GetCollation(group_type), true); - if (!contains_subquery && !collated_expr->Equals(bound_expr_ref)) { + bool requires_collation = ExpressionBinder::PushCollation(context, bound_expr, group_type, true); + if (!contains_subquery && requires_collation) { // if there is a collation on a group x, we should group by the collated expr, // but also push a first(x) aggregate in case x is selected (uncollated) info.collated_groups[i] = result->aggregates.size(); @@ -405,7 +398,7 @@ unique_ptr Binder::BindSelectNode(SelectNode &statement, unique_ auto function = function_binder.BindAggregateFunction(first_fun, std::move(first_children)); result->aggregates.push_back(std::move(function)); } - result->groups.group_expressions.push_back(std::move(collated_expr)); + result->groups.group_expressions.push_back(std::move(bound_expr)); // in the unbound expression we DO bind the table names of any ColumnRefs // we do this to make sure that "table.a" and "a" are treated the same diff --git a/src/duckdb/src/planner/binder/statement/bind_create.cpp b/src/duckdb/src/planner/binder/statement/bind_create.cpp index 0916da30..972a7128 100644 --- a/src/duckdb/src/planner/binder/statement/bind_create.cpp +++ b/src/duckdb/src/planner/binder/statement/bind_create.cpp @@ -68,6 +68,16 @@ void Binder::BindSchemaOrCatalog(string &catalog, string &schema) { BindSchemaOrCatalog(context, catalog, schema); } +const string Binder::BindCatalog(string &catalog) { + auto &db_manager = DatabaseManager::Get(context); + optional_ptr database = db_manager.GetDatabase(context, catalog); + if (database) { + return db_manager.GetDatabase(context, catalog).get()->GetName(); + } else { + return db_manager.GetDefaultDatabase(context); + } +} + SchemaCatalogEntry &Binder::BindSchema(CreateInfo &info) { BindSchemaOrCatalog(info.catalog, info.schema); if (IsInvalidCatalog(info.catalog) && info.temporary) { @@ -456,9 +466,13 @@ BoundStatement Binder::Bind(CreateStatement &stmt) { auto catalog_type = stmt.info->type; switch (catalog_type) { - case CatalogType::SCHEMA_ENTRY: + case CatalogType::SCHEMA_ENTRY: { + auto &base = stmt.info->Cast(); + auto catalog = BindCatalog(base.catalog); + properties.modified_databases.insert(catalog); result.plan = make_uniq(LogicalOperatorType::LOGICAL_CREATE_SCHEMA, std::move(stmt.info)); break; + } case CatalogType::VIEW_ENTRY: { auto &base = stmt.info->Cast(); // bind the schema diff --git a/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp b/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp index 9bc0b130..bf447eaa 100644 --- a/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp +++ b/src/duckdb/src/planner/binder/tableref/plan_joinref.cpp @@ -135,6 +135,9 @@ unique_ptr LogicalComparisonJoin::CreateJoin(ClientContext &con bool need_to_consider_arbitrary_expressions = true; switch (reftype) { case JoinRefType::ASOF: { + if (!arbitrary_expressions.empty()) { + throw BinderException("Invalid ASOF JOIN condition"); + } need_to_consider_arbitrary_expressions = false; auto asof_idx = conditions.size(); for (size_t c = 0; c < conditions.size(); ++c) { diff --git a/src/duckdb/src/planner/bound_parameter_map.cpp b/src/duckdb/src/planner/bound_parameter_map.cpp index cd55830a..420fa393 100644 --- a/src/duckdb/src/planner/bound_parameter_map.cpp +++ b/src/duckdb/src/planner/bound_parameter_map.cpp @@ -43,19 +43,30 @@ shared_ptr BoundParameterMap::CreateOrGetData(const string & } unique_ptr BoundParameterMap::BindParameterExpression(ParameterExpression &expr) { - auto &identifier = expr.identifier; - auto return_type = GetReturnType(identifier); + auto &identifier = expr.identifier; D_ASSERT(!parameter_data.count(identifier)); // No value has been supplied yet, - // We return a shared pointer to an object that will get populated wtih a Value later - // When the BoundParameterExpression get executed, this will be used to get the corresponding value + // We return a shared pointer to an object that will get populated with a Value later + // When the BoundParameterExpression gets executed, this will be used to get the corresponding value auto param_data = CreateOrGetData(identifier); auto bound_expr = make_uniq(identifier); + bound_expr->parameter_data = param_data; - bound_expr->return_type = return_type; bound_expr->alias = expr.alias; + + auto param_type = param_data->return_type; + auto identifier_type = GetReturnType(identifier); + + // we found a type for this bound parameter, but now we found another occurrence with the same identifier, + // a CAST around this consecutive occurrence might swallow the unknown type of this consecutive occurrence, + // then, if we do not rebind, we potentially have unknown data types during execution + if (identifier_type == LogicalType::UNKNOWN && param_type != LogicalType::UNKNOWN) { + rebind = true; + } + + bound_expr->return_type = identifier_type; return bound_expr; } diff --git a/src/duckdb/src/planner/expression_binder/base_select_binder.cpp b/src/duckdb/src/planner/expression_binder/base_select_binder.cpp index 542bfa52..57cdf686 100644 --- a/src/duckdb/src/planner/expression_binder/base_select_binder.cpp +++ b/src/duckdb/src/planner/expression_binder/base_select_binder.cpp @@ -98,11 +98,8 @@ BindResult BaseSelectBinder::BindColumnRef(unique_ptr &expr_pt " This is not yet supported.", colref.column_names[0]); } - auto result = BindResult(node.select_list[index]->Copy()); - if (result.expression->type == ExpressionType::BOUND_COLUMN_REF) { - auto &result_expr = result.expression->Cast(); - result_expr.depth = depth; - } + auto copied_expression = node.original_expressions[index]->Copy(); + result = BindExpression(copied_expression, depth, false); return result; } } diff --git a/src/duckdb/src/planner/planner.cpp b/src/duckdb/src/planner/planner.cpp index 369e400f..3020f658 100644 --- a/src/duckdb/src/planner/planner.cpp +++ b/src/duckdb/src/planner/planner.cpp @@ -76,7 +76,7 @@ void Planner::CreatePlan(SQLStatement &statement) { } this->properties = binder->properties; this->properties.parameter_count = parameter_count; - properties.bound_all_parameters = parameters_resolved; + properties.bound_all_parameters = !bound_parameters.rebind && parameters_resolved; Planner::VerifyPlan(context, plan, bound_parameters.GetParametersPtr()); diff --git a/src/duckdb/src/transaction/duck_transaction_manager.cpp b/src/duckdb/src/transaction/duck_transaction_manager.cpp index b5dab24d..3b18cb41 100644 --- a/src/duckdb/src/transaction/duck_transaction_manager.cpp +++ b/src/duckdb/src/transaction/duck_transaction_manager.cpp @@ -252,6 +252,7 @@ void DuckTransactionManager::RollbackTransaction(Transaction *transaction_p) { } void DuckTransactionManager::RemoveTransaction(DuckTransaction &transaction) noexcept { + bool changes_made = transaction.ChangesMade(); // remove the transaction from the list of active transactions idx_t t_index = active_transactions.size(); // check for the lowest and highest start time in the list of transactions @@ -275,15 +276,18 @@ void DuckTransactionManager::RemoveTransaction(DuckTransaction &transaction) noe D_ASSERT(t_index != active_transactions.size()); auto current_transaction = std::move(active_transactions[t_index]); auto current_query = DatabaseManager::Get(db).ActiveQueryNumber(); - if (transaction.commit_id != 0) { - // the transaction was committed, add it to the list of recently - // committed transactions - recently_committed_transactions.push_back(std::move(current_transaction)); - } else { - // the transaction was aborted, but we might still need its information - // add it to the set of transactions awaiting GC - current_transaction->highest_active_query = current_query; - old_transactions.push_back(std::move(current_transaction)); + if (changes_made) { + // if the transaction made any changes we need to keep it around + if (transaction.commit_id != 0) { + // the transaction was committed, add it to the list of recently + // committed transactions + recently_committed_transactions.push_back(std::move(current_transaction)); + } else { + // the transaction was aborted, but we might still need its information + // add it to the set of transactions awaiting GC + current_transaction->highest_active_query = current_query; + old_transactions.push_back(std::move(current_transaction)); + } } // remove the transaction from the set of currently active transactions active_transactions.erase(active_transactions.begin() + t_index); diff --git a/src/duckdb/third_party/parquet/parquet_types.h b/src/duckdb/third_party/parquet/parquet_types.h index e19a15cc..3037b1ec 100644 --- a/src/duckdb/third_party/parquet/parquet_types.h +++ b/src/duckdb/third_party/parquet/parquet_types.h @@ -92,7 +92,8 @@ struct Encoding { DELTA_BINARY_PACKED = 5, DELTA_LENGTH_BYTE_ARRAY = 6, DELTA_BYTE_ARRAY = 7, - RLE_DICTIONARY = 8 + RLE_DICTIONARY = 8, + BYTE_STREAM_SPLIT = 9, }; }; diff --git a/src/duckdb/ub_src_optimizer_pushdown.cpp b/src/duckdb/ub_src_optimizer_pushdown.cpp index ce9443dc..ce710b45 100644 --- a/src/duckdb/ub_src_optimizer_pushdown.cpp +++ b/src/duckdb/ub_src_optimizer_pushdown.cpp @@ -2,6 +2,8 @@ #include "src/optimizer/pushdown/pushdown_cross_product.cpp" +#include "src/optimizer/pushdown/pushdown_distinct.cpp" + #include "src/optimizer/pushdown/pushdown_filter.cpp" #include "src/optimizer/pushdown/pushdown_get.cpp"