diff --git a/src/duckdb/extension/icu/icu-timebucket.cpp b/src/duckdb/extension/icu/icu-timebucket.cpp index 62bdbc74..1d928704 100644 --- a/src/duckdb/extension/icu/icu-timebucket.cpp +++ b/src/duckdb/extension/icu/icu-timebucket.cpp @@ -76,24 +76,21 @@ struct ICUTimeBucket : public ICUDateFunc { static inline timestamp_t WidthConvertibleToDaysCommon(int32_t bucket_width_days, const timestamp_t ts, const timestamp_t origin, icu::Calendar *calendar) { - const auto trunc_days = TruncationFactory(DatePartSpecifier::DAY); const auto sub_days = SubtractFactory(DatePartSpecifier::DAY); - uint64_t tmp_micros = SetTime(calendar, ts); - trunc_days(calendar, tmp_micros); - timestamp_t truncated_ts = GetTimeUnsafe(calendar, tmp_micros); - - int64_t ts_days = sub_days(calendar, origin, truncated_ts); + int64_t ts_days = sub_days(calendar, origin, ts); int64_t result_days = (ts_days / bucket_width_days) * bucket_width_days; if (result_days < NumericLimits::Minimum() || result_days > NumericLimits::Maximum()) { throw OutOfRangeException("Timestamp out of range"); } - if (ts_days < 0 && ts_days % bucket_width_days != 0) { - result_days = - SubtractOperatorOverflowCheck::Operation(result_days, bucket_width_days); + timestamp_t bucket = Add(calendar, origin, interval_t {0, static_cast(result_days), 0}); + if (ts < bucket) { + D_ASSERT(ts < origin); + bucket = Add(calendar, bucket, interval_t {0, -bucket_width_days, 0}); + D_ASSERT(ts > bucket); } - return Add(calendar, origin, interval_t {0, static_cast(result_days), 0}); + return bucket; } static inline timestamp_t WidthConvertibleToMonthsCommon(int32_t bucket_width_months, const timestamp_t ts, diff --git a/src/duckdb/extension/icu/icu-timezone.cpp b/src/duckdb/extension/icu/icu-timezone.cpp index 5e500442..7a47baec 100644 --- a/src/duckdb/extension/icu/icu-timezone.cpp +++ b/src/duckdb/extension/icu/icu-timezone.cpp @@ -81,6 +81,9 @@ static void ICUTimeZoneFunction(ClientContext &context, TableFunctionInput &data break; } + // What PG reports is the total offset for today, + // which is the ICU total offset (i.e., "raw") plus the DST offset. + raw_offset_ms += dst_offset_ms; output.SetValue(2, index, Value::INTERVAL(Interval::FromMicro(raw_offset_ms * Interval::MICROS_PER_MSEC))); output.SetValue(3, index, Value(dst_offset_ms != 0)); ++index; diff --git a/src/duckdb/extension/json/buffered_json_reader.cpp b/src/duckdb/extension/json/buffered_json_reader.cpp index 2dee8df5..8f50e531 100644 --- a/src/duckdb/extension/json/buffered_json_reader.cpp +++ b/src/duckdb/extension/json/buffered_json_reader.cpp @@ -23,7 +23,7 @@ bool JSONFileHandle::IsOpen() const { } void JSONFileHandle::Close() { - if (IsOpen() && file_handle->OnDiskFile()) { + if (IsOpen() && !file_handle->IsPipe()) { file_handle->Close(); file_handle = nullptr; } @@ -72,30 +72,23 @@ void JSONFileHandle::ReadAtPosition(char *pointer, idx_t size, idx_t position, b D_ASSERT(size != 0); if (plain_file_source) { file_handle->Read(pointer, size, position); - actual_reads++; - - return; - } - - if (sample_run) { // Cache the buffer + } else if (sample_run) { // Cache the buffer file_handle->Read(pointer, size, position); - actual_reads++; cached_buffers.emplace_back(allocator.Allocate(size)); memcpy(cached_buffers.back().get(), pointer, size); cached_size += size; + } else { + if (!cached_buffers.empty() || position < cached_size) { + ReadFromCache(pointer, size, position); + } - return; - } - - if (!cached_buffers.empty() || position < cached_size) { - ReadFromCache(pointer, size, position); - actual_reads++; + if (size != 0) { + file_handle->Read(pointer, size, position); + } } - - if (size != 0) { - file_handle->Read(pointer, size, position); - actual_reads++; + if (++actual_reads > requested_reads) { + throw InternalException("JSONFileHandle performed more actual reads than requested reads"); } } diff --git a/src/duckdb/extension/json/json_scan.cpp b/src/duckdb/extension/json/json_scan.cpp index 571fba93..1a2a0c83 100644 --- a/src/duckdb/extension/json/json_scan.cpp +++ b/src/duckdb/extension/json/json_scan.cpp @@ -214,17 +214,22 @@ unique_ptr JSONGlobalTableFunctionState::Init(ClientCo idx_t JSONGlobalTableFunctionState::MaxThreads() const { auto &bind_data = state.bind_data; - if (bind_data.options.format == JSONFormat::NEWLINE_DELIMITED) { - return state.system_threads; - } if (!state.json_readers.empty() && state.json_readers[0]->HasFileHandle()) { + // We opened and auto-detected a file, so we can get a better estimate auto &reader = *state.json_readers[0]; - if (reader.GetFormat() == JSONFormat::NEWLINE_DELIMITED) { // Auto-detected NDJSON - return state.system_threads; + if (bind_data.options.format == JSONFormat::NEWLINE_DELIMITED || + reader.GetFormat() == JSONFormat::NEWLINE_DELIMITED) { + return MaxValue(state.json_readers[0]->GetFileHandle().FileSize() / bind_data.maximum_object_size, + 1); } } + if (bind_data.options.format == JSONFormat::NEWLINE_DELIMITED) { + // We haven't opened any files, so this is our best bet + return state.system_threads; + } + // One reader per file return bind_data.files.size(); } diff --git a/src/duckdb/extension/parquet/parquet_extension.cpp b/src/duckdb/extension/parquet/parquet_extension.cpp index e897645e..d61ddafa 100644 --- a/src/duckdb/extension/parquet/parquet_extension.cpp +++ b/src/duckdb/extension/parquet/parquet_extension.cpp @@ -740,8 +740,8 @@ static void GetFieldIDs(const Value &field_ids_value, ChildFieldIDs &field_ids, } } -unique_ptr ParquetWriteBind(ClientContext &context, CopyInfo &info, vector &names, - vector &sql_types) { +unique_ptr ParquetWriteBind(ClientContext &context, const CopyInfo &info, const vector &names, + const vector &sql_types) { D_ASSERT(names.size() == sql_types.size()); bool row_group_size_bytes_set = false; auto bind_data = make_uniq(); diff --git a/src/duckdb/src/catalog/catalog_entry/view_catalog_entry.cpp b/src/duckdb/src/catalog/catalog_entry/view_catalog_entry.cpp index 1f41f740..9c3a12ca 100644 --- a/src/duckdb/src/catalog/catalog_entry/view_catalog_entry.cpp +++ b/src/duckdb/src/catalog/catalog_entry/view_catalog_entry.cpp @@ -32,6 +32,7 @@ unique_ptr ViewCatalogEntry::GetInfo() const { result->query = unique_ptr_cast(query->Copy()); result->aliases = aliases; result->types = types; + result->temporary = temporary; return std::move(result); } @@ -58,23 +59,16 @@ string ViewCatalogEntry::ToSQL() const { //! Return empty sql with view name so pragma view_tables don't complain return sql; } - return sql + "\n;"; + auto info = GetInfo(); + auto result = info->ToString(); + return result + ";\n"; } unique_ptr ViewCatalogEntry::Copy(ClientContext &context) const { D_ASSERT(!internal); - CreateViewInfo create_info(schema, name); - create_info.query = unique_ptr_cast(query->Copy()); - for (idx_t i = 0; i < aliases.size(); i++) { - create_info.aliases.push_back(aliases[i]); - } - for (idx_t i = 0; i < types.size(); i++) { - create_info.types.push_back(types[i]); - } - create_info.temporary = temporary; - create_info.sql = sql; + auto create_info = GetInfo(); - return make_uniq(catalog, schema, create_info); + return make_uniq(catalog, schema, create_info->Cast()); } } // namespace duckdb diff --git a/src/duckdb/src/catalog/catalog_set.cpp b/src/duckdb/src/catalog/catalog_set.cpp index 8440786d..b8986da6 100644 --- a/src/duckdb/src/catalog/catalog_set.cpp +++ b/src/duckdb/src/catalog/catalog_set.cpp @@ -199,6 +199,8 @@ bool CatalogSet::AlterOwnership(CatalogTransaction transaction, ChangeOwnershipI bool CatalogSet::AlterEntry(CatalogTransaction transaction, const string &name, AlterInfo &alter_info) { // lock the catalog for writing lock_guard write_lock(catalog.GetWriteLock()); + // lock this catalog set to disallow reading + lock_guard read_lock(catalog_lock); // first check if the entry exists in the unordered set EntryIndex entry_index; @@ -210,9 +212,6 @@ bool CatalogSet::AlterEntry(CatalogTransaction transaction, const string &name, throw CatalogException("Cannot alter entry \"%s\" because it is an internal system entry", entry->name); } - // lock this catalog set to disallow reading - lock_guard read_lock(catalog_lock); - // create a new entry and replace the currently stored one // set the timestamp to the timestamp of the current transaction // and point it to the updated table node @@ -316,6 +315,7 @@ void CatalogSet::DropEntryInternal(CatalogTransaction transaction, EntryIndex en bool CatalogSet::DropEntry(CatalogTransaction transaction, const string &name, bool cascade, bool allow_drop_internal) { // lock the catalog for writing lock_guard write_lock(catalog.GetWriteLock()); + lock_guard read_lock(catalog_lock); // we can only delete an entry that exists EntryIndex entry_index; auto entry = GetEntryInternal(transaction, name, &entry_index); @@ -326,7 +326,6 @@ bool CatalogSet::DropEntry(CatalogTransaction transaction, const string &name, b throw CatalogException("Cannot drop entry \"%s\" because it is an internal system entry", entry->name); } - lock_guard read_lock(catalog_lock); DropEntryInternal(transaction, std::move(entry_index), *entry, cascade); return true; } diff --git a/src/duckdb/src/common/arrow/appender/union_data.cpp b/src/duckdb/src/common/arrow/appender/union_data.cpp index cfe54f89..3adb8d05 100644 --- a/src/duckdb/src/common/arrow/appender/union_data.cpp +++ b/src/duckdb/src/common/arrow/appender/union_data.cpp @@ -24,7 +24,7 @@ void ArrowUnionData::Append(ArrowAppendData &append_data, Vector &input, idx_t f duckdb::vector child_vectors; for (const auto &child : UnionType::CopyMemberTypes(input.GetType())) { - child_vectors.emplace_back(child.second); + child_vectors.emplace_back(child.second, size); } for (idx_t input_idx = from; input_idx < to; input_idx++) { diff --git a/src/duckdb/src/common/arrow/arrow_appender.cpp b/src/duckdb/src/common/arrow/arrow_appender.cpp index 10d1e39e..bbca7775 100644 --- a/src/duckdb/src/common/arrow/arrow_appender.cpp +++ b/src/duckdb/src/common/arrow/arrow_appender.cpp @@ -193,26 +193,26 @@ static void InitializeFunctionPointers(ArrowAppendData &append_data, const Logic if (append_data.options.arrow_offset_size == ArrowOffsetSize::LARGE) { InitializeAppenderForType>(append_data); } else { - InitializeAppenderForType>(append_data); + InitializeAppenderForType>(append_data); } break; case LogicalTypeId::UUID: if (append_data.options.arrow_offset_size == ArrowOffsetSize::LARGE) { InitializeAppenderForType>(append_data); } else { - InitializeAppenderForType>(append_data); + InitializeAppenderForType>(append_data); } break; case LogicalTypeId::ENUM: switch (type.InternalType()) { case PhysicalType::UINT8: - InitializeAppenderForType>(append_data); + InitializeAppenderForType>(append_data); break; case PhysicalType::UINT16: - InitializeAppenderForType>(append_data); + InitializeAppenderForType>(append_data); break; case PhysicalType::UINT32: - InitializeAppenderForType>(append_data); + InitializeAppenderForType>(append_data); break; default: throw InternalException("Unsupported internal enum type"); @@ -227,11 +227,20 @@ static void InitializeFunctionPointers(ArrowAppendData &append_data, const Logic case LogicalTypeId::STRUCT: InitializeAppenderForType(append_data); break; - case LogicalTypeId::LIST: - InitializeAppenderForType(append_data); + case LogicalTypeId::LIST: { + if (append_data.options.arrow_offset_size == ArrowOffsetSize::LARGE) { + InitializeAppenderForType>(append_data); + } else { + InitializeAppenderForType>(append_data); + } break; + } case LogicalTypeId::MAP: - InitializeAppenderForType(append_data); + if (append_data.options.arrow_offset_size == ArrowOffsetSize::LARGE) { + InitializeAppenderForType>(append_data); + } else { + InitializeAppenderForType>(append_data); + } break; default: throw NotImplementedException("Unsupported type in DuckDB -> Arrow Conversion: %s\n", type.ToString()); diff --git a/src/duckdb/src/common/arrow/arrow_converter.cpp b/src/duckdb/src/common/arrow/arrow_converter.cpp index 0ecc46e0..d57bcc47 100644 --- a/src/duckdb/src/common/arrow/arrow_converter.cpp +++ b/src/duckdb/src/common/arrow/arrow_converter.cpp @@ -187,7 +187,11 @@ void SetArrowFormat(DuckDBArrowSchemaHolder &root_holder, ArrowSchema &child, co break; } case LogicalTypeId::LIST: { - child.format = "+l"; + if (options.arrow_offset_size == ArrowOffsetSize::LARGE) { + child.format = "+L"; + } else { + child.format = "+l"; + } child.n_children = 1; root_holder.nested_children.emplace_back(); root_holder.nested_children.back().resize(1); diff --git a/src/duckdb/src/common/enum_util.cpp b/src/duckdb/src/common/enum_util.cpp index 2d397936..64fa2599 100644 --- a/src/duckdb/src/common/enum_util.cpp +++ b/src/duckdb/src/common/enum_util.cpp @@ -64,6 +64,7 @@ #include "duckdb/common/types/timestamp.hpp" #include "duckdb/common/types/vector.hpp" #include "duckdb/common/types/vector_buffer.hpp" +#include "duckdb/core_functions/aggregate/quantile_enum.hpp" #include "duckdb/execution/index/art/art.hpp" #include "duckdb/execution/index/art/node.hpp" #include "duckdb/execution/operator/scan/csv/base_csv_reader.hpp" @@ -4571,6 +4572,44 @@ ProfilerPrintFormat EnumUtil::FromString(const char *value) throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value)); } +template<> +const char* EnumUtil::ToChars(QuantileSerializationType value) { + switch(value) { + case QuantileSerializationType::NON_DECIMAL: + return "NON_DECIMAL"; + case QuantileSerializationType::DECIMAL_DISCRETE: + return "DECIMAL_DISCRETE"; + case QuantileSerializationType::DECIMAL_DISCRETE_LIST: + return "DECIMAL_DISCRETE_LIST"; + case QuantileSerializationType::DECIMAL_CONTINUOUS: + return "DECIMAL_CONTINUOUS"; + case QuantileSerializationType::DECIMAL_CONTINUOUS_LIST: + return "DECIMAL_CONTINUOUS_LIST"; + default: + throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented", value)); + } +} + +template<> +QuantileSerializationType EnumUtil::FromString(const char *value) { + if (StringUtil::Equals(value, "NON_DECIMAL")) { + return QuantileSerializationType::NON_DECIMAL; + } + if (StringUtil::Equals(value, "DECIMAL_DISCRETE")) { + return QuantileSerializationType::DECIMAL_DISCRETE; + } + if (StringUtil::Equals(value, "DECIMAL_DISCRETE_LIST")) { + return QuantileSerializationType::DECIMAL_DISCRETE_LIST; + } + if (StringUtil::Equals(value, "DECIMAL_CONTINUOUS")) { + return QuantileSerializationType::DECIMAL_CONTINUOUS; + } + if (StringUtil::Equals(value, "DECIMAL_CONTINUOUS_LIST")) { + return QuantileSerializationType::DECIMAL_CONTINUOUS_LIST; + } + throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value)); +} + template<> const char* EnumUtil::ToChars(QueryNodeType value) { switch(value) { @@ -5118,6 +5157,29 @@ SinkFinalizeType EnumUtil::FromString(const char *value) { throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value)); } +template<> +const char* EnumUtil::ToChars(SinkNextBatchType value) { + switch(value) { + case SinkNextBatchType::READY: + return "READY"; + case SinkNextBatchType::BLOCKED: + return "BLOCKED"; + default: + throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented", value)); + } +} + +template<> +SinkNextBatchType EnumUtil::FromString(const char *value) { + if (StringUtil::Equals(value, "READY")) { + return SinkNextBatchType::READY; + } + if (StringUtil::Equals(value, "BLOCKED")) { + return SinkNextBatchType::BLOCKED; + } + throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value)); +} + template<> const char* EnumUtil::ToChars(SinkResultType value) { switch(value) { @@ -6010,6 +6072,8 @@ const char* EnumUtil::ToChars(UnionInvalidReason value) { return "VALIDITY_OVERLAP"; case UnionInvalidReason::TAG_MISMATCH: return "TAG_MISMATCH"; + case UnionInvalidReason::NULL_TAG: + return "NULL_TAG"; default: throw NotImplementedException(StringUtil::Format("Enum value: '%d' not implemented", value)); } @@ -6032,6 +6096,9 @@ UnionInvalidReason EnumUtil::FromString(const char *value) { if (StringUtil::Equals(value, "TAG_MISMATCH")) { return UnionInvalidReason::TAG_MISMATCH; } + if (StringUtil::Equals(value, "NULL_TAG")) { + return UnionInvalidReason::NULL_TAG; + } throw NotImplementedException(StringUtil::Format("Enum value: '%s' not implemented", value)); } diff --git a/src/duckdb/src/common/file_system.cpp b/src/duckdb/src/common/file_system.cpp index be51cda9..ed5a4c84 100644 --- a/src/duckdb/src/common/file_system.cpp +++ b/src/duckdb/src/common/file_system.cpp @@ -344,7 +344,7 @@ bool FileSystem::FileExists(const string &filename) { } bool FileSystem::IsPipe(const string &filename) { - throw NotImplementedException("%s: IsPipe is not implemented!", GetName()); + return false; } void FileSystem::RemoveFile(const string &filename) { @@ -500,6 +500,10 @@ bool FileHandle::CanSeek() { return file_system.CanSeek(); } +bool FileHandle::IsPipe() { + return file_system.IsPipe(path); +} + string FileHandle::ReadLine() { string result; char buffer[1]; diff --git a/src/duckdb/src/common/hive_partitioning.cpp b/src/duckdb/src/common/hive_partitioning.cpp index d3b24d76..d7211ff9 100644 --- a/src/duckdb/src/common/hive_partitioning.cpp +++ b/src/duckdb/src/common/hive_partitioning.cpp @@ -64,7 +64,10 @@ static void ConvertKnownColRefToConstants(unique_ptr &expr, // - s3://bucket/var1=value1/bla/bla/var2=value2 // - http(s)://domain(:port)/lala/kasdl/var1=value1/?not-a-var=not-a-value // - folder/folder/folder/../var1=value1/etc/.//var2=value2 -const string HivePartitioning::REGEX_STRING = "[\\/\\\\]([^\\/\\?\\\\]+)=([^\\/\\n\\?\\\\]+)"; +const string &HivePartitioning::RegexString() { + static string REGEX = "[\\/\\\\]([^\\/\\?\\\\]+)=([^\\/\\n\\?\\\\]+)"; + return REGEX; +} std::map HivePartitioning::Parse(const string &filename, duckdb_re2::RE2 ®ex) { std::map result; @@ -79,7 +82,7 @@ std::map HivePartitioning::Parse(const string &filename, duckdb_ } std::map HivePartitioning::Parse(const string &filename) { - duckdb_re2::RE2 regex(REGEX_STRING); + duckdb_re2::RE2 regex(RegexString()); return Parse(filename, regex); } @@ -94,7 +97,7 @@ void HivePartitioning::ApplyFiltersToFileList(ClientContext &context, vector have_preserved_filter(filters.size(), false); vector> pruned_filters; unordered_set filters_applied_to_files; - duckdb_re2::RE2 regex(REGEX_STRING); + duckdb_re2::RE2 regex(RegexString()); auto table_index = get.table_index; if ((!filename_enabled && !hive_enabled) || filters.empty()) { diff --git a/src/duckdb/src/common/multi_file_reader.cpp b/src/duckdb/src/common/multi_file_reader.cpp index c68d950c..9049008e 100644 --- a/src/duckdb/src/common/multi_file_reader.cpp +++ b/src/duckdb/src/common/multi_file_reader.cpp @@ -102,7 +102,9 @@ bool MultiFileReader::ComplexFilterPushdown(ClientContext &context, vector column_map; for (idx_t i = 0; i < get.column_ids.size(); i++) { - column_map.insert({get.names[get.column_ids[i]], i}); + if (!IsRowIdColumnId(get.column_ids[i])) { + column_map.insert({get.names[get.column_ids[i]], i}); + } } auto start_files = files.size(); @@ -432,7 +434,7 @@ void MultiFileReaderOptions::AutoDetectHiveTypesInternal(const string &file, Cli } Value value(part.second); for (auto &candidate : candidates) { - const bool success = value.TryCastAs(context, candidate); + const bool success = value.TryCastAs(context, candidate, true); if (success) { hive_types_schema[name] = candidate; break; diff --git a/src/duckdb/src/common/types.cpp b/src/duckdb/src/common/types.cpp index 043ac3e7..98f6ae2e 100644 --- a/src/duckdb/src/common/types.cpp +++ b/src/duckdb/src/common/types.cpp @@ -373,7 +373,9 @@ string LogicalType::ToString() const { string ret = "UNION("; size_t count = UnionType::GetMemberCount(*this); for (size_t i = 0; i < count; i++) { - ret += UnionType::GetMemberName(*this, i) + " " + UnionType::GetMemberType(*this, i).ToString(); + auto member_name = UnionType::GetMemberName(*this, i); + auto member_type = UnionType::GetMemberType(*this, i).ToString(); + ret += StringUtil::Format("%s %s", SQLIdentifier(member_name), member_type); if (i < count - 1) { ret += ", "; } diff --git a/src/duckdb/src/common/types/list_segment.cpp b/src/duckdb/src/common/types/list_segment.cpp index de350b60..2a14718b 100644 --- a/src/duckdb/src/common/types/list_segment.cpp +++ b/src/duckdb/src/common/types/list_segment.cpp @@ -462,6 +462,10 @@ void SegmentPrimitiveFunction(ListSegmentFunctions &functions) { void GetSegmentDataFunctions(ListSegmentFunctions &functions, const LogicalType &type) { + if (type.id() == LogicalTypeId::UNKNOWN) { + throw ParameterNotResolvedException(); + } + auto physical_type = type.InternalType(); switch (physical_type) { case PhysicalType::BIT: diff --git a/src/duckdb/src/common/types/vector.cpp b/src/duckdb/src/common/types/vector.cpp index 58b9f162..03af4fb9 100644 --- a/src/duckdb/src/common/types/vector.cpp +++ b/src/duckdb/src/common/types/vector.cpp @@ -1131,9 +1131,12 @@ void Vector::VerifyMap(Vector &vector_p, const SelectionVector &sel_p, idx_t cou void Vector::VerifyUnion(Vector &vector_p, const SelectionVector &sel_p, idx_t count) { #ifdef DEBUG + D_ASSERT(vector_p.GetType().id() == LogicalTypeId::UNION); auto valid_check = UnionVector::CheckUnionValidity(vector_p, count, sel_p); - D_ASSERT(valid_check == UnionInvalidReason::VALID); + if (valid_check != UnionInvalidReason::VALID) { + throw InternalException("Union not valid, reason: %s", EnumUtil::ToString(valid_check)); + } #endif // DEBUG } @@ -1250,7 +1253,8 @@ void Vector::Verify(Vector &vector_p, const SelectionVector &sel_p, idx_t count) } if (vector->GetType().id() == LogicalTypeId::UNION) { - VerifyUnion(*vector, *sel, count); + // Pass in raw vector + VerifyUnion(vector_p, sel_p, count); } } @@ -1911,7 +1915,13 @@ void UnionVector::SetToMember(Vector &union_vector, union_tag_t tag, Vector &mem // if the member vector is constant, we can set the union to constant as well union_vector.SetVectorType(VectorType::CONSTANT_VECTOR); ConstantVector::GetData(tag_vector)[0] = tag; - ConstantVector::SetNull(union_vector, ConstantVector::IsNull(member_vector)); + if (keep_tags_for_null) { + ConstantVector::SetNull(union_vector, false); + ConstantVector::SetNull(tag_vector, false); + } else { + ConstantVector::SetNull(union_vector, ConstantVector::IsNull(member_vector)); + ConstantVector::SetNull(tag_vector, ConstantVector::IsNull(member_vector)); + } } else { // otherwise flatten and set to flatvector @@ -1962,53 +1972,75 @@ union_tag_t UnionVector::GetTag(const Vector &vector, idx_t index) { return FlatVector::GetData(tag_vector)[index]; } -UnionInvalidReason UnionVector::CheckUnionValidity(Vector &vector, idx_t count, const SelectionVector &sel) { - D_ASSERT(vector.GetType().id() == LogicalTypeId::UNION); - auto member_count = UnionType::GetMemberCount(vector.GetType()); +//! Raw selection vector passed in (not merged with any other selection vectors) +UnionInvalidReason UnionVector::CheckUnionValidity(Vector &vector_p, idx_t count, const SelectionVector &sel_p) { + D_ASSERT(vector_p.GetType().id() == LogicalTypeId::UNION); + + // Will contain the (possibly) merged selection vector + const SelectionVector *sel = &sel_p; + SelectionVector owned_sel; + Vector *vector = &vector_p; + if (vector->GetVectorType() == VectorType::DICTIONARY_VECTOR) { + // In the case of a dictionary vector, unwrap the Vector, and merge the selection vectors. + auto &child = DictionaryVector::Child(*vector); + D_ASSERT(child.GetVectorType() != VectorType::DICTIONARY_VECTOR); + auto &dict_sel = DictionaryVector::SelVector(*vector); + // merge the selection vectors and verify the child + auto new_buffer = dict_sel.Slice(*sel, count); + owned_sel.Initialize(new_buffer); + sel = &owned_sel; + vector = &child; + } else if (vector->GetVectorType() == VectorType::CONSTANT_VECTOR) { + sel = ConstantVector::ZeroSelectionVector(count, owned_sel); + } + + auto member_count = UnionType::GetMemberCount(vector_p.GetType()); if (member_count == 0) { return UnionInvalidReason::NO_MEMBERS; } - UnifiedVectorFormat union_vdata; - vector.ToUnifiedFormat(count, union_vdata); + UnifiedVectorFormat vector_vdata; + vector_p.ToUnifiedFormat(count, vector_vdata); - UnifiedVectorFormat tags_vdata; - auto &tag_vector = UnionVector::GetTags(vector); - tag_vector.ToUnifiedFormat(count, tags_vdata); + auto &entries = StructVector::GetEntries(vector_p); + duckdb::vector child_vdata(entries.size()); + for (idx_t entry_idx = 0; entry_idx < entries.size(); entry_idx++) { + auto &child = *entries[entry_idx]; + child.ToUnifiedFormat(count, child_vdata[entry_idx]); + } + + auto &tag_vdata = child_vdata[0]; - // check that only one member is valid at a time for (idx_t row_idx = 0; row_idx < count; row_idx++) { - auto union_mapped_row_idx = sel.get_index(row_idx); - if (!union_vdata.validity.RowIsValid(union_mapped_row_idx)) { - continue; - } + auto mapped_idx = sel->get_index(row_idx); - auto tag_mapped_row_idx = tags_vdata.sel->get_index(row_idx); - if (!tags_vdata.validity.RowIsValid(tag_mapped_row_idx)) { + if (!vector_vdata.validity.RowIsValid(mapped_idx)) { continue; } - auto tag = (UnifiedVectorFormat::GetData(tags_vdata))[tag_mapped_row_idx]; + auto tag_idx = tag_vdata.sel->get_index(sel_p.get_index(row_idx)); + if (!tag_vdata.validity.RowIsValid(tag_idx)) { + // we can't have NULL tags! + return UnionInvalidReason::NULL_TAG; + } + auto tag = UnifiedVectorFormat::GetData(tag_vdata)[tag_idx]; if (tag >= member_count) { return UnionInvalidReason::TAG_OUT_OF_RANGE; } bool found_valid = false; - for (idx_t member_idx = 0; member_idx < member_count; member_idx++) { - - UnifiedVectorFormat member_vdata; - auto &member = UnionVector::GetMember(vector, member_idx); - member.ToUnifiedFormat(count, member_vdata); - - auto mapped_row_idx = member_vdata.sel->get_index(row_idx); - if (member_vdata.validity.RowIsValid(mapped_row_idx)) { - if (found_valid) { - return UnionInvalidReason::VALIDITY_OVERLAP; - } - found_valid = true; - if (tag != static_cast(member_idx)) { - return UnionInvalidReason::TAG_MISMATCH; - } + for (idx_t i = 0; i < member_count; i++) { + auto &member_vdata = child_vdata[1 + i]; // skip the tag + idx_t member_idx = member_vdata.sel->get_index(sel_p.get_index(row_idx)); + if (!member_vdata.validity.RowIsValid(member_idx)) { + continue; + } + if (found_valid) { + return UnionInvalidReason::VALIDITY_OVERLAP; + } + found_valid = true; + if (tag != static_cast(i)) { + return UnionInvalidReason::TAG_MISMATCH; } } } diff --git a/src/duckdb/src/core_functions/aggregate/holistic/quantile.cpp b/src/duckdb/src/core_functions/aggregate/holistic/quantile.cpp index 045ae65f..09f9204d 100644 --- a/src/duckdb/src/core_functions/aggregate/holistic/quantile.cpp +++ b/src/duckdb/src/core_functions/aggregate/holistic/quantile.cpp @@ -1,5 +1,6 @@ #include "duckdb/execution/expression_executor.hpp" #include "duckdb/core_functions/aggregate/holistic_functions.hpp" +#include "duckdb/core_functions/aggregate/quantile_enum.hpp" #include "duckdb/planner/expression.hpp" #include "duckdb/common/operator/cast_operators.hpp" #include "duckdb/common/operator/abs.hpp" @@ -442,6 +443,8 @@ inline Value QuantileAbs(const Value &v) { } } +void BindQuantileInner(AggregateFunction &function, const LogicalType &type, QuantileSerializationType quantile_type); + struct QuantileBindData : public FunctionData { QuantileBindData() { } @@ -507,15 +510,59 @@ struct QuantileBindData : public FunctionData { deserializer.ReadProperty(100, "quantiles", raw); deserializer.ReadProperty(101, "order", result->order); deserializer.ReadProperty(102, "desc", result->desc); + QuantileSerializationType deserialization_type; + deserializer.ReadPropertyWithDefault(103, "quantile_type", deserialization_type, + QuantileSerializationType::NON_DECIMAL); + + if (deserialization_type != QuantileSerializationType::NON_DECIMAL) { + LogicalType arg_type; + deserializer.ReadProperty(104, "logical_type", arg_type); + + BindQuantileInner(function, arg_type, deserialization_type); + } + for (const auto &r : raw) { result->quantiles.emplace_back(QuantileValue(r)); } return std::move(result); } - static void SerializeDecimal(Serializer &serializer, const optional_ptr bind_data_p, - const AggregateFunction &function) { - throw NotImplementedException("FIXME: serializing quantiles with decimals is not supported right now"); + static void SerializeDecimalDiscrete(Serializer &serializer, const optional_ptr bind_data_p, + const AggregateFunction &function) { + Serialize(serializer, bind_data_p, function); + + serializer.WritePropertyWithDefault( + 103, "quantile_type", QuantileSerializationType::DECIMAL_DISCRETE, QuantileSerializationType::NON_DECIMAL); + serializer.WriteProperty(104, "logical_type", function.arguments[0]); + } + static void SerializeDecimalDiscreteList(Serializer &serializer, const optional_ptr bind_data_p, + const AggregateFunction &function) { + + Serialize(serializer, bind_data_p, function); + + serializer.WritePropertyWithDefault(103, "quantile_type", + QuantileSerializationType::DECIMAL_DISCRETE_LIST, + QuantileSerializationType::NON_DECIMAL); + serializer.WriteProperty(104, "logical_type", function.arguments[0]); + } + static void SerializeDecimalContinuous(Serializer &serializer, const optional_ptr bind_data_p, + const AggregateFunction &function) { + Serialize(serializer, bind_data_p, function); + + serializer.WritePropertyWithDefault(103, "quantile_type", + QuantileSerializationType::DECIMAL_CONTINUOUS, + QuantileSerializationType::NON_DECIMAL); + serializer.WriteProperty(104, "logical_type", function.arguments[0]); + } + static void SerializeDecimalContinuousList(Serializer &serializer, const optional_ptr bind_data_p, + const AggregateFunction &function) { + + Serialize(serializer, bind_data_p, function); + + serializer.WritePropertyWithDefault( + 103, "quantile_type", QuantileSerializationType::DECIMAL_CONTINUOUS_LIST, + QuantileSerializationType::NON_DECIMAL); + serializer.WriteProperty(104, "logical_type", function.arguments[0]); } vector quantiles; @@ -1232,7 +1279,7 @@ unique_ptr BindMedianDecimal(ClientContext &context, AggregateFunc function = GetDiscreteQuantileAggregateFunction(arguments[0]->return_type); function.name = "median"; - function.serialize = QuantileBindData::SerializeDecimal; + function.serialize = QuantileBindData::SerializeDecimalDiscrete; function.deserialize = QuantileBindData::Deserialize; function.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT; return bind_data; @@ -1283,50 +1330,62 @@ unique_ptr BindQuantile(ClientContext &context, AggregateFunction return make_uniq(quantiles); } +void BindQuantileInner(AggregateFunction &function, const LogicalType &type, QuantileSerializationType quantile_type) { + switch (quantile_type) { + case QuantileSerializationType::DECIMAL_DISCRETE: + function = GetDiscreteQuantileAggregateFunction(type); + function.serialize = QuantileBindData::SerializeDecimalDiscrete; + function.name = "quantile_disc"; + break; + case QuantileSerializationType::DECIMAL_DISCRETE_LIST: + function = GetDiscreteQuantileListAggregateFunction(type); + function.serialize = QuantileBindData::SerializeDecimalDiscreteList; + function.name = "quantile_disc"; + break; + case QuantileSerializationType::DECIMAL_CONTINUOUS: + function = GetContinuousQuantileAggregateFunction(type); + function.serialize = QuantileBindData::SerializeDecimalContinuous; + function.name = "quantile_cont"; + break; + case QuantileSerializationType::DECIMAL_CONTINUOUS_LIST: + function = GetContinuousQuantileListAggregateFunction(type); + function.serialize = QuantileBindData::SerializeDecimalContinuousList; + function.name = "quantile_cont"; + break; + case QuantileSerializationType::NON_DECIMAL: + throw SerializationException("NON_DECIMAL is not a valid quantile_type for BindQuantileInner"); + } + function.deserialize = QuantileBindData::Deserialize; + function.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT; +} + unique_ptr BindDiscreteQuantileDecimal(ClientContext &context, AggregateFunction &function, vector> &arguments) { auto bind_data = BindQuantile(context, function, arguments); - function = GetDiscreteQuantileAggregateFunction(arguments[0]->return_type); - function.name = "quantile_disc"; - function.serialize = QuantileBindData::SerializeDecimal; - function.deserialize = QuantileBindData::Deserialize; - function.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT; + BindQuantileInner(function, arguments[0]->return_type, QuantileSerializationType::DECIMAL_DISCRETE); return bind_data; } unique_ptr BindDiscreteQuantileDecimalList(ClientContext &context, AggregateFunction &function, vector> &arguments) { auto bind_data = BindQuantile(context, function, arguments); - function = GetDiscreteQuantileListAggregateFunction(arguments[0]->return_type); - function.name = "quantile_disc"; - function.serialize = QuantileBindData::SerializeDecimal; - function.deserialize = QuantileBindData::Deserialize; - function.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT; + BindQuantileInner(function, arguments[0]->return_type, QuantileSerializationType::DECIMAL_DISCRETE_LIST); return bind_data; } unique_ptr BindContinuousQuantileDecimal(ClientContext &context, AggregateFunction &function, vector> &arguments) { auto bind_data = BindQuantile(context, function, arguments); - function = GetContinuousQuantileAggregateFunction(arguments[0]->return_type); - function.name = "quantile_cont"; - function.serialize = QuantileBindData::SerializeDecimal; - function.deserialize = QuantileBindData::Deserialize; - function.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT; + BindQuantileInner(function, arguments[0]->return_type, QuantileSerializationType::DECIMAL_CONTINUOUS); return bind_data; } unique_ptr BindContinuousQuantileDecimalList(ClientContext &context, AggregateFunction &function, vector> &arguments) { auto bind_data = BindQuantile(context, function, arguments); - function = GetContinuousQuantileListAggregateFunction(arguments[0]->return_type); - function.name = "quantile_cont"; - function.serialize = QuantileBindData::SerializeDecimal; - function.deserialize = QuantileBindData::Deserialize; - function.order_dependent = AggregateOrderDependent::NOT_ORDER_DEPENDENT; + BindQuantileInner(function, arguments[0]->return_type, QuantileSerializationType::DECIMAL_CONTINUOUS_LIST); return bind_data; } - static bool CanInterpolate(const LogicalType &type) { switch (type.id()) { case LogicalTypeId::INTERVAL: diff --git a/src/duckdb/src/core_functions/function_list.cpp b/src/duckdb/src/core_functions/function_list.cpp index 9e9ef048..299fd59e 100644 --- a/src/duckdb/src/core_functions/function_list.cpp +++ b/src/duckdb/src/core_functions/function_list.cpp @@ -213,8 +213,9 @@ static StaticFunctionDefinition internal_functions[] = { DUCKDB_SCALAR_FUNCTION(ListTransformFun), DUCKDB_SCALAR_FUNCTION(ListUniqueFun), DUCKDB_SCALAR_FUNCTION(ListValueFun), + DUCKDB_AGGREGATE_FUNCTION_SET_ALIAS(ListaggFun), DUCKDB_SCALAR_FUNCTION(LnFun), - DUCKDB_SCALAR_FUNCTION_ALIAS(LogFun), + DUCKDB_SCALAR_FUNCTION_SET(LogFun), DUCKDB_SCALAR_FUNCTION(Log10Fun), DUCKDB_SCALAR_FUNCTION(Log2Fun), DUCKDB_SCALAR_FUNCTION(LpadFun), diff --git a/src/duckdb/src/core_functions/scalar/date/strftime.cpp b/src/duckdb/src/core_functions/scalar/date/strftime.cpp index a764c97e..708ff2c3 100644 --- a/src/duckdb/src/core_functions/scalar/date/strftime.cpp +++ b/src/duckdb/src/core_functions/scalar/date/strftime.cpp @@ -183,7 +183,14 @@ struct StrpTimeFunction { auto &func_expr = state.expr.Cast(); auto &info = func_expr.bind_info->Cast(); - if (args.data[1].GetVectorType() == VectorType::CONSTANT_VECTOR && ConstantVector::IsNull(args.data[1])) { + // There is a bizarre situation where the format column is foldable but not constant + // (i.e., the statistics tell us it has only one value) + // We have to check whether that value is NULL + const auto count = args.size(); + UnifiedVectorFormat format_unified; + args.data[1].ToUnifiedFormat(count, format_unified); + + if (!format_unified.validity.RowIsValid(0)) { result.SetVectorType(VectorType::CONSTANT_VECTOR); ConstantVector::SetNull(result, true); return; diff --git a/src/duckdb/src/core_functions/scalar/math/numeric.cpp b/src/duckdb/src/core_functions/scalar/math/numeric.cpp index 19c841b2..740b9ed4 100644 --- a/src/duckdb/src/core_functions/scalar/math/numeric.cpp +++ b/src/duckdb/src/core_functions/scalar/math/numeric.cpp @@ -816,6 +816,29 @@ ScalarFunction Log10Fun::GetFunction() { ScalarFunction::UnaryFunction); } +//===--------------------------------------------------------------------===// +// log with base +//===--------------------------------------------------------------------===// +struct LogBaseOperator { + template + static inline TR Operation(TA b, TB x) { + auto divisor = Log10Operator::Operation(b); + if (divisor == 0) { + throw OutOfRangeException("divison by zero in based logarithm"); + } + return Log10Operator::Operation(x) / divisor; + } +}; + +ScalarFunctionSet LogFun::GetFunctions() { + ScalarFunctionSet funcs; + funcs.AddFunction(ScalarFunction({LogicalType::DOUBLE}, LogicalType::DOUBLE, + ScalarFunction::UnaryFunction)); + funcs.AddFunction(ScalarFunction({LogicalType::DOUBLE, LogicalType::DOUBLE}, LogicalType::DOUBLE, + ScalarFunction::BinaryFunction)); + return funcs; +} + //===--------------------------------------------------------------------===// // log2 //===--------------------------------------------------------------------===// diff --git a/src/duckdb/src/core_functions/scalar/string/jaccard.cpp b/src/duckdb/src/core_functions/scalar/string/jaccard.cpp index 69024442..e3f081b6 100644 --- a/src/duckdb/src/core_functions/scalar/string/jaccard.cpp +++ b/src/duckdb/src/core_functions/scalar/string/jaccard.cpp @@ -1,48 +1,41 @@ -#include "duckdb/core_functions/scalar/string_functions.hpp" -#include "duckdb/common/vector_operations/vector_operations.hpp" #include "duckdb/common/map.hpp" +#include "duckdb/common/vector_operations/vector_operations.hpp" +#include "duckdb/core_functions/scalar/string_functions.hpp" +#include #include namespace duckdb { -static inline map GetSet(const string_t &str) { - auto map_of_chars = map {}; +namespace { +constexpr size_t MAX_SIZE = std::numeric_limits::max() + 1; +} + +static inline std::bitset GetSet(const string_t &str) { + std::bitset array_set; + idx_t str_len = str.GetSize(); auto s = str.GetData(); for (idx_t pos = 0; pos < str_len; pos++) { - map_of_chars.insert(std::make_pair(s[pos], 1)); + array_set.set(static_cast(s[pos])); } - return map_of_chars; + return array_set; } static double JaccardSimilarity(const string_t &str, const string_t &txt) { if (str.GetSize() < 1 || txt.GetSize() < 1) { throw InvalidInputException("Jaccard Function: An argument too short!"); } - map m_str, m_txt; + std::bitset m_str, m_txt; m_str = GetSet(str); m_txt = GetSet(txt); - if (m_str.size() > m_txt.size()) { - m_str.swap(m_txt); - } - - for (auto const &achar : m_str) { - ++m_txt[achar.first]; - } - // m_txt.size is now size of union. - - idx_t size_intersect = 0; - for (const auto &apair : m_txt) { - if (apair.second > 1) { - size_intersect++; - } - } + idx_t size_intersect = (m_str & m_txt).count(); + idx_t size_union = (m_str | m_txt).count(); - return (double)size_intersect / (double)m_txt.size(); + return static_cast(size_intersect) / static_cast(size_union); } static double JaccardScalarFunction(Vector &result, const string_t str, string_t tgt) { diff --git a/src/duckdb/src/execution/aggregate_hashtable.cpp b/src/duckdb/src/execution/aggregate_hashtable.cpp index be5aa968..d15583c2 100644 --- a/src/duckdb/src/execution/aggregate_hashtable.cpp +++ b/src/duckdb/src/execution/aggregate_hashtable.cpp @@ -328,7 +328,7 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V // Compute the entry in the table based on the hash using a modulo, // and precompute the hash salts for faster comparison below auto ht_offsets = FlatVector::GetData(state.ht_offsets); - auto hash_salts = FlatVector::GetData(state.hash_salts); + const auto hash_salts = FlatVector::GetData(state.hash_salts); for (idx_t r = 0; r < groups.size(); r++) { const auto &hash = hashes[r]; ht_offsets[r] = ApplyBitMask(hash); @@ -369,20 +369,29 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V for (idx_t i = 0; i < remaining_entries; i++) { const auto index = sel_vector->get_index(i); const auto &salt = hash_salts[index]; - auto &entry = entries[ht_offsets[index]]; - if (entry.IsOccupied()) { // Cell is occupied: Compare salts - if (entry.GetSalt() == salt) { - state.group_compare_vector.set_index(need_compare_count++, index); - } else { - state.no_match_vector.set_index(no_match_count++, index); + auto &ht_offset = ht_offsets[index]; + while (true) { + auto &entry = entries[ht_offset]; + if (entry.IsOccupied()) { // Cell is occupied: Compare salts + if (entry.GetSalt() == salt) { + // Same salt, compare group keys + state.group_compare_vector.set_index(need_compare_count++, index); + break; + } else { + // Different salts, move to next entry (linear probing) + if (++ht_offset >= capacity) { + ht_offset = 0; + } + continue; + } + } else { // Cell is unoccupied, let's claim it + // Set salt (also marks as occupied) + entry.SetSalt(salt); + // Update selection lists for outer loops + state.empty_vector.set_index(new_entry_count++, index); + new_groups_out.set_index(new_group_count++, index); + break; } - } else { // Cell is unoccupied - // Set salt (also marks as occupied) - entry.SetSalt(salt); - - // Update selection lists for outer loops - state.empty_vector.set_index(new_entry_count++, index); - new_groups_out.set_index(new_group_count++, index); } } @@ -422,10 +431,10 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V // Linear probing: each of the entries that do not match move to the next entry in the HT for (idx_t i = 0; i < no_match_count; i++) { - idx_t index = state.no_match_vector.get_index(i); - ht_offsets[index]++; - if (ht_offsets[index] >= capacity) { - ht_offsets[index] = 0; + const auto index = state.no_match_vector.get_index(i); + auto &ht_offset = ht_offsets[index]; + if (++ht_offset >= capacity) { + ht_offset = 0; } } sel_vector = &state.no_match_vector; diff --git a/src/duckdb/src/execution/index/art/art_key.cpp b/src/duckdb/src/execution/index/art/art_key.cpp index 9cc26be2..5f50b4e1 100644 --- a/src/duckdb/src/execution/index/art/art_key.cpp +++ b/src/duckdb/src/execution/index/art/art_key.cpp @@ -20,10 +20,10 @@ ARTKey ARTKey::CreateARTKey(ArenaAllocator &allocator, const LogicalType &type, // FIXME: rethink this if (type == LogicalType::BLOB || type == LogicalType::VARCHAR) { - // indexes cannot contain BLOBs (or BLOBs cast to VARCHARs) that contain null-terminated bytes + // indexes cannot contain BLOBs (or BLOBs cast to VARCHARs) that contain zero bytes for (uint32_t i = 0; i < len - 1; i++) { if (data[i] == '\0') { - throw NotImplementedException("Indexes cannot contain BLOBs that contain null-terminated bytes."); + throw NotImplementedException("ART indexes cannot contain BLOBs with zero bytes."); } } } @@ -45,10 +45,10 @@ void ARTKey::CreateARTKey(ArenaAllocator &allocator, const LogicalType &type, AR // FIXME: rethink this if (type == LogicalType::BLOB || type == LogicalType::VARCHAR) { - // indexes cannot contain BLOBs (or BLOBs cast to VARCHARs) that contain null-terminated bytes + // indexes cannot contain BLOBs (or BLOBs cast to VARCHARs) that contain zero bytes for (uint32_t i = 0; i < key.len - 1; i++) { if (key.data[i] == '\0') { - throw NotImplementedException("Indexes cannot contain BLOBs that contain null-terminated bytes."); + throw NotImplementedException("ART indexes cannot contain BLOBs with zero bytes."); } } } diff --git a/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp b/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp index dfbe9c0b..8561c5fb 100644 --- a/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +++ b/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp @@ -384,10 +384,10 @@ SinkResultType PhysicalHashAggregate::Sink(ExecutionContext &context, DataChunk // For every grouping set there is one radix_table for (idx_t i = 0; i < groupings.size(); i++) { - auto &grouping_local_state = global_state.grouping_states[i]; - auto &grouping_global_state = local_state.grouping_states[i]; + auto &grouping_global_state = global_state.grouping_states[i]; + auto &grouping_local_state = local_state.grouping_states[i]; InterruptState interrupt_state; - OperatorSinkInput sink_input {*grouping_local_state.table_state, *grouping_global_state.table_state, + OperatorSinkInput sink_input {*grouping_global_state.table_state, *grouping_local_state.table_state, interrupt_state}; auto &grouping = groupings[i]; diff --git a/src/duckdb/src/execution/operator/persistent/physical_batch_copy_to_file.cpp b/src/duckdb/src/execution/operator/persistent/physical_batch_copy_to_file.cpp index f951ddc4..52857768 100644 --- a/src/duckdb/src/execution/operator/persistent/physical_batch_copy_to_file.cpp +++ b/src/duckdb/src/execution/operator/persistent/physical_batch_copy_to_file.cpp @@ -175,8 +175,10 @@ void PhysicalBatchCopyToFile::FlushBatchData(ClientContext &context, GlobalSinkS //===--------------------------------------------------------------------===// // Next Batch //===--------------------------------------------------------------------===// -void PhysicalBatchCopyToFile::NextBatch(ExecutionContext &context, GlobalSinkState &gstate_p, - LocalSinkState &lstate) const { +SinkNextBatchType PhysicalBatchCopyToFile::NextBatch(ExecutionContext &context, + OperatorSinkNextBatchInput &input) const { + auto &lstate = input.local_state; + auto &gstate_p = input.global_state; auto &state = lstate.Cast(); if (state.collection && state.collection->Count() > 0) { // we finished processing this batch @@ -188,6 +190,7 @@ void PhysicalBatchCopyToFile::NextBatch(ExecutionContext &context, GlobalSinkSta state.batch_index = lstate.partition_info.batch_index.GetIndex(); state.InitializeCollection(context.client, *this); + return SinkNextBatchType::READY; } unique_ptr PhysicalBatchCopyToFile::GetLocalSinkState(ExecutionContext &context) const { diff --git a/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp b/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp index d807e0e9..d0da7d62 100644 --- a/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp +++ b/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp @@ -298,9 +298,9 @@ unique_ptr PhysicalBatchInsert::GetLocalSinkState(ExecutionConte return make_uniq(context.client, insert_types, bound_defaults); } -void PhysicalBatchInsert::NextBatch(ExecutionContext &context, GlobalSinkState &state, LocalSinkState &lstate_p) const { - auto &gstate = state.Cast(); - auto &lstate = lstate_p.Cast(); +SinkNextBatchType PhysicalBatchInsert::NextBatch(ExecutionContext &context, OperatorSinkNextBatchInput &input) const { + auto &gstate = input.global_state.Cast(); + auto &lstate = input.local_state.Cast(); auto &table = gstate.table; auto batch_index = lstate.partition_info.batch_index.GetIndex(); @@ -316,6 +316,7 @@ void PhysicalBatchInsert::NextBatch(ExecutionContext &context, GlobalSinkState & lstate.CreateNewCollection(table, insert_types); } lstate.current_index = batch_index; + return SinkNextBatchType::READY; } SinkResultType PhysicalBatchInsert::Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input) const { diff --git a/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp b/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp index c192a4d9..abe2ed28 100644 --- a/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp +++ b/src/duckdb/src/execution/operator/persistent/physical_copy_to_file.cpp @@ -19,6 +19,7 @@ class CopyToFunctionGlobalState : public GlobalSinkState { idx_t rows_copied; idx_t last_file_offset; unique_ptr global_state; + idx_t created_directories = 0; //! shared state for HivePartitionedColumnData shared_ptr partition_state; @@ -82,8 +83,8 @@ static void CreateDir(const string &dir_path, FileSystem &fs) { } } -static string CreateDirRecursive(const vector &cols, const vector &names, const vector &values, - string path, FileSystem &fs) { +static void CreateDirectories(const vector &cols, const vector &names, const vector &values, + string path, FileSystem &fs) { CreateDir(path, fs); for (idx_t i = 0; i < cols.size(); i++) { @@ -93,7 +94,16 @@ static string CreateDirRecursive(const vector &cols, const vector path = fs.JoinPath(path, p_dir); CreateDir(path, fs); } +} +static string GetDirectory(const vector &cols, const vector &names, const vector &values, + string path, FileSystem &fs) { + for (idx_t i = 0; i < cols.size(); i++) { + const auto &partition_col_name = names[cols[i]]; + const auto &partition_value = values[i]; + string p_dir = partition_col_name + "=" + partition_value.ToString(); + path = fs.JoinPath(path, p_dir); + } return path; } @@ -109,10 +119,21 @@ SinkCombineResultType PhysicalCopyToFile::Combine(ExecutionContext &context, Ope string trimmed_path = file_path; StringUtil::RTrim(trimmed_path, fs.PathSeparator(trimmed_path)); + { + // create directories + lock_guard global_lock(g.lock); + lock_guard global_lock_on_partition_state(g.partition_state->lock); + const auto &global_partitions = g.partition_state->partitions; + // global_partitions have partitions added only at the back, so it's fine to only traverse the last part + + for (idx_t i = g.created_directories; i < global_partitions.size(); i++) { + CreateDirectories(partition_columns, names, global_partitions[i]->first.values, trimmed_path, fs); + } + g.created_directories = global_partitions.size(); + } for (idx_t i = 0; i < partitions.size(); i++) { - string hive_path = - CreateDirRecursive(partition_columns, names, partition_key_map[i]->values, trimmed_path, fs); + string hive_path = GetDirectory(partition_columns, names, partition_key_map[i]->values, trimmed_path, fs); string full_path(filename_pattern.CreateFilename(fs, hive_path, function.extension, l.writer_offset)); if (fs.FileExists(full_path) && !overwrite_or_ignore) { throw IOException("failed to create " + full_path + diff --git a/src/duckdb/src/execution/operator/persistent/physical_fixed_batch_copy.cpp b/src/duckdb/src/execution/operator/persistent/physical_fixed_batch_copy.cpp index 27597c45..983ce31d 100644 --- a/src/duckdb/src/execution/operator/persistent/physical_fixed_batch_copy.cpp +++ b/src/duckdb/src/execution/operator/persistent/physical_fixed_batch_copy.cpp @@ -450,8 +450,10 @@ void PhysicalFixedBatchCopy::ExecuteTasks(ClientContext &context, GlobalSinkStat //===--------------------------------------------------------------------===// // Next Batch //===--------------------------------------------------------------------===// -void PhysicalFixedBatchCopy::NextBatch(ExecutionContext &context, GlobalSinkState &gstate_p, - LocalSinkState &lstate) const { +SinkNextBatchType PhysicalFixedBatchCopy::NextBatch(ExecutionContext &context, + OperatorSinkNextBatchInput &input) const { + auto &lstate = input.local_state; + auto &gstate_p = input.global_state; auto &state = lstate.Cast(); if (state.collection && state.collection->Count() > 0) { // we finished processing this batch @@ -468,6 +470,7 @@ void PhysicalFixedBatchCopy::NextBatch(ExecutionContext &context, GlobalSinkStat state.batch_index = lstate.partition_info.batch_index.GetIndex(); state.InitializeCollection(context.client, *this); + return SinkNextBatchType::READY; } unique_ptr PhysicalFixedBatchCopy::GetLocalSinkState(ExecutionContext &context) const { diff --git a/src/duckdb/src/execution/operator/schema/physical_drop.cpp b/src/duckdb/src/execution/operator/schema/physical_drop.cpp index c15eb7c1..36cc3976 100644 --- a/src/duckdb/src/execution/operator/schema/physical_drop.cpp +++ b/src/duckdb/src/execution/operator/schema/physical_drop.cpp @@ -24,7 +24,6 @@ SourceResultType PhysicalDrop::GetData(ExecutionContext &context, DataChunk &chu case CatalogType::SCHEMA_ENTRY: { auto &catalog = Catalog::GetCatalog(context.client, info->catalog); catalog.DropEntry(context.client, *info); - auto qualified_name = QualifiedName::Parse(info->name); // Check if the dropped schema was set as the current schema auto &client_data = ClientData::Get(context.client); diff --git a/src/duckdb/src/execution/physical_operator.cpp b/src/duckdb/src/execution/physical_operator.cpp index fae06215..accd4284 100644 --- a/src/duckdb/src/execution/physical_operator.cpp +++ b/src/duckdb/src/execution/physical_operator.cpp @@ -106,7 +106,8 @@ SinkFinalizeType PhysicalOperator::Finalize(Pipeline &pipeline, Event &event, Cl return SinkFinalizeType::READY; } -void PhysicalOperator::NextBatch(ExecutionContext &context, GlobalSinkState &state, LocalSinkState &lstate_p) const { +SinkNextBatchType PhysicalOperator::NextBatch(ExecutionContext &context, OperatorSinkNextBatchInput &input) const { + return SinkNextBatchType::READY; } unique_ptr PhysicalOperator::GetLocalSinkState(ExecutionContext &context) const { diff --git a/src/duckdb/src/execution/radix_partitioned_hashtable.cpp b/src/duckdb/src/execution/radix_partitioned_hashtable.cpp index 49d2257c..9a106960 100644 --- a/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +++ b/src/duckdb/src/execution/radix_partitioned_hashtable.cpp @@ -346,6 +346,11 @@ bool MaybeRepartition(ClientContext &context, RadixHTGlobalSinkState &gstate, Ra } } + // We can go external when there is only one active thread, but we shouldn't repartition here + if (gstate.active_threads < 2) { + return false; + } + const auto partition_count = partitioned_data->PartitionCount(); const auto current_radix_bits = RadixPartitioning::RadixBits(partition_count); D_ASSERT(current_radix_bits <= config.GetRadixBits()); diff --git a/src/duckdb/src/execution/window_executor.cpp b/src/duckdb/src/execution/window_executor.cpp index fb094e8d..91d39183 100644 --- a/src/duckdb/src/execution/window_executor.cpp +++ b/src/duckdb/src/execution/window_executor.cpp @@ -293,6 +293,18 @@ struct WindowBoundariesState { } } + static inline bool ExpressionNeedsPeer(const ExpressionType &type) { + switch (type) { + case ExpressionType::WINDOW_RANK: + case ExpressionType::WINDOW_RANK_DENSE: + case ExpressionType::WINDOW_PERCENT_RANK: + case ExpressionType::WINDOW_CUME_DIST: + return true; + default: + return false; + } + } + WindowBoundariesState(BoundWindowExpression &wexpr, const idx_t input_size); void Update(const idx_t row_idx, const WindowInputColumn &range_collection, const idx_t chunk_idx, @@ -532,7 +544,7 @@ WindowBoundariesState::WindowBoundariesState(BoundWindowExpression &wexpr, const partition_count(wexpr.partitions.size()), order_count(wexpr.orders.size()), range_sense(wexpr.orders.empty() ? OrderType::INVALID : wexpr.orders[0].type), has_preceding_range(HasPrecedingRange(wexpr)), has_following_range(HasFollowingRange(wexpr)), - needs_peer(BoundaryNeedsPeer(wexpr.end) || wexpr.type == ExpressionType::WINDOW_CUME_DIST) { + needs_peer(BoundaryNeedsPeer(wexpr.end) || ExpressionNeedsPeer(wexpr.type)) { } void WindowBoundariesState::Bounds(DataChunk &bounds, idx_t row_idx, const WindowInputColumn &range, const idx_t count, diff --git a/src/duckdb/src/function/cast/union/from_struct.cpp b/src/duckdb/src/function/cast/union/from_struct.cpp index 7e3a0ae6..559803b8 100644 --- a/src/duckdb/src/function/cast/union/from_struct.cpp +++ b/src/duckdb/src/function/cast/union/from_struct.cpp @@ -59,6 +59,28 @@ bool StructToUnionCast::Cast(Vector &source, Vector &result, idx_t count, CastPa D_ASSERT(converted); } + if (source.GetVectorType() == VectorType::CONSTANT_VECTOR) { + result.SetVectorType(VectorType::CONSTANT_VECTOR); + ConstantVector::SetNull(result, ConstantVector::IsNull(source)); + + // if the tag is NULL, the union should be NULL + auto &tag_vec = *target_children[0]; + ConstantVector::SetNull(result, ConstantVector::IsNull(tag_vec)); + } else { + source.Flatten(count); + FlatVector::Validity(result) = FlatVector::Validity(source); + + // if the tag is NULL, the union should be NULL + auto &tag_vec = *target_children[0]; + UnifiedVectorFormat tag_data; + tag_vec.ToUnifiedFormat(count, tag_data); + for (idx_t i = 0; i < count; i++) { + if (!tag_data.validity.RowIsValid(tag_data.sel->get_index(i))) { + FlatVector::SetNull(result, i, true); + } + } + } + auto check_tags = UnionVector::CheckUnionValidity(result, count); switch (check_tags) { case UnionInvalidReason::TAG_OUT_OF_RANGE: @@ -68,19 +90,14 @@ bool StructToUnionCast::Cast(Vector &source, Vector &result, idx_t count, CastPa case UnionInvalidReason::TAG_MISMATCH: throw ConversionException( "One or more rows in the produced UNION have tags that don't point to the valid member"); + case UnionInvalidReason::NULL_TAG: + throw ConversionException("One or more rows in the produced UNION have a NULL tag"); case UnionInvalidReason::VALID: break; default: throw InternalException("Struct to union cast failed for unknown reason"); } - if (source.GetVectorType() == VectorType::CONSTANT_VECTOR) { - result.SetVectorType(VectorType::CONSTANT_VECTOR); - ConstantVector::SetNull(result, ConstantVector::IsNull(source)); - } else { - source.Flatten(count); - FlatVector::Validity(result) = FlatVector::Validity(source); - } result.Verify(count); return true; } diff --git a/src/duckdb/src/function/cast/vector_cast_helpers.cpp b/src/duckdb/src/function/cast/vector_cast_helpers.cpp index 876f3841..98417a88 100644 --- a/src/duckdb/src/function/cast/vector_cast_helpers.cpp +++ b/src/duckdb/src/function/cast/vector_cast_helpers.cpp @@ -66,7 +66,7 @@ static bool SkipToClose(idx_t &idx, const char *buf, idx_t &len, idx_t &lvl, cha static idx_t StringTrim(const char *buf, idx_t &start_pos, idx_t pos) { idx_t trailing_whitespace = 0; - while (StringUtil::CharacterIsSpace(buf[pos - trailing_whitespace - 1])) { + while (pos > start_pos && StringUtil::CharacterIsSpace(buf[pos - trailing_whitespace - 1])) { trailing_whitespace++; } if ((buf[start_pos] == '"' && buf[pos - trailing_whitespace - 1] == '"') || diff --git a/src/duckdb/src/function/function_set.cpp b/src/duckdb/src/function/function_set.cpp index 41d54c88..dbbfdfa8 100644 --- a/src/duckdb/src/function/function_set.cpp +++ b/src/duckdb/src/function/function_set.cpp @@ -49,7 +49,7 @@ AggregateFunction AggregateFunctionSet::GetFunctionByArguments(ClientContext &co } bool is_prefix = true; for (idx_t k = 0; k < arguments.size(); k++) { - if (arguments[k] != func.arguments[k]) { + if (arguments[k].id() != func.arguments[k].id()) { is_prefix = false; break; } diff --git a/src/duckdb/src/function/pragma/pragma_queries.cpp b/src/duckdb/src/function/pragma/pragma_queries.cpp index 4e45d0dd..781efebb 100644 --- a/src/duckdb/src/function/pragma/pragma_queries.cpp +++ b/src/duckdb/src/function/pragma/pragma_queries.cpp @@ -194,6 +194,10 @@ string PragmaMetadataInfo(ClientContext &context, const FunctionParameters ¶ return "SELECT * FROM pragma_metadata_info();"; } +string PragmaUserAgent(ClientContext &context, const FunctionParameters ¶meters) { + return "SELECT * FROM pragma_user_agent()"; +} + void PragmaQueries::RegisterFunction(BuiltinFunctions &set) { set.AddFunction(PragmaFunction::PragmaCall("table_info", PragmaTableInfo, {LogicalType::VARCHAR})); set.AddFunction(PragmaFunction::PragmaCall("storage_info", PragmaStorageInfo, {LogicalType::VARCHAR})); @@ -210,6 +214,7 @@ void PragmaQueries::RegisterFunction(BuiltinFunctions &set) { set.AddFunction(PragmaFunction::PragmaStatement("functions", PragmaFunctionsQuery)); set.AddFunction(PragmaFunction::PragmaCall("import_database", PragmaImportDatabase, {LogicalType::VARCHAR})); set.AddFunction(PragmaFunction::PragmaStatement("all_profiling_output", PragmaAllProfiling)); + set.AddFunction(PragmaFunction::PragmaStatement("user_agent", PragmaUserAgent)); } } // namespace duckdb diff --git a/src/duckdb/src/function/scalar/string/concat.cpp b/src/duckdb/src/function/scalar/string/concat.cpp index fc3b4114..f5b04a4b 100644 --- a/src/duckdb/src/function/scalar/string/concat.cpp +++ b/src/duckdb/src/function/scalar/string/concat.cpp @@ -118,7 +118,10 @@ static void TemplatedConcatWS(DataChunk &args, const string_t *sep_data, const S const SelectionVector &rsel, idx_t count, Vector &result) { vector result_lengths(args.size(), 0); vector has_results(args.size(), false); - auto orrified_data = make_unsafe_uniq_array(args.ColumnCount() - 1); + + // we overallocate here, but this is important for static analysis + auto orrified_data = make_unsafe_uniq_array(args.ColumnCount()); + for (idx_t col_idx = 1; col_idx < args.ColumnCount(); col_idx++) { args.data[col_idx].ToUnifiedFormat(args.size(), orrified_data[col_idx - 1]); } diff --git a/src/duckdb/src/function/table/arrow.cpp b/src/duckdb/src/function/table/arrow.cpp index 9601e41d..f6244996 100644 --- a/src/duckdb/src/function/table/arrow.cpp +++ b/src/duckdb/src/function/table/arrow.cpp @@ -14,7 +14,7 @@ namespace duckdb { -unique_ptr ArrowTableFunction::GetArrowLogicalType(ArrowSchema &schema) { +static unique_ptr GetArrowLogicalTypeNoDictionary(ArrowSchema &schema) { auto format = string(schema.format); if (format == "n") { return make_uniq(LogicalType::SQLNULL); @@ -87,13 +87,13 @@ unique_ptr ArrowTableFunction::GetArrowLogicalType(ArrowSchema &schem } else if (format == "tin") { return make_uniq(LogicalType::INTERVAL, ArrowDateTimeType::MONTH_DAY_NANO); } else if (format == "+l") { - auto child_type = GetArrowLogicalType(*schema.children[0]); + auto child_type = ArrowTableFunction::GetArrowLogicalType(*schema.children[0]); auto list_type = make_uniq(LogicalType::LIST(child_type->GetDuckType()), ArrowVariableSizeType::NORMAL); list_type->AddChild(std::move(child_type)); return list_type; } else if (format == "+L") { - auto child_type = GetArrowLogicalType(*schema.children[0]); + auto child_type = ArrowTableFunction::GetArrowLogicalType(*schema.children[0]); auto list_type = make_uniq(LogicalType::LIST(child_type->GetDuckType()), ArrowVariableSizeType::SUPER_SIZE); list_type->AddChild(std::move(child_type)); @@ -101,7 +101,7 @@ unique_ptr ArrowTableFunction::GetArrowLogicalType(ArrowSchema &schem } else if (format[0] == '+' && format[1] == 'w') { std::string parameters = format.substr(format.find(':') + 1); idx_t fixed_size = std::stoi(parameters); - auto child_type = GetArrowLogicalType(*schema.children[0]); + auto child_type = ArrowTableFunction::GetArrowLogicalType(*schema.children[0]); auto list_type = make_uniq(LogicalType::LIST(child_type->GetDuckType()), fixed_size); list_type->AddChild(std::move(child_type)); return list_type; @@ -109,7 +109,7 @@ unique_ptr ArrowTableFunction::GetArrowLogicalType(ArrowSchema &schem child_list_t child_types; vector> children; for (idx_t type_idx = 0; type_idx < (idx_t)schema.n_children; type_idx++) { - children.emplace_back(GetArrowLogicalType(*schema.children[type_idx])); + children.emplace_back(ArrowTableFunction::GetArrowLogicalType(*schema.children[type_idx])); child_types.emplace_back(schema.children[type_idx]->name, children.back()->GetDuckType()); } auto struct_type = make_uniq(LogicalType::STRUCT(std::move(child_types))); @@ -130,7 +130,7 @@ unique_ptr ArrowTableFunction::GetArrowLogicalType(ArrowSchema &schem for (idx_t type_idx = 0; type_idx < (idx_t)schema.n_children; type_idx++) { auto type = schema.children[type_idx]; - children.emplace_back(GetArrowLogicalType(*type)); + children.emplace_back(ArrowTableFunction::GetArrowLogicalType(*type)); members.emplace_back(type->name, children.back()->GetDuckType()); } @@ -140,8 +140,8 @@ unique_ptr ArrowTableFunction::GetArrowLogicalType(ArrowSchema &schem } else if (format == "+m") { auto &arrow_struct_type = *schema.children[0]; D_ASSERT(arrow_struct_type.n_children == 2); - auto key_type = GetArrowLogicalType(*arrow_struct_type.children[0]); - auto value_type = GetArrowLogicalType(*arrow_struct_type.children[1]); + auto key_type = ArrowTableFunction::GetArrowLogicalType(*arrow_struct_type.children[0]); + auto value_type = ArrowTableFunction::GetArrowLogicalType(*arrow_struct_type.children[1]); auto map_type = make_uniq(LogicalType::MAP(key_type->GetDuckType(), value_type->GetDuckType()), ArrowVariableSizeType::NORMAL); child_list_t key_value; @@ -184,6 +184,15 @@ unique_ptr ArrowTableFunction::GetArrowLogicalType(ArrowSchema &schem } } +unique_ptr ArrowTableFunction::GetArrowLogicalType(ArrowSchema &schema) { + auto arrow_type = GetArrowLogicalTypeNoDictionary(schema); + if (schema.dictionary) { + auto dictionary = GetArrowLogicalType(*schema.dictionary); + arrow_type->SetDictionary(std::move(dictionary)); + } + return arrow_type; +} + void ArrowTableFunction::RenameArrowColumns(vector &names) { unordered_map name_map; for (auto &column_name : names) { @@ -216,15 +225,7 @@ void ArrowTableFunction::PopulateArrowTableType(ArrowTableType &arrow_table, Arr throw InvalidInputException("arrow_scan: released schema passed"); } auto arrow_type = GetArrowLogicalType(schema); - if (schema.dictionary) { - auto logical_type = arrow_type->GetDuckType(); - auto dictionary = GetArrowLogicalType(*schema.dictionary); - return_types.emplace_back(dictionary->GetDuckType()); - // The dictionary might have different attributes (size type, datetime precision, etc..) - arrow_type->SetDictionary(std::move(dictionary)); - } else { - return_types.emplace_back(arrow_type->GetDuckType()); - } + return_types.emplace_back(arrow_type->GetDuckType(true)); arrow_table.AddColumn(col_idx, std::move(arrow_type)); auto format = string(schema.format); auto name = string(schema.name); @@ -266,6 +267,7 @@ unique_ptr ProduceArrowScan(const ArrowScanFunctionData auto &schema = *function.schema_root.arrow_schema.children[col_idx]; parameters.projected_columns.projection_map[idx] = schema.name; parameters.projected_columns.columns.emplace_back(schema.name); + parameters.projected_columns.filter_to_col[idx] = col_idx; } } parameters.filters = filters; diff --git a/src/duckdb/src/function/table/arrow/arrow_array_scan_state.cpp b/src/duckdb/src/function/table/arrow/arrow_array_scan_state.cpp new file mode 100644 index 00000000..88aa49bf --- /dev/null +++ b/src/duckdb/src/function/table/arrow/arrow_array_scan_state.cpp @@ -0,0 +1,32 @@ +#include "duckdb/function/table/arrow.hpp" + +namespace duckdb { + +ArrowArrayScanState::ArrowArrayScanState(ArrowScanLocalState &state) : state(state) { +} + +ArrowArrayScanState &ArrowArrayScanState::GetChild(idx_t child_idx) { + auto it = children.find(child_idx); + if (it == children.end()) { + auto child_p = make_uniq(state); + auto &child = *child_p; + children.emplace(std::make_pair(child_idx, std::move(child_p))); + return child; + } + return *it->second; +} + +void ArrowArrayScanState::AddDictionary(unique_ptr dictionary_p) { + this->dictionary = std::move(dictionary_p); +} + +bool ArrowArrayScanState::HasDictionary() const { + return dictionary != nullptr; +} + +Vector &ArrowArrayScanState::GetDictionary() { + D_ASSERT(HasDictionary()); + return *dictionary; +} + +} // namespace duckdb diff --git a/src/duckdb/src/function/table/arrow/arrow_duck_schema.cpp b/src/duckdb/src/function/table/arrow/arrow_duck_schema.cpp index 42d04d2e..933c4da4 100644 --- a/src/duckdb/src/function/table/arrow/arrow_duck_schema.cpp +++ b/src/duckdb/src/function/table/arrow/arrow_duck_schema.cpp @@ -27,13 +27,57 @@ void ArrowType::SetDictionary(unique_ptr dictionary) { dictionary_type = std::move(dictionary); } +bool ArrowType::HasDictionary() const { + return dictionary_type != nullptr; +} + const ArrowType &ArrowType::GetDictionary() const { D_ASSERT(dictionary_type); return *dictionary_type; } -const LogicalType &ArrowType::GetDuckType() const { - return type; +LogicalType ArrowType::GetDuckType(bool use_dictionary) const { + if (use_dictionary && dictionary_type) { + return dictionary_type->GetDuckType(); + } + if (!use_dictionary) { + return type; + } + // Dictionaries can exist in arbitrarily nested schemas + // have to reconstruct the type + auto id = type.id(); + switch (id) { + case LogicalTypeId::STRUCT: { + child_list_t new_children; + for (idx_t i = 0; i < children.size(); i++) { + auto &child = children[i]; + auto &child_name = StructType::GetChildName(type, i); + new_children.emplace_back(std::make_pair(child_name, child->GetDuckType(true))); + } + return LogicalType::STRUCT(std::move(new_children)); + } + case LogicalTypeId::LIST: { + auto &child = children[0]; + return LogicalType::LIST(child->GetDuckType(true)); + } + case LogicalTypeId::MAP: { + auto &struct_child = children[0]; + auto struct_type = struct_child->GetDuckType(true); + return LogicalType::MAP(StructType::GetChildType(struct_type, 0), StructType::GetChildType(struct_type, 1)); + } + case LogicalTypeId::UNION: { + child_list_t new_children; + for (idx_t i = 0; i < children.size(); i++) { + auto &child = children[i]; + auto &child_name = UnionType::GetMemberName(type, i); + new_children.emplace_back(std::make_pair(child_name, child->GetDuckType(true))); + } + return LogicalType::UNION(std::move(new_children)); + } + default: { + return type; + } + } } ArrowVariableSizeType ArrowType::GetSizeType() const { diff --git a/src/duckdb/src/function/table/arrow_conversion.cpp b/src/duckdb/src/function/table/arrow_conversion.cpp index ba7d011a..20407835 100644 --- a/src/duckdb/src/function/table/arrow_conversion.cpp +++ b/src/duckdb/src/function/table/arrow_conversion.cpp @@ -80,14 +80,20 @@ static void SetValidityMask(Vector &vector, ArrowArray &array, ArrowScanLocalSta GetValidityMask(mask, array, scan_state, size, nested_offset, add_null); } -static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowScanLocalState &scan_state, idx_t size, +static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, idx_t size, const ArrowType &arrow_type, int64_t nested_offset = -1, ValidityMask *parent_mask = nullptr, uint64_t parent_offset = 0); -static void ArrowToDuckDBList(Vector &vector, ArrowArray &array, ArrowScanLocalState &scan_state, idx_t size, +static void ColumnArrowToDuckDBDictionary(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, + idx_t size, const ArrowType &arrow_type, int64_t nested_offset = -1, + ValidityMask *parent_mask = nullptr, uint64_t parent_offset = 0); + +static void ArrowToDuckDBList(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, idx_t size, const ArrowType &arrow_type, int64_t nested_offset, ValidityMask *parent_mask) { auto size_type = arrow_type.GetSizeType(); idx_t list_size = 0; + auto &scan_state = array_state.state; + SetValidityMask(vector, array, scan_state, size, nested_offset); idx_t start_offset = 0; idx_t cur_offset = 0; @@ -152,10 +158,19 @@ static void ArrowToDuckDBList(Vector &vector, ArrowArray &array, ArrowScanLocalS } } } + auto &child_state = array_state.GetChild(0); + auto &child_array = *array.children[0]; + auto &child_type = arrow_type[0]; if (list_size == 0 && start_offset == 0) { - ColumnArrowToDuckDB(child_vector, *array.children[0], scan_state, list_size, arrow_type[0], -1); + D_ASSERT(!child_array.dictionary); + ColumnArrowToDuckDB(child_vector, child_array, child_state, list_size, child_type, -1); } else { - ColumnArrowToDuckDB(child_vector, *array.children[0], scan_state, list_size, arrow_type[0], start_offset); + if (child_array.dictionary) { + // TODO: add support for offsets + ColumnArrowToDuckDBDictionary(child_vector, child_array, child_state, list_size, child_type, start_offset); + } else { + ColumnArrowToDuckDB(child_vector, child_array, child_state, list_size, child_type, start_offset); + } } } @@ -343,9 +358,11 @@ static void IntervalConversionMonthDayNanos(Vector &vector, ArrowArray &array, A } } -static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowScanLocalState &scan_state, idx_t size, +static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, idx_t size, const ArrowType &arrow_type, int64_t nested_offset, ValidityMask *parent_mask, uint64_t parent_offset) { + auto &scan_state = array_state.state; + D_ASSERT(!array.dictionary); switch (vector.GetType().id()) { case LogicalTypeId::SQLNULL: vector.Reference(Value()); @@ -601,11 +618,11 @@ static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowScanLoca break; } case LogicalTypeId::LIST: { - ArrowToDuckDBList(vector, array, scan_state, size, arrow_type, nested_offset, parent_mask); + ArrowToDuckDBList(vector, array, array_state, size, arrow_type, nested_offset, parent_mask); break; } case LogicalTypeId::MAP: { - ArrowToDuckDBList(vector, array, scan_state, size, arrow_type, nested_offset, parent_mask); + ArrowToDuckDBList(vector, array, array_state, size, arrow_type, nested_offset, parent_mask); ArrowToDuckDBMapVerify(vector, size); break; } @@ -613,18 +630,29 @@ static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowScanLoca //! Fill the children auto &child_entries = StructVector::GetEntries(vector); auto &struct_validity_mask = FlatVector::Validity(vector); - for (idx_t type_idx = 0; type_idx < static_cast(array.n_children); type_idx++) { - SetValidityMask(*child_entries[type_idx], *array.children[type_idx], scan_state, size, nested_offset); + for (int64_t child_idx = 0; child_idx < array.n_children; child_idx++) { + auto &child_entry = *child_entries[child_idx]; + auto &child_array = *array.children[child_idx]; + auto &child_type = arrow_type[child_idx]; + auto &child_state = array_state.GetChild(child_idx); + + SetValidityMask(child_entry, child_array, scan_state, size, nested_offset); if (!struct_validity_mask.AllValid()) { - auto &child_validity_mark = FlatVector::Validity(*child_entries[type_idx]); + auto &child_validity_mark = FlatVector::Validity(child_entry); for (idx_t i = 0; i < size; i++) { if (!struct_validity_mask.RowIsValid(i)) { child_validity_mark.SetInvalid(i); } } } - ColumnArrowToDuckDB(*child_entries[type_idx], *array.children[type_idx], scan_state, size, - arrow_type[type_idx], nested_offset, &struct_validity_mask, array.offset); + if (child_array.dictionary) { + // TODO: add support for offsets + ColumnArrowToDuckDBDictionary(child_entry, child_array, child_state, size, child_type, nested_offset, + &struct_validity_mask, array.offset); + } else { + ColumnArrowToDuckDB(child_entry, child_array, child_state, size, child_type, nested_offset, + &struct_validity_mask, array.offset); + } } break; } @@ -636,14 +664,19 @@ static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowScanLoca auto &validity_mask = FlatVector::Validity(vector); duckdb::vector children; - for (idx_t type_idx = 0; type_idx < static_cast(array.n_children); type_idx++) { - Vector child(members[type_idx].second); - auto arrow_array = array.children[type_idx]; - auto &child_type = arrow_type[type_idx]; + for (int64_t child_idx = 0; child_idx < array.n_children; child_idx++) { + Vector child(members[child_idx].second, size); + auto &child_array = *array.children[child_idx]; + auto &child_state = array_state.GetChild(child_idx); + auto &child_type = arrow_type[child_idx]; - SetValidityMask(child, *arrow_array, scan_state, size, nested_offset); + SetValidityMask(child, child_array, scan_state, size, nested_offset); - ColumnArrowToDuckDB(child, *arrow_array, scan_state, size, child_type, nested_offset, &validity_mask); + if (child_array.dictionary) { + ColumnArrowToDuckDBDictionary(child, child_array, child_state, size, child_type); + } else { + ColumnArrowToDuckDB(child, child_array, child_state, size, child_type, nested_offset, &validity_mask); + } children.push_back(std::move(child)); } @@ -790,30 +823,31 @@ static void SetSelectionVector(SelectionVector &sel, data_ptr_t indices_p, Logic } } -static void ColumnArrowToDuckDBDictionary(Vector &vector, ArrowArray &array, ArrowScanLocalState &scan_state, - idx_t size, const ArrowType &arrow_type, idx_t col_idx) { +static void ColumnArrowToDuckDBDictionary(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, + idx_t size, const ArrowType &arrow_type, int64_t nested_offset, + ValidityMask *parent_mask, uint64_t parent_offset) { SelectionVector sel; - auto &dict_vectors = scan_state.arrow_dictionary_vectors; - if (!dict_vectors.count(col_idx)) { + auto &scan_state = array_state.state; + if (!array_state.HasDictionary()) { //! We need to set the dictionary data for this column auto base_vector = make_uniq(vector.GetType(), array.dictionary->length); SetValidityMask(*base_vector, *array.dictionary, scan_state, array.dictionary->length, 0, array.null_count > 0); - ColumnArrowToDuckDB(*base_vector, *array.dictionary, scan_state, array.dictionary->length, + ColumnArrowToDuckDB(*base_vector, *array.dictionary, array_state, array.dictionary->length, arrow_type.GetDictionary()); - dict_vectors[col_idx] = std::move(base_vector); + array_state.AddDictionary(std::move(base_vector)); } - auto dictionary_type = arrow_type.GetDuckType(); + auto offset_type = arrow_type.GetDuckType(); //! Get Pointer to Indices of Dictionary auto indices = ArrowBufferData(array, 1) + - GetTypeIdSize(dictionary_type.InternalType()) * (scan_state.chunk_offset + array.offset); + GetTypeIdSize(offset_type.InternalType()) * (scan_state.chunk_offset + array.offset); if (array.null_count > 0) { ValidityMask indices_validity; GetValidityMask(indices_validity, array, scan_state, size); - SetSelectionVector(sel, indices, dictionary_type, size, &indices_validity, array.dictionary->length); + SetSelectionVector(sel, indices, offset_type, size, &indices_validity, array.dictionary->length); } else { - SetSelectionVector(sel, indices, dictionary_type, size); + SetSelectionVector(sel, indices, offset_type, size); } - vector.Slice(*dict_vectors[col_idx], sel, size); + vector.Slice(array_state.GetDictionary(), sel, size); } void ArrowTableFunction::ArrowToDuckDB(ArrowScanLocalState &scan_state, const arrow_column_map_t &arrow_convert_data, @@ -849,11 +883,13 @@ void ArrowTableFunction::ArrowToDuckDB(ArrowScanLocalState &scan_state, const ar D_ASSERT(arrow_convert_data.find(col_idx) != arrow_convert_data.end()); auto &arrow_type = *arrow_convert_data.at(col_idx); + auto &array_state = scan_state.GetState(col_idx); + if (array.dictionary) { - ColumnArrowToDuckDBDictionary(output.data[idx], array, scan_state, output.size(), arrow_type, col_idx); + ColumnArrowToDuckDBDictionary(output.data[idx], array, array_state, output.size(), arrow_type); } else { SetValidityMask(output.data[idx], array, scan_state, output.size(), -1); - ColumnArrowToDuckDB(output.data[idx], array, scan_state, output.size(), arrow_type); + ColumnArrowToDuckDB(output.data[idx], array, array_state, output.size(), arrow_type); } } } diff --git a/src/duckdb/src/function/table/copy_csv.cpp b/src/duckdb/src/function/table/copy_csv.cpp index b1f1b9f1..e721fe46 100644 --- a/src/duckdb/src/function/table/copy_csv.cpp +++ b/src/duckdb/src/function/table/copy_csv.cpp @@ -91,15 +91,15 @@ void BaseCSVData::Finalize() { } } -static unique_ptr WriteCSVBind(ClientContext &context, CopyInfo &info, vector &names, - vector &sql_types) { +static unique_ptr WriteCSVBind(ClientContext &context, const CopyInfo &info, const vector &names, + const vector &sql_types) { auto bind_data = make_uniq(info.file_path, sql_types, names); // check all the options in the copy info for (auto &option : info.options) { auto loption = StringUtil::Lower(option.first); auto &set = option.second; - bind_data->options.SetWriteOption(loption, ConvertVectorToValue(std::move(set))); + bind_data->options.SetWriteOption(loption, ConvertVectorToValue(set)); } // verify the parsed options if (bind_data->options.force_quote.empty()) { diff --git a/src/duckdb/src/function/table/system/pragma_user_agent.cpp b/src/duckdb/src/function/table/system/pragma_user_agent.cpp new file mode 100644 index 00000000..3803f719 --- /dev/null +++ b/src/duckdb/src/function/table/system/pragma_user_agent.cpp @@ -0,0 +1,50 @@ +#include "duckdb/function/table/system_functions.hpp" +#include "duckdb/main/config.hpp" + +namespace duckdb { + +struct PragmaUserAgentData : public GlobalTableFunctionState { + PragmaUserAgentData() : finished(false) { + } + + std::string user_agent; + bool finished; +}; + +static unique_ptr PragmaUserAgentBind(ClientContext &context, TableFunctionBindInput &input, + vector &return_types, vector &names) { + + names.emplace_back("user_agent"); + return_types.emplace_back(LogicalType::VARCHAR); + + return nullptr; +} + +unique_ptr PragmaUserAgentInit(ClientContext &context, TableFunctionInitInput &input) { + auto result = make_uniq(); + auto &config = DBConfig::GetConfig(context); + result->user_agent = config.UserAgent(); + + return std::move(result); +} + +void PragmaUserAgentFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { + auto &data = data_p.global_state->Cast(); + + if (data.finished) { + // signal end of output + return; + } + + output.SetCardinality(1); + output.SetValue(0, 0, data.user_agent); + + data.finished = true; +} + +void PragmaUserAgent::RegisterFunction(BuiltinFunctions &set) { + set.AddFunction( + TableFunction("pragma_user_agent", {}, PragmaUserAgentFunction, PragmaUserAgentBind, PragmaUserAgentInit)); +} + +} // namespace duckdb diff --git a/src/duckdb/src/function/table/system_functions.cpp b/src/duckdb/src/function/table/system_functions.cpp index 7b6a5b04..bf46bee9 100644 --- a/src/duckdb/src/function/table/system_functions.cpp +++ b/src/duckdb/src/function/table/system_functions.cpp @@ -18,6 +18,7 @@ void BuiltinFunctions::RegisterSQLiteFunctions() { PragmaDatabaseSize::RegisterFunction(*this); PragmaLastProfilingOutput::RegisterFunction(*this); PragmaDetailedProfilingOutput::RegisterFunction(*this); + PragmaUserAgent::RegisterFunction(*this); DuckDBColumnsFun::RegisterFunction(*this); DuckDBConstraintsFun::RegisterFunction(*this); diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index a1a987ad..c02fa554 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,8 +1,8 @@ #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v0.9.1" +#define DUCKDB_VERSION "v0.9.2" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "401c8061c6" +#define DUCKDB_SOURCE_ID "3c695d7ba9" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb.h b/src/duckdb/src/include/duckdb.h index 8fa38f7d..86b0b1b4 100644 --- a/src/duckdb/src/include/duckdb.h +++ b/src/duckdb/src/include/duckdb.h @@ -953,7 +953,7 @@ Returns NULL if the index is out of range for the provided prepared statement. * prepared_statement: The prepared statement for which to get the parameter name from. */ -const char *duckdb_parameter_name(duckdb_prepared_statement prepared_statement, idx_t index); +DUCKDB_API const char *duckdb_parameter_name(duckdb_prepared_statement prepared_statement, idx_t index); /*! Returns the parameter type for the parameter at the given index. diff --git a/src/duckdb/src/include/duckdb/catalog/catalog_set.hpp b/src/duckdb/src/include/duckdb/catalog/catalog_set.hpp index dba2d661..5e980e3a 100644 --- a/src/duckdb/src/include/duckdb/catalog/catalog_set.hpp +++ b/src/duckdb/src/include/duckdb/catalog/catalog_set.hpp @@ -122,6 +122,10 @@ class CatalogSet { void UpdateTimestamp(CatalogEntry &entry, transaction_t timestamp); + mutex &GetCatalogLock() { + return catalog_lock; + } + void Verify(Catalog &catalog); private: diff --git a/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp b/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp index ffcf729f..087c622e 100644 --- a/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp +++ b/src/duckdb/src/include/duckdb/common/arrow/appender/enum_data.hpp @@ -8,14 +8,19 @@ namespace duckdb { //===--------------------------------------------------------------------===// // Enums //===--------------------------------------------------------------------===// + +// FIXME: support Large offsets (int64_t), this does not currently respect the 'arrow_large_buffer_size' setting + template struct ArrowEnumData : public ArrowScalarBaseData { static idx_t GetLength(string_t input) { return input.GetSize(); } + static void WriteData(data_ptr_t target, string_t input) { memcpy(target, input.GetData(), input.GetSize()); } + static void EnumAppendVector(ArrowAppendData &append_data, const Vector &input, idx_t size) { D_ASSERT(input.GetVectorType() == VectorType::FLAT_VECTOR); @@ -23,9 +28,9 @@ struct ArrowEnumData : public ArrowScalarBaseData { ResizeValidity(append_data.validity, append_data.row_count + size); // resize the offset buffer - the offset buffer holds the offsets into the child array - append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(uint32_t) * (size + 1)); + append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(int32_t) * (size + 1)); auto data = FlatVector::GetData(input); - auto offset_data = append_data.main_buffer.GetData(); + auto offset_data = append_data.main_buffer.GetData(); if (append_data.row_count == 0) { // first entry offset_data[0] = 0; @@ -50,6 +55,7 @@ struct ArrowEnumData : public ArrowScalarBaseData { } append_data.row_count += size; } + static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) { result.main_buffer.reserve(capacity * sizeof(TGT)); // construct the enum child data diff --git a/src/duckdb/src/include/duckdb/common/arrow/appender/list_data.hpp b/src/duckdb/src/include/duckdb/common/arrow/appender/list_data.hpp index 9507ac72..534ddef2 100644 --- a/src/duckdb/src/include/duckdb/common/arrow/appender/list_data.hpp +++ b/src/duckdb/src/include/duckdb/common/arrow/appender/list_data.hpp @@ -4,15 +4,85 @@ namespace duckdb { +template struct ArrowListData { public: - static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity); - static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size); - static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result); + static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) { + auto &child_type = ListType::GetChildType(type); + result.main_buffer.reserve((capacity + 1) * sizeof(BUFTYPE)); + auto child_buffer = ArrowAppender::InitializeChild(child_type, capacity, result.options); + result.child_data.push_back(std::move(child_buffer)); + } + + static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) { + UnifiedVectorFormat format; + input.ToUnifiedFormat(input_size, format); + idx_t size = to - from; + vector child_indices; + AppendValidity(append_data, format, from, to); + AppendOffsets(append_data, format, from, to, child_indices); + + // append the child vector of the list + SelectionVector child_sel(child_indices.data()); + auto &child = ListVector::GetEntry(input); + auto child_size = child_indices.size(); + Vector child_copy(child.GetType()); + child_copy.Slice(child, child_sel, child_size); + append_data.child_data[0]->append_vector(*append_data.child_data[0], child_copy, 0, child_size, child_size); + append_data.row_count += size; + } + + static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) { + result->n_buffers = 2; + result->buffers[1] = append_data.main_buffer.data(); + + auto &child_type = ListType::GetChildType(type); + ArrowAppender::AddChildren(append_data, 1); + result->children = append_data.child_pointers.data(); + result->n_children = 1; + append_data.child_arrays[0] = *ArrowAppender::FinalizeChild(child_type, std::move(append_data.child_data[0])); + } public: static void AppendOffsets(ArrowAppendData &append_data, UnifiedVectorFormat &format, idx_t from, idx_t to, - vector &child_sel); + vector &child_sel) { + // resize the offset buffer - the offset buffer holds the offsets into the child array + idx_t size = to - from; + append_data.main_buffer.resize(append_data.main_buffer.size() + sizeof(BUFTYPE) * (size + 1)); + auto data = UnifiedVectorFormat::GetData(format); + auto offset_data = append_data.main_buffer.GetData(); + if (append_data.row_count == 0) { + // first entry + offset_data[0] = 0; + } + // set up the offsets using the list entries + auto last_offset = offset_data[append_data.row_count]; + for (idx_t i = from; i < to; i++) { + auto source_idx = format.sel->get_index(i); + auto offset_idx = append_data.row_count + i + 1 - from; + + if (!format.validity.RowIsValid(source_idx)) { + offset_data[offset_idx] = last_offset; + continue; + } + + // append the offset data + auto list_length = data[source_idx].length; + if (std::is_same::value == true && + (uint64_t)last_offset + list_length > NumericLimits::Maximum()) { + throw InvalidInputException( + "Arrow Appender: The maximum combined list offset for regular list buffers is " + "%u but the offset of %lu exceeds this.", + NumericLimits::Maximum(), last_offset); + } + last_offset += list_length; + offset_data[offset_idx] = last_offset; + + for (idx_t k = 0; k < list_length; k++) { + child_sel.push_back(data[source_idx].offset + k); + } + } + } }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/common/arrow/appender/map_data.hpp b/src/duckdb/src/include/duckdb/common/arrow/appender/map_data.hpp index 9bb31c2f..e881c532 100644 --- a/src/duckdb/src/include/duckdb/common/arrow/appender/map_data.hpp +++ b/src/duckdb/src/include/duckdb/common/arrow/appender/map_data.hpp @@ -2,17 +2,96 @@ #include "duckdb/common/arrow/arrow_appender.hpp" #include "duckdb/common/arrow/appender/append_data.hpp" +#include "duckdb/common/arrow/appender/list_data.hpp" namespace duckdb { //===--------------------------------------------------------------------===// // Maps //===--------------------------------------------------------------------===// +template struct ArrowMapData { public: - static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity); - static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size); - static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result); + static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) { + // map types are stored in a (too) clever way + // the main buffer holds the null values and the offsets + // then we have a single child, which is a struct of the map_type, and the key_type + result.main_buffer.reserve((capacity + 1) * sizeof(BUFTYPE)); + + auto &key_type = MapType::KeyType(type); + auto &value_type = MapType::ValueType(type); + auto internal_struct = make_uniq(result.options); + internal_struct->child_data.push_back(ArrowAppender::InitializeChild(key_type, capacity, result.options)); + internal_struct->child_data.push_back(ArrowAppender::InitializeChild(value_type, capacity, result.options)); + + result.child_data.push_back(std::move(internal_struct)); + } + + static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) { + UnifiedVectorFormat format; + input.ToUnifiedFormat(input_size, format); + idx_t size = to - from; + AppendValidity(append_data, format, from, to); + vector child_indices; + ArrowListData::AppendOffsets(append_data, format, from, to, child_indices); + + SelectionVector child_sel(child_indices.data()); + auto &key_vector = MapVector::GetKeys(input); + auto &value_vector = MapVector::GetValues(input); + auto list_size = child_indices.size(); + + auto &struct_data = *append_data.child_data[0]; + auto &key_data = *struct_data.child_data[0]; + auto &value_data = *struct_data.child_data[1]; + + Vector key_vector_copy(key_vector.GetType()); + key_vector_copy.Slice(key_vector, child_sel, list_size); + Vector value_vector_copy(value_vector.GetType()); + value_vector_copy.Slice(value_vector, child_sel, list_size); + key_data.append_vector(key_data, key_vector_copy, 0, list_size, list_size); + value_data.append_vector(value_data, value_vector_copy, 0, list_size, list_size); + + append_data.row_count += size; + struct_data.row_count += size; + } + + static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) { + // set up the main map buffer + D_ASSERT(result); + result->n_buffers = 2; + result->buffers[1] = append_data.main_buffer.data(); + + // the main map buffer has a single child: a struct + ArrowAppender::AddChildren(append_data, 1); + result->children = append_data.child_pointers.data(); + result->n_children = 1; + + auto &struct_data = *append_data.child_data[0]; + auto struct_result = ArrowAppender::FinalizeChild(type, std::move(append_data.child_data[0])); + + // Initialize the struct array data + const auto struct_child_count = 2; + ArrowAppender::AddChildren(struct_data, struct_child_count); + struct_result->children = struct_data.child_pointers.data(); + struct_result->n_buffers = 1; + struct_result->n_children = struct_child_count; + struct_result->length = struct_data.child_data[0]->row_count; + + append_data.child_arrays[0] = *struct_result; + + D_ASSERT(struct_data.child_data[0]->row_count == struct_data.child_data[1]->row_count); + + auto &key_type = MapType::KeyType(type); + auto &value_type = MapType::ValueType(type); + auto key_data = ArrowAppender::FinalizeChild(key_type, std::move(struct_data.child_data[0])); + struct_data.child_arrays[0] = *key_data; + struct_data.child_arrays[1] = *ArrowAppender::FinalizeChild(value_type, std::move(struct_data.child_data[1])); + + // keys cannot have null values + if (key_data->null_count > 0) { + throw std::runtime_error("Arrow doesn't accept NULL keys on Maps"); + } + } }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/common/arrow/appender/varchar_data.hpp b/src/duckdb/src/include/duckdb/common/arrow/appender/varchar_data.hpp index 03984fc7..fd2a2385 100644 --- a/src/duckdb/src/include/duckdb/common/arrow/appender/varchar_data.hpp +++ b/src/duckdb/src/include/duckdb/common/arrow/appender/varchar_data.hpp @@ -32,7 +32,7 @@ struct ArrowUUIDConverter { } }; -template +template struct ArrowVarcharData { static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) { result.main_buffer.reserve((capacity + 1) * sizeof(BUFTYPE)); @@ -40,7 +40,8 @@ struct ArrowVarcharData { result.aux_buffer.reserve(capacity); } - static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) { + template + static void AppendTemplated(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) { idx_t size = to - from; UnifiedVectorFormat format; input.ToUnifiedFormat(input_size, format); @@ -60,13 +61,6 @@ struct ArrowVarcharData { // now append the string data to the auxiliary buffer // the auxiliary buffer's length depends on the string lengths, so we resize as required auto last_offset = offset_data[append_data.row_count]; - idx_t max_offset = append_data.row_count + to - from; - if (max_offset > NumericLimits::Maximum() && - append_data.options.arrow_offset_size == ArrowOffsetSize::REGULAR) { - throw InvalidInputException("Arrow Appender: The maximum total string size for regular string buffers is " - "%u but the offset of %lu exceeds this.", - NumericLimits::Maximum(), max_offset); - } for (idx_t i = from; i < to; i++) { auto source_idx = format.sel->get_index(i); auto offset_idx = append_data.row_count + i + 1 - from; @@ -84,6 +78,13 @@ struct ArrowVarcharData { // append the offset data auto current_offset = last_offset + string_length; + if (!LARGE_STRING && (int64_t)last_offset + string_length > NumericLimits::Maximum()) { + D_ASSERT(append_data.options.arrow_offset_size == ArrowOffsetSize::REGULAR); + throw InvalidInputException( + "Arrow Appender: The maximum total string size for regular string buffers is " + "%u but the offset of %lu exceeds this.", + NumericLimits::Maximum(), current_offset); + } offset_data[offset_idx] = current_offset; // resize the string buffer if required, and write the string data @@ -95,6 +96,15 @@ struct ArrowVarcharData { append_data.row_count += size; } + static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) { + if (append_data.options.arrow_offset_size == ArrowOffsetSize::REGULAR) { + // Check if the offset exceeds the max supported value + AppendTemplated(append_data, input, from, to, input_size); + } else { + AppendTemplated(append_data, input, from, to, input_size); + } + } + static void Finalize(ArrowAppendData &append_data, const LogicalType &type, ArrowArray *result) { result->n_buffers = 3; result->buffers[1] = append_data.main_buffer.data(); diff --git a/src/duckdb/src/include/duckdb/common/enum_util.hpp b/src/duckdb/src/include/duckdb/common/enum_util.hpp index 50cf642d..3677a2fb 100644 --- a/src/duckdb/src/include/duckdb/common/enum_util.hpp +++ b/src/duckdb/src/include/duckdb/common/enum_util.hpp @@ -208,6 +208,8 @@ enum class PreparedParamType : uint8_t; enum class ProfilerPrintFormat : uint8_t; +enum class QuantileSerializationType : uint8_t; + enum class QueryNodeType : uint8_t; enum class QueryResultType : uint8_t; @@ -236,6 +238,8 @@ enum class SinkCombineResultType : uint8_t; enum class SinkFinalizeType : uint8_t; +enum class SinkNextBatchType : uint8_t; + enum class SinkResultType : uint8_t; enum class SourceResultType : uint8_t; @@ -555,6 +559,9 @@ const char* EnumUtil::ToChars(PreparedParamType value); template<> const char* EnumUtil::ToChars(ProfilerPrintFormat value); +template<> +const char* EnumUtil::ToChars(QuantileSerializationType value); + template<> const char* EnumUtil::ToChars(QueryNodeType value); @@ -597,6 +604,9 @@ const char* EnumUtil::ToChars(SinkCombineResultType value template<> const char* EnumUtil::ToChars(SinkFinalizeType value); +template<> +const char* EnumUtil::ToChars(SinkNextBatchType value); + template<> const char* EnumUtil::ToChars(SinkResultType value); @@ -943,6 +953,9 @@ PreparedParamType EnumUtil::FromString(const char *value); template<> ProfilerPrintFormat EnumUtil::FromString(const char *value); +template<> +QuantileSerializationType EnumUtil::FromString(const char *value); + template<> QueryNodeType EnumUtil::FromString(const char *value); @@ -985,6 +998,9 @@ SinkCombineResultType EnumUtil::FromString(const char *va template<> SinkFinalizeType EnumUtil::FromString(const char *value); +template<> +SinkNextBatchType EnumUtil::FromString(const char *value); + template<> SinkResultType EnumUtil::FromString(const char *value); diff --git a/src/duckdb/src/include/duckdb/common/enums/operator_result_type.hpp b/src/duckdb/src/include/duckdb/common/enums/operator_result_type.hpp index f7ada047..5016ae2d 100644 --- a/src/duckdb/src/include/duckdb/common/enums/operator_result_type.hpp +++ b/src/duckdb/src/include/duckdb/common/enums/operator_result_type.hpp @@ -56,4 +56,10 @@ enum class SinkCombineResultType : uint8_t { FINISHED, BLOCKED }; //! BLOCKED means the finalize call to the sink is currently blocked, e.g. by some async I/O. enum class SinkFinalizeType : uint8_t { READY, NO_OUTPUT_POSSIBLE, BLOCKED }; +//! The SinkNextBatchType is used to indicate the result of a NextBatch call on a sink +//! There are two possible results: +//! READY means the sink is ready for further processing +//! BLOCKED means the NextBatch call to the sink is currently blocked, e.g. by some async I/O. +enum class SinkNextBatchType : uint8_t { READY, BLOCKED }; + } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/common/file_system.hpp b/src/duckdb/src/include/duckdb/common/file_system.hpp index 1f06b3ed..a9237d6f 100644 --- a/src/duckdb/src/include/duckdb/common/file_system.hpp +++ b/src/duckdb/src/include/duckdb/common/file_system.hpp @@ -66,6 +66,7 @@ struct FileHandle { DUCKDB_API string ReadLine(); DUCKDB_API bool CanSeek(); + DUCKDB_API bool IsPipe(); DUCKDB_API bool OnDiskFile(); DUCKDB_API idx_t GetFileSize(); DUCKDB_API FileType GetType(); diff --git a/src/duckdb/src/include/duckdb/common/filename_pattern.hpp b/src/duckdb/src/include/duckdb/common/filename_pattern.hpp index 3795fc36..98899fcd 100644 --- a/src/duckdb/src/include/duckdb/common/filename_pattern.hpp +++ b/src/duckdb/src/include/duckdb/common/filename_pattern.hpp @@ -13,7 +13,11 @@ namespace duckdb { +class Serializer; +class Deserializer; + class FilenamePattern { + friend Deserializer; public: FilenamePattern() : _base("data_"), _pos(_base.length()), _uuid(false) { @@ -25,6 +29,9 @@ class FilenamePattern { void SetFilenamePattern(const string &pattern); string CreateFilename(FileSystem &fs, const string &path, const string &extension, idx_t offset) const; + void Serialize(Serializer &serializer) const; + static FilenamePattern Deserialize(Deserializer &deserializer); + private: string _base; idx_t _pos; diff --git a/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp b/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp index 1ca824fa..7dd55e30 100644 --- a/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp +++ b/src/duckdb/src/include/duckdb/common/hive_partitioning.hpp @@ -35,7 +35,7 @@ class HivePartitioning { bool hive_enabled, bool filename_enabled); //! Returns the compiled regex pattern to match hive partitions - DUCKDB_API static const string REGEX_STRING; + DUCKDB_API static const string &RegexString(); }; struct HivePartitionKey { diff --git a/src/duckdb/src/include/duckdb/common/pipe_file_system.hpp b/src/duckdb/src/include/duckdb/common/pipe_file_system.hpp index 8d050219..7806ce02 100644 --- a/src/duckdb/src/include/duckdb/common/pipe_file_system.hpp +++ b/src/duckdb/src/include/duckdb/common/pipe_file_system.hpp @@ -28,6 +28,9 @@ class PipeFileSystem : public FileSystem { bool CanSeek() override { return false; } + bool IsPipe(const string &filename) override { + return true; + } void FileSync(FileHandle &handle) override; std::string GetName() const override { diff --git a/src/duckdb/src/include/duckdb/common/types/vector.hpp b/src/duckdb/src/include/duckdb/common/types/vector.hpp index 885aabee..2ef4de98 100644 --- a/src/duckdb/src/include/duckdb/common/types/vector.hpp +++ b/src/duckdb/src/include/duckdb/common/types/vector.hpp @@ -447,7 +447,14 @@ struct StructVector { DUCKDB_API static vector> &GetEntries(Vector &vector); }; -enum class UnionInvalidReason : uint8_t { VALID, TAG_OUT_OF_RANGE, NO_MEMBERS, VALIDITY_OVERLAP, TAG_MISMATCH }; +enum class UnionInvalidReason : uint8_t { + VALID, + TAG_OUT_OF_RANGE, + NO_MEMBERS, + VALIDITY_OVERLAP, + TAG_MISMATCH, + NULL_TAG +}; struct UnionVector { // Unions are stored as structs, but the first child is always the "tag" @@ -460,7 +467,12 @@ struct UnionVector { // 2. The validity of the tag vector always matches the validity of the // union vector itself. // - // 3. For each tag in the tag vector, 0 <= tag < |members| + // 3. A valid union cannot have a NULL tag, but the selected member can + // be NULL. therefore, there is a difference between a union that "is" + // NULL and a union that "holds" a NULL. The latter still has a valid + // tag. + // + // 4. For each tag in the tag vector, 0 <= tag < |members| //! Get the tag vector of a union vector DUCKDB_API static const Vector &GetTags(const Vector &v); diff --git a/src/duckdb/src/include/duckdb/core_functions/aggregate/distributive_functions.hpp b/src/duckdb/src/include/duckdb/core_functions/aggregate/distributive_functions.hpp index a454620a..6a918578 100644 --- a/src/duckdb/src/include/duckdb/core_functions/aggregate/distributive_functions.hpp +++ b/src/duckdb/src/include/duckdb/core_functions/aggregate/distributive_functions.hpp @@ -210,6 +210,12 @@ struct GroupConcatFun { static constexpr const char *Name = "group_concat"; }; +struct ListaggFun { + using ALIAS = StringAggFun; + + static constexpr const char *Name = "listagg"; +}; + struct SumFun { static constexpr const char *Name = "sum"; static constexpr const char *Parameters = "arg"; diff --git a/src/duckdb/src/include/duckdb/core_functions/aggregate/quantile_enum.hpp b/src/duckdb/src/include/duckdb/core_functions/aggregate/quantile_enum.hpp new file mode 100644 index 00000000..161f1a90 --- /dev/null +++ b/src/duckdb/src/include/duckdb/core_functions/aggregate/quantile_enum.hpp @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb/core_functions/aggregate/quantile_enum.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +namespace duckdb { + +enum class QuantileSerializationType : uint8_t { + NON_DECIMAL = 0, + DECIMAL_DISCRETE, + DECIMAL_DISCRETE_LIST, + DECIMAL_CONTINUOUS, + DECIMAL_CONTINUOUS_LIST +}; + +} diff --git a/src/duckdb/src/include/duckdb/core_functions/scalar/math_functions.hpp b/src/duckdb/src/include/duckdb/core_functions/scalar/math_functions.hpp index 771ffa17..367b360b 100644 --- a/src/duckdb/src/include/duckdb/core_functions/scalar/math_functions.hpp +++ b/src/duckdb/src/include/duckdb/core_functions/scalar/math_functions.hpp @@ -298,9 +298,12 @@ struct Log10Fun { }; struct LogFun { - using ALIAS = Log10Fun; - static constexpr const char *Name = "log"; + static constexpr const char *Parameters = "b, x"; + static constexpr const char *Description = "Computes the logarithm of x to base b. b may be omitted, in which case the default 10"; + static constexpr const char *Example = "log(2, 64)"; + + static ScalarFunctionSet GetFunctions(); }; struct NextAfterFun { diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp index 10f57b11..12c9bc61 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp @@ -1,13 +1,16 @@ #pragma once -#include "duckdb.hpp" -#ifndef DUCKDB_AMALGAMATION #include "duckdb/storage/object_cache.hpp" -#endif +#include "duckdb/common/mutex.hpp" +#include "duckdb/common/typedefs.hpp" +#include "duckdb/common/shared_ptr.hpp" +#include "duckdb/common/string.hpp" namespace duckdb { struct ReadCSVData; +class TableCatalogEntry; +class ClientContext; class CSVRejectsTable : public ObjectCacheEntry { public: diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp index e91e4d92..ff471a5d 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_batch_copy_to_file.hpp @@ -46,7 +46,7 @@ class PhysicalBatchCopyToFile : public PhysicalOperator { OperatorSinkFinalizeInput &input) const override; unique_ptr GetLocalSinkState(ExecutionContext &context) const override; unique_ptr GetGlobalSinkState(ClientContext &context) const override; - void NextBatch(ExecutionContext &context, GlobalSinkState &state, LocalSinkState &lstate_p) const override; + SinkNextBatchType NextBatch(ExecutionContext &context, OperatorSinkNextBatchInput &input) const override; bool RequiresBatchIndex() const override { return true; diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_batch_insert.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_batch_insert.hpp index bc87f59f..55ec0a01 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_batch_insert.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_batch_insert.hpp @@ -52,7 +52,7 @@ class PhysicalBatchInsert : public PhysicalOperator { // Sink interface unique_ptr GetGlobalSinkState(ClientContext &context) const override; unique_ptr GetLocalSinkState(ExecutionContext &context) const override; - void NextBatch(ExecutionContext &context, GlobalSinkState &state, LocalSinkState &lstate_p) const override; + SinkNextBatchType NextBatch(ExecutionContext &context, OperatorSinkNextBatchInput &input) const override; SinkResultType Sink(ExecutionContext &context, DataChunk &chunk, OperatorSinkInput &input) const override; SinkCombineResultType Combine(ExecutionContext &context, OperatorSinkCombineInput &input) const override; SinkFinalizeType Finalize(Pipeline &pipeline, Event &event, ClientContext &context, diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_fixed_batch_copy.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_fixed_batch_copy.hpp index b8ca7329..432234b9 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_fixed_batch_copy.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/persistent/physical_fixed_batch_copy.hpp @@ -45,7 +45,7 @@ class PhysicalFixedBatchCopy : public PhysicalOperator { OperatorSinkFinalizeInput &input) const override; unique_ptr GetLocalSinkState(ExecutionContext &context) const override; unique_ptr GetGlobalSinkState(ClientContext &context) const override; - void NextBatch(ExecutionContext &context, GlobalSinkState &state, LocalSinkState &lstate_p) const override; + SinkNextBatchType NextBatch(ExecutionContext &context, OperatorSinkNextBatchInput &input) const override; bool RequiresBatchIndex() const override { return true; diff --git a/src/duckdb/src/include/duckdb/execution/physical_operator.hpp b/src/duckdb/src/include/duckdb/execution/physical_operator.hpp index e1304673..076e5f5e 100644 --- a/src/duckdb/src/include/duckdb/execution/physical_operator.hpp +++ b/src/duckdb/src/include/duckdb/execution/physical_operator.hpp @@ -145,8 +145,8 @@ class PhysicalOperator { virtual SinkFinalizeType Finalize(Pipeline &pipeline, Event &event, ClientContext &context, OperatorSinkFinalizeInput &input) const; //! For sinks with RequiresBatchIndex set to true, when a new batch starts being processed this method is called - //! This allows flushing of the current batch (e.g. to disk) TODO: should this be able to block too? - virtual void NextBatch(ExecutionContext &context, GlobalSinkState &state, LocalSinkState &lstate_p) const; + //! This allows flushing of the current batch (e.g. to disk) + virtual SinkNextBatchType NextBatch(ExecutionContext &context, OperatorSinkNextBatchInput &input) const; virtual unique_ptr GetLocalSinkState(ExecutionContext &context) const; virtual unique_ptr GetGlobalSinkState(ClientContext &context) const; diff --git a/src/duckdb/src/include/duckdb/execution/physical_operator_states.hpp b/src/duckdb/src/include/duckdb/execution/physical_operator_states.hpp index e3963efb..d799ec0d 100644 --- a/src/duckdb/src/include/duckdb/execution/physical_operator_states.hpp +++ b/src/duckdb/src/include/duckdb/execution/physical_operator_states.hpp @@ -176,6 +176,12 @@ struct OperatorSinkFinalizeInput { InterruptState &interrupt_state; }; +struct OperatorSinkNextBatchInput { + GlobalSinkState &global_state; + LocalSinkState &local_state; + InterruptState &interrupt_state; +}; + // LCOV_EXCL_STOP } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/function/copy_function.hpp b/src/duckdb/src/include/duckdb/function/copy_function.hpp index 1dd25eb0..8f0e5f75 100644 --- a/src/duckdb/src/include/duckdb/function/copy_function.hpp +++ b/src/duckdb/src/include/duckdb/function/copy_function.hpp @@ -71,8 +71,8 @@ struct PreparedBatchData { enum class CopyFunctionExecutionMode { REGULAR_COPY_TO_FILE, PARALLEL_COPY_TO_FILE, BATCH_COPY_TO_FILE }; typedef BoundStatement (*copy_to_plan_t)(Binder &binder, CopyStatement &stmt); -typedef unique_ptr (*copy_to_bind_t)(ClientContext &context, CopyInfo &info, vector &names, - vector &sql_types); +typedef unique_ptr (*copy_to_bind_t)(ClientContext &context, const CopyInfo &info, + const vector &names, const vector &sql_types); typedef unique_ptr (*copy_to_initialize_local_t)(ExecutionContext &context, FunctionData &bind_data); typedef unique_ptr (*copy_to_initialize_global_t)(ClientContext &context, FunctionData &bind_data, const string &file_path); diff --git a/src/duckdb/src/include/duckdb/function/table/arrow.hpp b/src/duckdb/src/include/duckdb/function/table/arrow.hpp index fae51810..86caf588 100644 --- a/src/duckdb/src/include/duckdb/function/table/arrow.hpp +++ b/src/duckdb/src/include/duckdb/function/table/arrow.hpp @@ -33,6 +33,8 @@ struct ArrowInterval { struct ArrowProjectedColumns { unordered_map projection_map; vector columns; + // Map from filter index to column index + unordered_map filter_to_col; }; struct ArrowStreamParameters { @@ -61,10 +63,30 @@ struct ArrowScanFunctionData : public PyTableFunctionData { ArrowTableType arrow_table; }; +struct ArrowScanLocalState; +struct ArrowArrayScanState { +public: + ArrowArrayScanState(ArrowScanLocalState &state); + +public: + ArrowScanLocalState &state; + unordered_map> children; + // Cache the (optional) dictionary of this array + unique_ptr dictionary; + +public: + ArrowArrayScanState &GetChild(idx_t child_idx); + void AddDictionary(unique_ptr dictionary_p); + bool HasDictionary() const; + Vector &GetDictionary(); +}; + struct ArrowScanLocalState : public LocalTableFunctionState { +public: explicit ArrowScanLocalState(unique_ptr current_chunk) : chunk(current_chunk.release()) { } +public: unique_ptr stream; shared_ptr chunk; // This vector hold the Arrow Vectors owned by DuckDB to allow for zero-copy @@ -73,11 +95,22 @@ struct ArrowScanLocalState : public LocalTableFunctionState { idx_t chunk_offset = 0; idx_t batch_index = 0; vector column_ids; - //! Store child vectors for Arrow Dictionary Vectors (col-idx,vector) - unordered_map> arrow_dictionary_vectors; + unordered_map> array_states; TableFilterSet *filters = nullptr; //! The DataChunk containing all read columns (even filter columns that are immediately removed) DataChunk all_columns; + +public: + ArrowArrayScanState &GetState(idx_t child_idx) { + auto it = array_states.find(child_idx); + if (it == array_states.end()) { + auto child_p = make_uniq(*this); + auto &child = *child_p; + array_states.emplace(std::make_pair(child_idx, std::move(child_p))); + return child; + } + return *it->second; + } }; struct ArrowScanGlobalState : public GlobalTableFunctionState { @@ -148,6 +181,8 @@ struct ArrowTableFunction { const GlobalTableFunctionState *global_state); //! Renames repeated columns and case sensitive columns static void RenameArrowColumns(vector &names); + +public: //! Helper function to get the DuckDB logical type static unique_ptr GetArrowLogicalType(ArrowSchema &schema); }; diff --git a/src/duckdb/src/include/duckdb/function/table/arrow/arrow_duck_schema.hpp b/src/duckdb/src/include/duckdb/function/table/arrow/arrow_duck_schema.hpp index bd15f89d..d475875f 100644 --- a/src/duckdb/src/include/duckdb/function/table/arrow/arrow_duck_schema.hpp +++ b/src/duckdb/src/include/duckdb/function/table/arrow/arrow_duck_schema.hpp @@ -57,7 +57,7 @@ class ArrowType { void AssignChildren(vector> children); - const LogicalType &GetDuckType() const; + LogicalType GetDuckType(bool use_dictionary = false) const; ArrowVariableSizeType GetSizeType() const; @@ -65,6 +65,8 @@ class ArrowType { void SetDictionary(unique_ptr dictionary); + bool HasDictionary() const; + ArrowDateTimeType GetDateTimeType() const; const ArrowType &GetDictionary() const; diff --git a/src/duckdb/src/include/duckdb/function/table/system_functions.hpp b/src/duckdb/src/include/duckdb/function/table/system_functions.hpp index f6ff5308..8cada10f 100644 --- a/src/duckdb/src/include/duckdb/function/table/system_functions.hpp +++ b/src/duckdb/src/include/duckdb/function/table/system_functions.hpp @@ -133,4 +133,8 @@ struct TestVectorTypesFun { static void RegisterFunction(BuiltinFunctions &set); }; +struct PragmaUserAgent { + static void RegisterFunction(BuiltinFunctions &set); +}; + } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/function/udf_function.hpp b/src/duckdb/src/include/duckdb/function/udf_function.hpp index 65569293..a679d4ce 100644 --- a/src/duckdb/src/include/duckdb/function/udf_function.hpp +++ b/src/duckdb/src/include/duckdb/function/udf_function.hpp @@ -370,7 +370,7 @@ struct UDFWrapper { inline static AggregateFunction CreateBinaryAggregateFunction(const string &name, LogicalType ret_type, LogicalType input_typeA, LogicalType input_typeB) { AggregateFunction aggr_function = - AggregateFunction::BinaryAggregate(input_typeA, input_typeB, ret_type); + AggregateFunction::BinaryAggregate(input_typeA, input_typeB, ret_type); aggr_function.name = name; return aggr_function; } diff --git a/src/duckdb/src/include/duckdb/main/config.hpp b/src/duckdb/src/include/duckdb/main/config.hpp index 654bc80c..3cd2e885 100644 --- a/src/duckdb/src/include/duckdb/main/config.hpp +++ b/src/duckdb/src/include/duckdb/main/config.hpp @@ -173,6 +173,10 @@ struct DBConfigOptions { static bool debug_print_bindings; //! The peak allocation threshold at which to flush the allocator after completing a task (1 << 27, ~128MB) idx_t allocator_flush_threshold = 134217728; + //! DuckDB API surface + string duckdb_api; + //! Metadata from DuckDB callers + string custom_user_agent; bool operator==(const DBConfigOptions &other) const; }; @@ -259,6 +263,7 @@ struct DBConfig { OrderType ResolveOrder(OrderType order_type) const; OrderByNullType ResolveNullOrder(OrderType order_type, OrderByNullType null_type) const; + const std::string UserAgent() const; private: unique_ptr compression_functions; diff --git a/src/duckdb/src/include/duckdb/main/extension/generated_extension_loader.hpp b/src/duckdb/src/include/duckdb/main/extension/generated_extension_loader.hpp index 2b4a6662..2dd9190e 100644 --- a/src/duckdb/src/include/duckdb/main/extension/generated_extension_loader.hpp +++ b/src/duckdb/src/include/duckdb/main/extension/generated_extension_loader.hpp @@ -8,20 +8,21 @@ #pragma once -#include "duckdb/main/database.hpp" #include "duckdb/common/string.hpp" #include "duckdb/common/vector.hpp" +#include "duckdb/main/database.hpp" #if defined(GENERATED_EXTENSION_HEADERS) and !defined(DUCKDB_AMALGAMATION) -#include "generated_extension_headers.hpp" #include "duckdb/common/common.hpp" +#include "generated_extension_headers.hpp" namespace duckdb { //! Looks through the CMake-generated list of extensions that are linked into DuckDB currently to try load bool TryLoadLinkedExtension(DuckDB &db, const string &extension); -extern vector linked_extensions; -extern vector loaded_extension_test_paths; + +const vector &LinkedExtensions(); +const vector &LoadedExtensionTestPaths(); } // namespace duckdb #endif diff --git a/src/duckdb/src/include/duckdb/main/extension_entries.hpp b/src/duckdb/src/include/duckdb/main/extension_entries.hpp index 97e8adc8..041c72e2 100644 --- a/src/duckdb/src/include/duckdb/main/extension_entries.hpp +++ b/src/duckdb/src/include/duckdb/main/extension_entries.hpp @@ -72,8 +72,10 @@ static constexpr ExtensionEntry EXTENSION_FUNCTIONS[] = { {"parquet_metadata", "parquet"}, {"parquet_scan", "parquet"}, {"parquet_schema", "parquet"}, + {"pg_clear_cache", "postgres_scanner"}, {"pg_timezone_names", "icu"}, {"postgres_attach", "postgres_scanner"}, + {"postgres_query", "postgres_scanner"}, {"postgres_scan", "postgres_scanner"}, {"postgres_scan_pushdown", "postgres_scanner"}, {"read_json", "json"}, @@ -114,6 +116,7 @@ static constexpr ExtensionEntry EXTENSION_FUNCTIONS[] = { {"st_distance", "spatial"}, {"st_distance_spheroid", "spatial"}, {"st_drivers", "spatial"}, + {"st_dump", "spatial"}, {"st_dwithin", "spatial"}, {"st_dwithin_spheroid", "spatial"}, {"st_endpoint", "spatial"}, @@ -140,9 +143,12 @@ static constexpr ExtensionEntry EXTENSION_FUNCTIONS[] = { {"st_isvalid", "spatial"}, {"st_length", "spatial"}, {"st_length_spheroid", "spatial"}, + {"st_linemerge", "spatial"}, {"st_linestring2dfromwkb", "spatial"}, {"st_list_proj_crs", "spatial"}, + {"st_makeenvelope", "spatial"}, {"st_makeline", "spatial"}, + {"st_makepolygon", "spatial"}, {"st_ngeometries", "spatial"}, {"st_ninteriorrings", "spatial"}, {"st_normalize", "spatial"}, @@ -204,6 +210,12 @@ static constexpr ExtensionEntry EXTENSION_SETTINGS[] = { {"http_retry_backoff", "httpfs"}, {"http_retry_wait_ms", "httpfs"}, {"http_timeout", "httpfs"}, + {"pg_debug_show_queries", "postgres_scanner"}, + {"pg_use_binary_copy", "postgres_scanner"}, + {"pg_experimental_filter_pushdown", "postgres_scanner"}, + {"pg_connection_limit", "postgres_scanner"}, + {"pg_pages_per_task", "postgres_scanner"}, + {"pg_array_as_varchar", "postgres_scanner"}, {"s3_access_key_id", "httpfs"}, {"s3_endpoint", "httpfs"}, {"s3_region", "httpfs"}, diff --git a/src/duckdb/src/include/duckdb/main/settings.hpp b/src/duckdb/src/include/duckdb/main/settings.hpp index 65d2b6c4..f0e92267 100644 --- a/src/duckdb/src/include/duckdb/main/settings.hpp +++ b/src/duckdb/src/include/duckdb/main/settings.hpp @@ -552,4 +552,22 @@ struct FlushAllocatorSetting { static Value GetSetting(ClientContext &context); }; +struct DuckDBApiSetting { + static constexpr const char *Name = "duckdb_api"; + static constexpr const char *Description = "DuckDB API surface"; + static constexpr const LogicalTypeId InputType = LogicalTypeId::VARCHAR; + static void SetGlobal(DatabaseInstance *db, DBConfig &config, const Value ¶meter); + static void ResetGlobal(DatabaseInstance *db, DBConfig &config); + static Value GetSetting(ClientContext &context); +}; + +struct CustomUserAgentSetting { + static constexpr const char *Name = "custom_user_agent"; + static constexpr const char *Description = "Metadata from DuckDB callers"; + static constexpr const LogicalTypeId InputType = LogicalTypeId::VARCHAR; + static void SetGlobal(DatabaseInstance *db, DBConfig &config, const Value ¶meter); + static void ResetGlobal(DatabaseInstance *db, DBConfig &config); + static Value GetSetting(ClientContext &context); +}; + } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/optimizer/filter_combiner.hpp b/src/duckdb/src/include/duckdb/optimizer/filter_combiner.hpp index 3764915d..f7591657 100644 --- a/src/duckdb/src/include/duckdb/optimizer/filter_combiner.hpp +++ b/src/duckdb/src/include/duckdb/optimizer/filter_combiner.hpp @@ -54,7 +54,7 @@ class FilterCombiner { private: FilterResult AddFilter(Expression &expr); FilterResult AddBoundComparisonFilter(Expression &expr); - FilterResult AddTransitiveFilters(BoundComparisonExpression &comparison); + FilterResult AddTransitiveFilters(BoundComparisonExpression &comparison, bool is_root = true); unique_ptr FindTransitiveFilter(Expression &expr); // unordered_map> // FindZonemapChecks(vector &column_ids, unordered_set ¬_constants, Expression *filter); diff --git a/src/duckdb/src/include/duckdb/optimizer/rule.hpp b/src/duckdb/src/include/duckdb/optimizer/rule.hpp index a04549fa..30061a29 100644 --- a/src/duckdb/src/include/duckdb/optimizer/rule.hpp +++ b/src/duckdb/src/include/duckdb/optimizer/rule.hpp @@ -23,8 +23,6 @@ class Rule { //! The expression rewriter this rule belongs to ExpressionRewriter &rewriter; - //! The root - unique_ptr logical_root; //! The expression matcher of the rule unique_ptr root; diff --git a/src/duckdb/src/include/duckdb/parallel/pipeline_executor.hpp b/src/duckdb/src/include/duckdb/parallel/pipeline_executor.hpp index 6169425f..63194449 100644 --- a/src/duckdb/src/include/duckdb/parallel/pipeline_executor.hpp +++ b/src/duckdb/src/include/duckdb/parallel/pipeline_executor.hpp @@ -106,6 +106,10 @@ class PipelineExecutor { //! This flag is set when the pipeline gets interrupted by the Sink -> the final_chunk should be re-sink-ed. bool remaining_sink_chunk = false; + //! This flag is set when the pipeline gets interrupted by NextBatch -> NextBatch should be called again and the + //! source_chunk should be sent through the pipeline + bool next_batch_blocked = false; + //! Current operator being flushed idx_t flushing_idx; //! Whether the current flushing_idx should be flushed: this needs to be stored to make flushing code re-entrant @@ -131,6 +135,9 @@ class PipelineExecutor { //! Returns whether or not a new input chunk is needed, or whether or not we are finished OperatorResultType Execute(DataChunk &input, DataChunk &result, idx_t initial_index = 0); + //! Notifies the sink that a new batch has started + SinkNextBatchType NextBatch(DataChunk &source_chunk); + //! Tries to flush all state from intermediate operators. Will return true if all state is flushed, false in the //! case of a blocked sink. bool TryFlushCachingOperators(); @@ -143,6 +150,7 @@ class PipelineExecutor { int debug_blocked_sink_count = 0; int debug_blocked_source_count = 0; int debug_blocked_combine_count = 0; + int debug_blocked_next_batch_count = 0; //! Number of times the Sink/Source will block before actually returning data int debug_blocked_target_count = 1; #endif diff --git a/src/duckdb/src/include/duckdb/parser/parsed_data/create_info.hpp b/src/duckdb/src/include/duckdb/parser/parsed_data/create_info.hpp index 3fd94128..70e04728 100644 --- a/src/duckdb/src/include/duckdb/parser/parsed_data/create_info.hpp +++ b/src/duckdb/src/include/duckdb/parser/parsed_data/create_info.hpp @@ -10,6 +10,7 @@ #include "duckdb/common/enums/catalog_type.hpp" #include "duckdb/parser/parsed_data/parse_info.hpp" +#include "duckdb/common/enum_util.hpp" namespace duckdb { struct AlterInfo; @@ -61,6 +62,10 @@ struct CreateInfo : public ParseInfo { DUCKDB_API void CopyProperties(CreateInfo &other) const; //! Generates an alter statement from the create statement - used for OnCreateConflict::ALTER_ON_CONFLICT DUCKDB_API virtual unique_ptr GetAlterInfo() const; + virtual string ToString() const { + throw InternalException("ToString not supported for this type of CreateInfo: '%s'", + EnumUtil::ToString(info_type)); + } }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/parser/parsed_data/create_view_info.hpp b/src/duckdb/src/include/duckdb/parser/parsed_data/create_view_info.hpp index 4f9ff34b..c70a273c 100644 --- a/src/duckdb/src/include/duckdb/parser/parsed_data/create_view_info.hpp +++ b/src/duckdb/src/include/duckdb/parser/parsed_data/create_view_info.hpp @@ -15,11 +15,13 @@ namespace duckdb { class SchemaCatalogEntry; struct CreateViewInfo : public CreateInfo { +public: CreateViewInfo(); CreateViewInfo(SchemaCatalogEntry &schema, string view_name); CreateViewInfo(string catalog_p, string schema_p, string view_name); - //! Table name to insert to +public: + //! View name string view_name; //! Aliases of the view vector aliases; @@ -38,6 +40,7 @@ struct CreateViewInfo : public CreateInfo { DUCKDB_API void Serialize(Serializer &serializer) const override; DUCKDB_API static unique_ptr Deserialize(Deserializer &deserializer); + string ToString() const override; }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/parser/statement/create_statement.hpp b/src/duckdb/src/include/duckdb/parser/statement/create_statement.hpp index 362af264..74177d87 100644 --- a/src/duckdb/src/include/duckdb/parser/statement/create_statement.hpp +++ b/src/duckdb/src/include/duckdb/parser/statement/create_statement.hpp @@ -27,6 +27,7 @@ class CreateStatement : public SQLStatement { public: unique_ptr Copy() const override; + string ToString() const override; }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/parser/transformer.hpp b/src/duckdb/src/include/duckdb/parser/transformer.hpp index d16dfb8c..8371796c 100644 --- a/src/duckdb/src/include/duckdb/parser/transformer.hpp +++ b/src/duckdb/src/include/duckdb/parser/transformer.hpp @@ -48,6 +48,7 @@ class Transformer { unique_ptr base; unique_ptr column; unique_ptr subquery; + bool has_parameters; }; public: @@ -90,7 +91,7 @@ class Transformer { bool GetParam(const string &name, idx_t &index, PreparedParamType type); void AddPivotEntry(string enum_name, unique_ptr source, unique_ptr column, - unique_ptr subquery); + unique_ptr subquery, bool has_parameters); unique_ptr GenerateCreateEnumStmt(unique_ptr entry); bool HasPivotEntries(); idx_t PivotEntryCount(); diff --git a/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp b/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp index 8bac2757..0e1a7789 100644 --- a/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp +++ b/src/duckdb/src/include/duckdb/planner/operator/logical_copy_to_file.hpp @@ -20,12 +20,14 @@ class LogicalCopyToFile : public LogicalOperator { static constexpr const LogicalOperatorType TYPE = LogicalOperatorType::LOGICAL_COPY_TO_FILE; public: - LogicalCopyToFile(CopyFunction function, unique_ptr bind_data) - : LogicalOperator(LogicalOperatorType::LOGICAL_COPY_TO_FILE), function(function), - bind_data(std::move(bind_data)) { + LogicalCopyToFile(CopyFunction function, unique_ptr bind_data, unique_ptr copy_info) + : LogicalOperator(LogicalOperatorType::LOGICAL_COPY_TO_FILE), function(std::move(function)), + bind_data(std::move(bind_data)), copy_info(std::move(copy_info)) { } CopyFunction function; unique_ptr bind_data; + unique_ptr copy_info; + std::string file_path; bool use_tmp_file; FilenamePattern filename_pattern; @@ -39,10 +41,6 @@ class LogicalCopyToFile : public LogicalOperator { public: idx_t EstimateCardinality(ClientContext &context) override; - //! Skips the serialization check in VerifyPlan - bool SupportSerialization() const override { - return false; - } void Serialize(Serializer &serializer) const override; static unique_ptr Deserialize(Deserializer &deserializer); diff --git a/src/duckdb/src/include/duckdb/storage/table/column_segment.hpp b/src/duckdb/src/include/duckdb/storage/table/column_segment.hpp index 89a9e2b5..0ca09840 100644 --- a/src/duckdb/src/include/duckdb/storage/table/column_segment.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/column_segment.hpp @@ -21,7 +21,6 @@ namespace duckdb { class ColumnSegment; class BlockManager; -class ColumnSegment; class ColumnData; class DatabaseInstance; class Transaction; diff --git a/src/duckdb/src/main/capi/config-c.cpp b/src/duckdb/src/main/capi/config-c.cpp index 67b4b46b..d607f0bb 100644 --- a/src/duckdb/src/main/capi/config-c.cpp +++ b/src/duckdb/src/main/capi/config-c.cpp @@ -13,6 +13,7 @@ duckdb_state duckdb_create_config(duckdb_config *out_config) { DBConfig *config; try { config = new DBConfig(); + config->SetOptionByName("duckdb_api", "capi"); } catch (...) { // LCOV_EXCL_START return DuckDBError; } // LCOV_EXCL_STOP diff --git a/src/duckdb/src/main/capi/duckdb-c.cpp b/src/duckdb/src/main/capi/duckdb-c.cpp index 5377df46..df522d17 100644 --- a/src/duckdb/src/main/capi/duckdb-c.cpp +++ b/src/duckdb/src/main/capi/duckdb-c.cpp @@ -8,7 +8,15 @@ using duckdb::DuckDB; duckdb_state duckdb_open_ext(const char *path, duckdb_database *out, duckdb_config config, char **error) { auto wrapper = new DatabaseData(); try { - auto db_config = (DBConfig *)config; + DBConfig default_config; + default_config.SetOptionByName("duckdb_api", "capi"); + + DBConfig *db_config = &default_config; + DBConfig *user_config = (DBConfig *)config; + if (user_config) { + db_config = user_config; + } + wrapper->database = duckdb::make_uniq(path, db_config); } catch (std::exception &ex) { if (error) { diff --git a/src/duckdb/src/main/config.cpp b/src/duckdb/src/main/config.cpp index 89d5e1ba..ee8f3d2b 100644 --- a/src/duckdb/src/main/config.cpp +++ b/src/duckdb/src/main/config.cpp @@ -115,6 +115,8 @@ static ConfigurationOption internal_options[] = {DUCKDB_GLOBAL(AccessModeSetting DUCKDB_GLOBAL_ALIAS("wal_autocheckpoint", CheckpointThresholdSetting), DUCKDB_GLOBAL_ALIAS("worker_threads", ThreadsSetting), DUCKDB_GLOBAL(FlushAllocatorSetting), + DUCKDB_GLOBAL(DuckDBApiSetting), + DUCKDB_GLOBAL(CustomUserAgentSetting), FINAL_SETTING}; vector DBConfig::GetOptions() { @@ -169,6 +171,13 @@ void DBConfig::SetOptionByName(const string &name, const Value &value) { auto option = DBConfig::GetOptionByName(name); if (option) { SetOption(*option, value); + return; + } + + auto param = extension_parameters.find(name); + if (param != extension_parameters.end()) { + Value target_value = value.DefaultCastAs(param->second.type); + SetOption(name, std::move(target_value)); } else { options.unrecognized_options[name] = value; } @@ -411,4 +420,13 @@ OrderByNullType DBConfig::ResolveNullOrder(OrderType order_type, OrderByNullType } } +const std::string DBConfig::UserAgent() const { + auto user_agent = options.duckdb_api; + + if (!options.custom_user_agent.empty()) { + user_agent += " " + options.custom_user_agent; + } + return user_agent; +} + } // namespace duckdb diff --git a/src/duckdb/src/main/database.cpp b/src/duckdb/src/main/database.cpp index 31103869..a1741306 100644 --- a/src/duckdb/src/main/database.cpp +++ b/src/duckdb/src/main/database.cpp @@ -31,6 +31,7 @@ DBConfig::DBConfig() { compression_functions = make_uniq(); cast_functions = make_uniq(); error_manager = make_uniq(); + options.duckdb_api = StringUtil::Format("duckdb/%s(%s)", DuckDB::LibraryVersion(), DuckDB::Platform()); } DBConfig::DBConfig(std::unordered_map &config_dict, bool read_only) : DBConfig::DBConfig() { diff --git a/src/duckdb/src/main/extension/extension_alias.cpp b/src/duckdb/src/main/extension/extension_alias.cpp index cce16429..9d9992cd 100644 --- a/src/duckdb/src/main/extension/extension_alias.cpp +++ b/src/duckdb/src/main/extension/extension_alias.cpp @@ -4,7 +4,8 @@ namespace duckdb { static ExtensionAlias internal_aliases[] = {{"http", "httpfs"}, // httpfs {"https", "httpfs"}, - {"md", "motherduck"}, // motherduck + {"md", "motherduck"}, // motherduck + {"mysql", "mysql_scanner"}, // mysql {"s3", "httpfs"}, {"postgres", "postgres_scanner"}, // postgres {"sqlite", "sqlite_scanner"}, // sqlite diff --git a/src/duckdb/src/main/extension/extension_helper.cpp b/src/duckdb/src/main/extension/extension_helper.cpp index 9c39eed2..940d1fad 100644 --- a/src/duckdb/src/main/extension/extension_helper.cpp +++ b/src/duckdb/src/main/extension/extension_helper.cpp @@ -112,8 +112,9 @@ static DefaultExtension internal_extensions[] = { {"jemalloc", "Overwrites system allocator with JEMalloc", DUCKDB_EXTENSION_JEMALLOC_LINKED}, {"autocomplete", "Adds support for autocomplete in the shell", DUCKDB_EXTENSION_AUTOCOMPLETE_LINKED}, {"motherduck", "Enables motherduck integration with the system", false}, - {"sqlite_scanner", "Adds support for reading SQLite database files", false}, - {"postgres_scanner", "Adds support for reading from a Postgres database", false}, + {"mysql_scanner", "Adds support for connecting to a MySQL database", false}, + {"sqlite_scanner", "Adds support for reading and writing SQLite database files", false}, + {"postgres_scanner", "Adds support for connecting to a Postgres database", false}, {"inet", "Adds support for IP-related data types and functions", false}, {"spatial", "Geospatial extension that adds support for working with spatial data and functions", false}, {"substrait", "Adds support for the Substrait integration", false}, @@ -139,7 +140,7 @@ DefaultExtension ExtensionHelper::GetDefaultExtension(idx_t index) { //===--------------------------------------------------------------------===// // Allow Auto-Install Extensions //===--------------------------------------------------------------------===// -static const char *auto_install[] = {"motherduck", "postgres_scanner", "sqlite_scanner", nullptr}; +static const char *auto_install[] = {"motherduck", "postgres_scanner", "mysql_scanner", "sqlite_scanner", nullptr}; // TODO: unify with new autoload mechanism bool ExtensionHelper::AllowAutoInstall(const string &extension) { @@ -246,7 +247,7 @@ void ExtensionHelper::LoadAllExtensions(DuckDB &db) { } #if defined(GENERATED_EXTENSION_HEADERS) && GENERATED_EXTENSION_HEADERS - for (auto &ext : linked_extensions) { + for (const auto &ext : LinkedExtensions()) { LoadExtensionInternal(db, ext, true); } #endif diff --git a/src/duckdb/src/main/settings/settings.cpp b/src/duckdb/src/main/settings/settings.cpp index 1fe56fed..e75c8f50 100644 --- a/src/duckdb/src/main/settings/settings.cpp +++ b/src/duckdb/src/main/settings/settings.cpp @@ -1203,4 +1203,53 @@ Value FlushAllocatorSetting::GetSetting(ClientContext &context) { return Value(StringUtil::BytesToHumanReadableString(config.options.allocator_flush_threshold)); } +//===--------------------------------------------------------------------===// +// DuckDBApi Setting +//===--------------------------------------------------------------------===// + +void DuckDBApiSetting::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) { + auto new_value = input.GetValue(); + if (db) { + throw InvalidInputException("Cannot change duckdb_api setting while database is running"); + } + config.options.duckdb_api += " " + new_value; +} + +void DuckDBApiSetting::ResetGlobal(DatabaseInstance *db, DBConfig &config) { + if (db) { + throw InvalidInputException("Cannot change duckdb_api setting while database is running"); + } + config.options.duckdb_api = DBConfig().options.duckdb_api; +} + +Value DuckDBApiSetting::GetSetting(ClientContext &context) { + auto &config = DBConfig::GetConfig(context); + return Value(config.options.duckdb_api); +} + +//===--------------------------------------------------------------------===// +// CustomUserAgent Setting +//===--------------------------------------------------------------------===// + +void CustomUserAgentSetting::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) { + auto new_value = input.GetValue(); + if (db) { + throw InvalidInputException("Cannot change custom_user_agent setting while database is running"); + } + config.options.custom_user_agent = + config.options.custom_user_agent.empty() ? new_value : config.options.custom_user_agent + " " + new_value; +} + +void CustomUserAgentSetting::ResetGlobal(DatabaseInstance *db, DBConfig &config) { + if (db) { + throw InvalidInputException("Cannot change custom_user_agent setting while database is running"); + } + config.options.custom_user_agent = DBConfig().options.custom_user_agent; +} + +Value CustomUserAgentSetting::GetSetting(ClientContext &context) { + auto &config = DBConfig::GetConfig(context); + return Value(config.options.custom_user_agent); +} + } // namespace duckdb diff --git a/src/duckdb/src/optimizer/expression_rewriter.cpp b/src/duckdb/src/optimizer/expression_rewriter.cpp index 2e3a19f2..fea861e9 100644 --- a/src/duckdb/src/optimizer/expression_rewriter.cpp +++ b/src/duckdb/src/optimizer/expression_rewriter.cpp @@ -59,16 +59,8 @@ void ExpressionRewriter::VisitOperator(LogicalOperator &op) { to_apply_rules.clear(); for (auto &rule : rules) { - if (rule->logical_root && !rule->logical_root->Match(op.type)) { - // this rule does not apply to this type of LogicalOperator - continue; - } to_apply_rules.push_back(*rule); } - if (to_apply_rules.empty()) { - // no rules to apply on this node - return; - } VisitOperatorExpressions(op); diff --git a/src/duckdb/src/optimizer/filter_combiner.cpp b/src/duckdb/src/optimizer/filter_combiner.cpp index 34162724..9461c200 100644 --- a/src/duckdb/src/optimizer/filter_combiner.cpp +++ b/src/duckdb/src/optimizer/filter_combiner.cpp @@ -782,29 +782,41 @@ FilterResult FilterCombiner::AddFilter(Expression &expr) { * Create and add new transitive filters from a two non-scalar filter such as j > i, j >= i, j < i, and j <= i * It's missing to create another method to add transitive filters from scalar filters, e.g, i > 10 */ -FilterResult FilterCombiner::AddTransitiveFilters(BoundComparisonExpression &comparison) { +FilterResult FilterCombiner::AddTransitiveFilters(BoundComparisonExpression &comparison, bool is_root) { D_ASSERT(IsGreaterThan(comparison.type) || IsLessThan(comparison.type)); // get the LHS and RHS nodes auto &left_node = GetNode(*comparison.left); reference right_node = GetNode(*comparison.right); // In case with filters like CAST(i) = j and i = 5 we replace the COLUMN_REF i with the constant 5 - if (right_node.get().type == ExpressionType::OPERATOR_CAST) { + do { + if (right_node.get().type != ExpressionType::OPERATOR_CAST) { + break; + } auto &bound_cast_expr = right_node.get().Cast(); - if (bound_cast_expr.child->type == ExpressionType::BOUND_COLUMN_REF) { - auto &col_ref = bound_cast_expr.child->Cast(); - for (auto &stored_exp : stored_expressions) { - if (stored_exp.first.get().type == ExpressionType::BOUND_COLUMN_REF) { - auto &st_col_ref = stored_exp.second->Cast(); - if (st_col_ref.binding == col_ref.binding && - bound_cast_expr.return_type == stored_exp.second->return_type) { - bound_cast_expr.child = stored_exp.second->Copy(); - right_node = GetNode(*bound_cast_expr.child); - break; - } - } + if (bound_cast_expr.child->type != ExpressionType::BOUND_COLUMN_REF) { + break; + } + auto &col_ref = bound_cast_expr.child->Cast(); + for (auto &stored_exp : stored_expressions) { + reference expr = stored_exp.first; + if (expr.get().type == ExpressionType::OPERATOR_CAST) { + expr = *(right_node.get().Cast().child); + } + if (expr.get().type != ExpressionType::BOUND_COLUMN_REF) { + continue; + } + auto &st_col_ref = expr.get().Cast(); + if (st_col_ref.binding != col_ref.binding) { + continue; } + if (bound_cast_expr.return_type != stored_exp.second->return_type) { + continue; + } + bound_cast_expr.child = stored_exp.second->Copy(); + right_node = GetNode(*bound_cast_expr.child); + break; } - } + } while (false); if (left_node.Equals(right_node)) { return FilterResult::UNSUPPORTED; @@ -874,14 +886,16 @@ FilterResult FilterCombiner::AddTransitiveFilters(BoundComparisonExpression &com is_successful = true; } if (is_successful) { - // now check for remaining trasitive filters from the left column - auto transitive_filter = FindTransitiveFilter(*comparison.left); - if (transitive_filter != nullptr) { - // try to add transitive filters - if (AddTransitiveFilters(transitive_filter->Cast()) == - FilterResult::UNSUPPORTED) { - // in case of unsuccessful re-add filter into remaining ones - remaining_filters.push_back(std::move(transitive_filter)); + if (is_root) { + // now check for remaining transitive filters from the left column + auto transitive_filter = FindTransitiveFilter(*comparison.left); + if (transitive_filter != nullptr) { + // try to add transitive filters + auto &transitive_cast = transitive_filter->Cast(); + if (AddTransitiveFilters(transitive_cast, false) == FilterResult::UNSUPPORTED) { + // in case of unsuccessful re-add filter into remaining ones + remaining_filters.push_back(std::move(transitive_filter)); + } } } return FilterResult::SUCCESS; diff --git a/src/duckdb/src/optimizer/join_order/plan_enumerator.cpp b/src/duckdb/src/optimizer/join_order/plan_enumerator.cpp index 37c5616c..a6efb84b 100644 --- a/src/duckdb/src/optimizer/join_order/plan_enumerator.cpp +++ b/src/duckdb/src/optimizer/join_order/plan_enumerator.cpp @@ -1,7 +1,10 @@ -#include "duckdb/optimizer/join_order/join_node.hpp" #include "duckdb/optimizer/join_order/plan_enumerator.hpp" -#include "duckdb/optimizer/join_order/query_graph_manager.hpp" + #include "duckdb/main/client_context.hpp" +#include "duckdb/optimizer/join_order/join_node.hpp" +#include "duckdb/optimizer/join_order/query_graph_manager.hpp" + +#include namespace duckdb { @@ -76,7 +79,7 @@ static vector> GetAllNeighborSets(vector neighbors) // drive by test to make sure we have an accurate amount of // subsets, and that each neighbor is in a correct amount // of those subsets. - D_ASSERT(ret.size() == pow(2, neighbors.size()) - 1); + D_ASSERT(ret.size() == std::pow(2, neighbors.size()) - 1); for (auto &n : neighbors) { idx_t count = 0; for (auto &set : ret) { @@ -84,7 +87,7 @@ static vector> GetAllNeighborSets(vector neighbors) count += 1; } } - D_ASSERT(count == pow(2, neighbors.size() - 1)); + D_ASSERT(count == std::pow(2, neighbors.size() - 1)); } #endif return ret; diff --git a/src/duckdb/src/optimizer/join_order/relation_manager.cpp b/src/duckdb/src/optimizer/join_order/relation_manager.cpp index 9294cd07..0af2a25c 100644 --- a/src/duckdb/src/optimizer/join_order/relation_manager.cpp +++ b/src/duckdb/src/optimizer/join_order/relation_manager.cpp @@ -231,10 +231,11 @@ bool RelationManager::ExtractJoinRelations(LogicalOperator &input_op, return true; } case LogicalOperatorType::LOGICAL_DELIM_GET: { - auto &delim_get = op->Cast(); - auto stats = RelationStatisticsHelper::ExtractDelimGetStats(delim_get, context); - AddRelation(input_op, parent, stats); - return true; + // Removed until we can extract better stats from delim gets. See #596 + // auto &delim_get = op->Cast(); + // auto stats = RelationStatisticsHelper::ExtractDelimGetStats(delim_get, context); + // AddRelation(input_op, parent, stats); + return false; } case LogicalOperatorType::LOGICAL_PROJECTION: { auto child_stats = RelationStats(); diff --git a/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp b/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp index 38dcc278..810d0054 100644 --- a/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp +++ b/src/duckdb/src/optimizer/statistics/operator/propagate_join.cpp @@ -67,6 +67,21 @@ void StatisticsPropagator::PropagateStatistics(LogicalComparisonJoin &join, uniq break; case FilterPropagateResult::FILTER_ALWAYS_TRUE: // filter is always true + // If this is the inequality for an AsOf join, + // then we must leave it in because it also flags + // the semantics of restricting to a single match + // so we can't replace it with an equi-join on the remaining conditions. + if (join.type == LogicalOperatorType::LOGICAL_ASOF_JOIN) { + switch (condition.comparison) { + case ExpressionType::COMPARE_GREATERTHAN: + case ExpressionType::COMPARE_GREATERTHANOREQUALTO: + case ExpressionType::COMPARE_LESSTHAN: + case ExpressionType::COMPARE_LESSTHANOREQUALTO: + continue; + default: + break; + } + } if (join.conditions.size() > 1) { // there are multiple conditions: erase this condition join.conditions.erase(join.conditions.begin() + i); @@ -93,10 +108,6 @@ void StatisticsPropagator::PropagateStatistics(LogicalComparisonJoin &join, uniq *node_ptr = std::move(cross_product); return; } - case JoinType::ANTI: - // anti join on true: empty result - ReplaceWithEmptyResult(*node_ptr); - return; default: // we don't handle mark/single join here yet break; diff --git a/src/duckdb/src/parallel/pipeline_executor.cpp b/src/duckdb/src/parallel/pipeline_executor.cpp index 80fc57ca..c495d408 100644 --- a/src/duckdb/src/parallel/pipeline_executor.cpp +++ b/src/duckdb/src/parallel/pipeline_executor.cpp @@ -106,6 +106,58 @@ bool PipelineExecutor::TryFlushCachingOperators() { return true; } +SinkNextBatchType PipelineExecutor::NextBatch(duckdb::DataChunk &source_chunk) { + D_ASSERT(requires_batch_index); + idx_t next_batch_index; + if (source_chunk.size() == 0) { + next_batch_index = NumericLimits::Maximum(); + } else { + next_batch_index = + pipeline.source->GetBatchIndex(context, source_chunk, *pipeline.source_state, *local_source_state); + // we start with the base_batch_index as a valid starting value. Make sure that next batch is called below + next_batch_index += pipeline.base_batch_index + 1; + } + auto &partition_info = local_sink_state->partition_info; + if (next_batch_index == partition_info.batch_index.GetIndex()) { + // no changes, return + return SinkNextBatchType::READY; + } + // batch index has changed - update it + if (partition_info.batch_index.GetIndex() > next_batch_index) { + throw InternalException( + "Pipeline batch index - gotten lower batch index %llu (down from previous batch index of %llu)", + next_batch_index, partition_info.batch_index.GetIndex()); + } +#ifdef DUCKDB_DEBUG_ASYNC_SINK_SOURCE + if (debug_blocked_next_batch_count < debug_blocked_target_count) { + debug_blocked_next_batch_count++; + + auto &callback_state = interrupt_state; + std::thread rewake_thread([callback_state] { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + callback_state.Callback(); + }); + rewake_thread.detach(); + + return SinkNextBatchType::BLOCKED; + } +#endif + auto current_batch = partition_info.batch_index.GetIndex(); + partition_info.batch_index = next_batch_index; + OperatorSinkNextBatchInput next_batch_input {*pipeline.sink->sink_state, *local_sink_state, interrupt_state}; + // call NextBatch before updating min_batch_index to provide the opportunity to flush the previous batch + auto next_batch_result = pipeline.sink->NextBatch(context, next_batch_input); + + if (next_batch_result == SinkNextBatchType::BLOCKED) { + partition_info.batch_index = current_batch; // set batch_index back to what it was before + return SinkNextBatchType::BLOCKED; + } + + partition_info.min_batch_index = pipeline.UpdateBatchIndex(current_batch, next_batch_index); + + return SinkNextBatchType::READY; +} + PipelineExecuteResult PipelineExecutor::Execute(idx_t max_chunks) { D_ASSERT(pipeline.sink); auto &source_chunk = pipeline.operators.empty() ? final_chunk : *intermediate_chunks[0]; @@ -115,7 +167,8 @@ PipelineExecuteResult PipelineExecutor::Execute(idx_t max_chunks) { } OperatorResultType result; - if (exhausted_source && done_flushing && !remaining_sink_chunk && in_process_operators.empty()) { + if (exhausted_source && done_flushing && !remaining_sink_chunk && !next_batch_blocked && + in_process_operators.empty()) { break; } else if (remaining_sink_chunk) { // The pipeline was interrupted by the Sink. We should retry sinking the final chunk. @@ -127,7 +180,7 @@ PipelineExecuteResult PipelineExecutor::Execute(idx_t max_chunks) { // the result for the pipeline D_ASSERT(source_chunk.size() > 0); result = ExecutePushInternal(source_chunk); - } else if (exhausted_source && !done_flushing) { + } else if (exhausted_source && !next_batch_blocked && !done_flushing) { // The source was exhausted, try flushing all operators auto flush_completed = TryFlushCachingOperators(); if (flush_completed) { @@ -136,21 +189,33 @@ PipelineExecuteResult PipelineExecutor::Execute(idx_t max_chunks) { } else { return PipelineExecuteResult::INTERRUPTED; } - } else if (!exhausted_source) { - // "Regular" path: fetch a chunk from the source and push it through the pipeline - source_chunk.Reset(); - SourceResultType source_result = FetchFromSource(source_chunk); - - if (source_result == SourceResultType::BLOCKED) { - return PipelineExecuteResult::INTERRUPTED; + } else if (!exhausted_source || next_batch_blocked) { + SourceResultType source_result; + if (!next_batch_blocked) { + // "Regular" path: fetch a chunk from the source and push it through the pipeline + source_chunk.Reset(); + source_result = FetchFromSource(source_chunk); + if (source_result == SourceResultType::BLOCKED) { + return PipelineExecuteResult::INTERRUPTED; + } + if (source_result == SourceResultType::FINISHED) { + exhausted_source = true; + } } - if (source_result == SourceResultType::FINISHED) { - exhausted_source = true; - if (source_chunk.size() == 0) { - continue; + if (requires_batch_index) { + auto next_batch_result = NextBatch(source_chunk); + next_batch_blocked = next_batch_result == SinkNextBatchType::BLOCKED; + if (next_batch_blocked) { + return PipelineExecuteResult::INTERRUPTED; } } + + if (exhausted_source && source_chunk.size() == 0) { + // To ensure that we're not early-terminating the pipeline + continue; + } + result = ExecutePushInternal(source_chunk); } else { throw InternalException("Unexpected state reached in pipeline executor"); @@ -285,8 +350,9 @@ void PipelineExecutor::ExecutePull(DataChunk &result) { auto &executor = pipeline.executor; try { D_ASSERT(!pipeline.sink); + D_ASSERT(!requires_batch_index); auto &source_chunk = pipeline.operators.empty() ? result : *intermediate_chunks[0]; - while (result.size() == 0 && !exhausted_source) { + while (result.size() == 0 && (!exhausted_source || !in_process_operators.empty())) { if (in_process_operators.empty()) { source_chunk.Reset(); @@ -296,6 +362,7 @@ void PipelineExecutor::ExecutePull(DataChunk &result) { // Repeatedly try to fetch from the source until it doesn't block. Note that it may block multiple times while (true) { + D_ASSERT(!exhausted_source); source_result = FetchFromSource(source_chunk); // No interrupt happened, all good. @@ -489,32 +556,6 @@ SourceResultType PipelineExecutor::FetchFromSource(DataChunk &result) { // Ensures Sinks only return empty results when Blocking or Finished D_ASSERT(res != SourceResultType::BLOCKED || result.size() == 0); - if (requires_batch_index && res != SourceResultType::BLOCKED) { - idx_t next_batch_index; - if (result.size() == 0) { - next_batch_index = NumericLimits::Maximum(); - } else { - next_batch_index = - pipeline.source->GetBatchIndex(context, result, *pipeline.source_state, *local_source_state); - // we start with the base_batch_index as a valid starting value. Make sure that next batch is called below - next_batch_index += pipeline.base_batch_index + 1; - } - auto &partition_info = local_sink_state->partition_info; - if (next_batch_index != partition_info.batch_index.GetIndex()) { - // batch index has changed - update it - if (partition_info.batch_index.GetIndex() > next_batch_index) { - throw InternalException( - "Pipeline batch index - gotten lower batch index %llu (down from previous batch index of %llu)", - next_batch_index, partition_info.batch_index.GetIndex()); - } - auto current_batch = partition_info.batch_index.GetIndex(); - partition_info.batch_index = next_batch_index; - // call NextBatch before updating min_batch_index to provide the opportunity to flush the previous batch - pipeline.sink->NextBatch(context, *pipeline.sink->sink_state, *local_sink_state); - partition_info.min_batch_index = pipeline.UpdateBatchIndex(current_batch, next_batch_index); - } - } - EndOperator(*pipeline.source, &result); return res; diff --git a/src/duckdb/src/parser/parsed_data/create_view_info.cpp b/src/duckdb/src/parser/parsed_data/create_view_info.cpp index fde4a2c8..792f2547 100644 --- a/src/duckdb/src/parser/parsed_data/create_view_info.cpp +++ b/src/duckdb/src/parser/parsed_data/create_view_info.cpp @@ -19,6 +19,33 @@ CreateViewInfo::CreateViewInfo(SchemaCatalogEntry &schema, string view_name) : CreateViewInfo(schema.catalog.GetName(), schema.name, std::move(view_name)) { } +string CreateViewInfo::ToString() const { + string result; + + result += "CREATE"; + if (on_conflict == OnCreateConflict::REPLACE_ON_CONFLICT) { + result += " OR REPLACE"; + } + if (temporary) { + result += " TEMPORARY"; + } + result += " VIEW "; + if (schema != DEFAULT_SCHEMA) { + result += KeywordHelper::WriteOptionallyQuoted(schema); + result += "."; + } + result += KeywordHelper::WriteOptionallyQuoted(view_name); + if (!aliases.empty()) { + result += " ("; + result += StringUtil::Join(aliases, aliases.size(), ", ", + [](const string &name) { return KeywordHelper::WriteOptionallyQuoted(name); }); + result += ")"; + } + result += " AS "; + result += query->ToString(); + return result; +} + unique_ptr CreateViewInfo::Copy() const { auto result = make_uniq(catalog, schema, view_name); CopyProperties(*result); diff --git a/src/duckdb/src/parser/statement/create_statement.cpp b/src/duckdb/src/parser/statement/create_statement.cpp index 514807f0..fbc86c87 100644 --- a/src/duckdb/src/parser/statement/create_statement.cpp +++ b/src/duckdb/src/parser/statement/create_statement.cpp @@ -12,4 +12,8 @@ unique_ptr CreateStatement::Copy() const { return unique_ptr(new CreateStatement(*this)); } +string CreateStatement::ToString() const { + return info->ToString(); +} + } // namespace duckdb diff --git a/src/duckdb/src/parser/transform/statement/transform_pivot_stmt.cpp b/src/duckdb/src/parser/transform/statement/transform_pivot_stmt.cpp index 849af1a3..a554b2a6 100644 --- a/src/duckdb/src/parser/transform/statement/transform_pivot_stmt.cpp +++ b/src/duckdb/src/parser/transform/statement/transform_pivot_stmt.cpp @@ -19,9 +19,10 @@ namespace duckdb { void Transformer::AddPivotEntry(string enum_name, unique_ptr base, unique_ptr column, - unique_ptr subquery) { + unique_ptr subquery, bool has_parameters) { if (parent) { - parent->AddPivotEntry(std::move(enum_name), std::move(base), std::move(column), std::move(subquery)); + parent->AddPivotEntry(std::move(enum_name), std::move(base), std::move(column), std::move(subquery), + has_parameters); return; } auto result = make_uniq(); @@ -29,6 +30,7 @@ void Transformer::AddPivotEntry(string enum_name, unique_ptr base, u result->base = std::move(base); result->column = std::move(column); result->subquery = std::move(subquery); + result->has_parameters = has_parameters; pivot_entries.push_back(std::move(result)); } @@ -113,6 +115,13 @@ unique_ptr Transformer::GenerateCreateEnumStmt(unique_ptr Transformer::CreatePivotStatement(unique_ptr statement) { auto result = make_uniq(); for (auto &pivot : pivot_entries) { + if (pivot->has_parameters) { + throw ParserException( + "PIVOT statements with pivot elements extracted from the data cannot have parameters in their source.\n" + "In order to use parameters the PIVOT values must be manually specified, e.g.:\n" + "PIVOT ... ON %s IN (val1, val2, ...)", + pivot->column->ToString()); + } result->statements.push_back(GenerateCreateEnumStmt(std::move(pivot))); } result->statements.push_back(std::move(statement)); @@ -125,7 +134,10 @@ unique_ptr Transformer::CreatePivotStatement(unique_ptr Transformer::TransformPivotStatement(duckdb_libpgquery::PGSelectStmt &select) { auto pivot = select.pivot; + auto current_param_count = ParamCount(); auto source = TransformTableRefNode(*pivot->source); + auto next_param_count = ParamCount(); + bool has_parameters = next_param_count > current_param_count; auto select_node = make_uniq(); vector> materialized_ctes; @@ -171,7 +183,8 @@ unique_ptr Transformer::TransformPivotStatement(duckdb_libpgquery::PG auto new_select = make_uniq(); ExtractCTEsRecursive(new_select->cte_map); new_select->from_table = source->Copy(); - AddPivotEntry(enum_name, std::move(new_select), col.pivot_expressions[0]->Copy(), std::move(col.subquery)); + AddPivotEntry(enum_name, std::move(new_select), col.pivot_expressions[0]->Copy(), std::move(col.subquery), + has_parameters); col.pivot_enum = enum_name; } diff --git a/src/duckdb/src/planner/binder/expression/bind_window_expression.cpp b/src/duckdb/src/planner/binder/expression/bind_window_expression.cpp index 644c96ed..6a59ea4a 100644 --- a/src/duckdb/src/planner/binder/expression/bind_window_expression.cpp +++ b/src/duckdb/src/planner/binder/expression/bind_window_expression.cpp @@ -151,6 +151,13 @@ BindResult BaseSelectBinder::BindWindow(WindowExpression &window, idx_t depth) { // failed to bind children of window function return BindResult(error); } + + // Restore any collation expressions + for (auto &order : window.orders) { + auto &order_expr = order.expression; + auto &bound_order = BoundExpression::GetExpression(*order_expr); + ExpressionBinder::PushCollation(context, bound_order, bound_order->return_type, false); + } // successfully bound all children: create bound window function vector types; vector> children; diff --git a/src/duckdb/src/planner/binder/statement/bind_copy.cpp b/src/duckdb/src/planner/binder/statement/bind_copy.cpp index 35cc798e..251fb01e 100644 --- a/src/duckdb/src/planner/binder/statement/bind_copy.cpp +++ b/src/duckdb/src/planner/binder/statement/bind_copy.cpp @@ -141,12 +141,13 @@ BoundStatement Binder::BindCopyTo(CopyStatement &stmt) { } auto unique_column_names = GetUniqueNames(select_node.names); + auto file_path = stmt.info->file_path; auto function_data = copy_function.function.copy_to_bind(context, *stmt.info, unique_column_names, select_node.types); // now create the copy information - auto copy = make_uniq(copy_function.function, std::move(function_data)); - copy->file_path = stmt.info->file_path; + auto copy = make_uniq(copy_function.function, std::move(function_data), std::move(stmt.info)); + copy->file_path = file_path; copy->use_tmp_file = use_tmp_file; copy->overwrite_or_ignore = overwrite_or_ignore; copy->filename_pattern = filename_pattern; diff --git a/src/duckdb/src/planner/binder/statement/bind_create.cpp b/src/duckdb/src/planner/binder/statement/bind_create.cpp index 972a7128..1a572228 100644 --- a/src/duckdb/src/planner/binder/statement/bind_create.cpp +++ b/src/duckdb/src/planner/binder/statement/bind_create.cpp @@ -502,6 +502,9 @@ BoundStatement Binder::Bind(CreateStatement &stmt) { case CatalogType::INDEX_ENTRY: { auto &base = stmt.info->Cast(); + auto catalog = BindCatalog(base.catalog); + properties.modified_databases.insert(catalog); + // visit the table reference auto table_ref = make_uniq(); table_ref->catalog_name = base.catalog; diff --git a/src/duckdb/src/planner/binder/statement/bind_drop.cpp b/src/duckdb/src/planner/binder/statement/bind_drop.cpp index 700aa167..056c70b6 100644 --- a/src/duckdb/src/planner/binder/statement/bind_drop.cpp +++ b/src/duckdb/src/planner/binder/statement/bind_drop.cpp @@ -44,7 +44,7 @@ BoundStatement Binder::Bind(DropStatement &stmt) { } stmt.info->catalog = entry->ParentCatalog().GetName(); if (!entry->temporary) { - // we can only drop temporary tables in read-only mode + // we can only drop temporary schema entries in read-only mode properties.modified_databases.insert(stmt.info->catalog); } stmt.info->schema = entry->ParentSchema().name; diff --git a/src/duckdb/src/planner/operator/logical_copy_to_file.cpp b/src/duckdb/src/planner/operator/logical_copy_to_file.cpp index c3654b86..93572634 100644 --- a/src/duckdb/src/planner/operator/logical_copy_to_file.cpp +++ b/src/duckdb/src/planner/operator/logical_copy_to_file.cpp @@ -2,15 +2,89 @@ #include "duckdb/catalog/catalog_entry/copy_function_catalog_entry.hpp" #include "duckdb/function/copy_function.hpp" +#include "duckdb/function/function_serialization.hpp" + +#include "duckdb/common/serializer/serializer.hpp" +#include "duckdb/common/serializer/deserializer.hpp" namespace duckdb { void LogicalCopyToFile::Serialize(Serializer &serializer) const { - throw SerializationException("LogicalCopyToFile not implemented yet"); + LogicalOperator::Serialize(serializer); + serializer.WriteProperty(200, "file_path", file_path); + serializer.WriteProperty(201, "use_tmp_file", use_tmp_file); + serializer.WriteProperty(202, "filename_pattern", filename_pattern); + serializer.WriteProperty(203, "overwrite_or_ignore", overwrite_or_ignore); + serializer.WriteProperty(204, "per_thread_output", per_thread_output); + serializer.WriteProperty(205, "partition_output", partition_output); + serializer.WriteProperty(206, "partition_columns", partition_columns); + serializer.WriteProperty(207, "names", names); + serializer.WriteProperty(208, "expected_types", expected_types); + serializer.WriteProperty(209, "copy_info", copy_info); + + // Serialize function + serializer.WriteProperty(210, "function_name", function.name); + + bool has_serialize = function.serialize; + serializer.WriteProperty(211, "function_has_serialize", has_serialize); + if (has_serialize) { + D_ASSERT(function.deserialize); // if serialize is set, deserialize should be set as well + serializer.WriteObject(212, "function_data", + [&](Serializer &obj) { function.serialize(obj, *bind_data, function); }); + } } unique_ptr LogicalCopyToFile::Deserialize(Deserializer &deserializer) { - throw SerializationException("LogicalCopyToFile not implemented yet"); + auto file_path = deserializer.ReadProperty(200, "file_path"); + auto use_tmp_file = deserializer.ReadProperty(201, "use_tmp_file"); + auto filename_pattern = deserializer.ReadProperty(202, "filename_pattern"); + auto overwrite_or_ignore = deserializer.ReadProperty(203, "overwrite_or_ignore"); + auto per_thread_output = deserializer.ReadProperty(204, "per_thread_output"); + auto partition_output = deserializer.ReadProperty(205, "partition_output"); + auto partition_columns = deserializer.ReadProperty>(206, "partition_columns"); + auto names = deserializer.ReadProperty>(207, "names"); + auto expected_types = deserializer.ReadProperty>(208, "expected_types"); + auto copy_info = + unique_ptr_cast(deserializer.ReadProperty>(209, "copy_info")); + + // Deserialize function + auto &context = deserializer.Get(); + auto name = deserializer.ReadProperty(210, "function_name"); + + auto &func_catalog_entry = + Catalog::GetEntry(context, CatalogType::COPY_FUNCTION_ENTRY, SYSTEM_CATALOG, DEFAULT_SCHEMA, name); + if (func_catalog_entry.type != CatalogType::COPY_FUNCTION_ENTRY) { + throw InternalException("DeserializeFunction - cant find catalog entry for function %s", name); + } + auto &function_entry = func_catalog_entry.Cast(); + auto function = function_entry.function; + // Deserialize function data + unique_ptr bind_data; + auto has_serialize = deserializer.ReadProperty(211, "function_has_serialize"); + if (has_serialize) { + // Just deserialize the bind data + deserializer.ReadObject(212, "function_data", + [&](Deserializer &obj) { bind_data = function.deserialize(obj, function); }); + } else { + // Otherwise, re-bind with the copy info + if (!function.copy_to_bind) { + throw InternalException("Copy function \"%s\" has neither bind nor (de)serialize", function.name); + } + bind_data = function.copy_to_bind(context, *copy_info, names, expected_types); + } + + auto result = make_uniq(function, std::move(bind_data), std::move(copy_info)); + result->file_path = file_path; + result->use_tmp_file = use_tmp_file; + result->filename_pattern = filename_pattern; + result->overwrite_or_ignore = overwrite_or_ignore; + result->per_thread_output = per_thread_output; + result->partition_output = partition_output; + result->partition_columns = partition_columns; + result->names = names; + result->expected_types = expected_types; + + return std::move(result); } idx_t LogicalCopyToFile::EstimateCardinality(ClientContext &context) { diff --git a/src/duckdb/src/storage/data_table.cpp b/src/duckdb/src/storage/data_table.cpp index 45a0a379..4cc7e206 100644 --- a/src/duckdb/src/storage/data_table.cpp +++ b/src/duckdb/src/storage/data_table.cpp @@ -1317,8 +1317,14 @@ idx_t DataTable::GetTotalRows() { } void DataTable::CommitDropTable() { - // commit a drop of this table: mark all blocks as modified so they can be reclaimed later on + // commit a drop of this table: mark all blocks as modified, so they can be reclaimed later on row_groups->CommitDropTable(); + + // propagate dropping this table to its indexes: frees all index memory + info->indexes.Scan([&](Index &index) { + index.CommitDrop(); + return false; + }); } //===--------------------------------------------------------------------===// diff --git a/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp b/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp index f64dcab5..ab8dee98 100644 --- a/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp +++ b/src/duckdb/src/storage/serialization/serialize_logical_operator.cpp @@ -189,6 +189,20 @@ unique_ptr LogicalOperator::Deserialize(Deserializer &deseriali return result; } +void FilenamePattern::Serialize(Serializer &serializer) const { + serializer.WritePropertyWithDefault(200, "base", _base); + serializer.WritePropertyWithDefault(201, "pos", _pos); + serializer.WritePropertyWithDefault(202, "uuid", _uuid); +} + +FilenamePattern FilenamePattern::Deserialize(Deserializer &deserializer) { + FilenamePattern result; + deserializer.ReadPropertyWithDefault(200, "base", result._base); + deserializer.ReadPropertyWithDefault(201, "pos", result._pos); + deserializer.ReadPropertyWithDefault(202, "uuid", result._uuid); + return result; +} + void LogicalAggregate::Serialize(Serializer &serializer) const { LogicalOperator::Serialize(serializer); serializer.WritePropertyWithDefault>>(200, "expressions", expressions); diff --git a/src/duckdb/src/storage/storage_info.cpp b/src/duckdb/src/storage/storage_info.cpp index e3f0fffc..0e124b5e 100644 --- a/src/duckdb/src/storage/storage_info.cpp +++ b/src/duckdb/src/storage/storage_info.cpp @@ -9,7 +9,8 @@ struct StorageVersionInfo { idx_t storage_version; }; -static StorageVersionInfo storage_version_info[] = {{"v0.8.0 or v0.8.1", 51}, +static StorageVersionInfo storage_version_info[] = {{"v0.9.0 or v0.9.1", 64}, + {"v0.8.0 or v0.8.1", 51}, {"v0.7.0 or v0.7.1", 43}, {"v0.6.0 or v0.6.1", 39}, {"v0.5.0 or v0.5.1", 38}, diff --git a/src/duckdb/src/storage/table/row_version_manager.cpp b/src/duckdb/src/storage/table/row_version_manager.cpp index 9fa7c46c..945d0a3b 100644 --- a/src/duckdb/src/storage/table/row_version_manager.cpp +++ b/src/duckdb/src/storage/table/row_version_manager.cpp @@ -110,6 +110,9 @@ void RowVersionManager::AppendVersionInfo(TransactionData transaction, idx_t cou } void RowVersionManager::CommitAppend(transaction_t commit_id, idx_t row_group_start, idx_t count) { + if (count == 0) { + return; + } idx_t row_group_end = row_group_start + count; lock_guard lock(version_lock); @@ -119,9 +122,8 @@ void RowVersionManager::CommitAppend(transaction_t commit_id, idx_t row_group_st idx_t vstart = vector_idx == start_vector_idx ? row_group_start - start_vector_idx * STANDARD_VECTOR_SIZE : 0; idx_t vend = vector_idx == end_vector_idx ? row_group_end - end_vector_idx * STANDARD_VECTOR_SIZE : STANDARD_VECTOR_SIZE; - - auto info = vector_info[vector_idx].get(); - info->CommitAppend(commit_id, vstart, vend); + auto &info = *vector_info[vector_idx]; + info.CommitAppend(commit_id, vstart, vend); } } diff --git a/src/duckdb/src/transaction/commit_state.cpp b/src/duckdb/src/transaction/commit_state.cpp index 6f844317..5715d437 100644 --- a/src/duckdb/src/transaction/commit_state.cpp +++ b/src/duckdb/src/transaction/commit_state.cpp @@ -254,6 +254,7 @@ void CommitState::CommitEntry(UndoFlags type, data_ptr_t data) { // Grab a write lock on the catalog auto &duck_catalog = catalog.Cast(); lock_guard write_lock(duck_catalog.GetWriteLock()); + lock_guard read_lock(catalog_entry->set->GetCatalogLock()); catalog_entry->set->UpdateTimestamp(*catalog_entry->parent, commit_id); if (catalog_entry->name != catalog_entry->parent->name) { catalog_entry->set->UpdateTimestamp(*catalog_entry, commit_id); diff --git a/src/duckdb/third_party/parquet/parquet_types.cpp b/src/duckdb/third_party/parquet/parquet_types.cpp index daa065bc..9df4b8e9 100644 --- a/src/duckdb/third_party/parquet/parquet_types.cpp +++ b/src/duckdb/third_party/parquet/parquet_types.cpp @@ -13,226 +13,228 @@ namespace duckdb_parquet { namespace format { -int _kTypeValues[] = { - Type::BOOLEAN, - Type::INT32, - Type::INT64, - Type::INT96, - Type::FLOAT, - Type::DOUBLE, - Type::BYTE_ARRAY, - Type::FIXED_LEN_BYTE_ARRAY -}; -const char* _kTypeNames[] = { - "BOOLEAN", - "INT32", - "INT64", - "INT96", - "FLOAT", - "DOUBLE", - "BYTE_ARRAY", - "FIXED_LEN_BYTE_ARRAY" -}; -const std::map _Type_VALUES_TO_NAMES(::duckdb_apache::thrift::TEnumIterator(8, _kTypeValues, _kTypeNames), ::duckdb_apache::thrift::TEnumIterator(-1, NULL, NULL)); - -std::ostream& operator<<(std::ostream& out, const Type::type& val) { - std::map::const_iterator it = _Type_VALUES_TO_NAMES.find(val); - if (it != _Type_VALUES_TO_NAMES.end()) { - out << it->second; - } else { - out << static_cast(val); - } - return out; -} - -int _kConvertedTypeValues[] = { - ConvertedType::UTF8, - ConvertedType::MAP, - ConvertedType::MAP_KEY_VALUE, - ConvertedType::LIST, - ConvertedType::ENUM, - ConvertedType::DECIMAL, - ConvertedType::DATE, - ConvertedType::TIME_MILLIS, - ConvertedType::TIME_MICROS, - ConvertedType::TIMESTAMP_MILLIS, - ConvertedType::TIMESTAMP_MICROS, - ConvertedType::UINT_8, - ConvertedType::UINT_16, - ConvertedType::UINT_32, - ConvertedType::UINT_64, - ConvertedType::INT_8, - ConvertedType::INT_16, - ConvertedType::INT_32, - ConvertedType::INT_64, - ConvertedType::JSON, - ConvertedType::BSON, - ConvertedType::INTERVAL -}; -const char* _kConvertedTypeNames[] = { - "UTF8", - "MAP", - "MAP_KEY_VALUE", - "LIST", - "ENUM", - "DECIMAL", - "DATE", - "TIME_MILLIS", - "TIME_MICROS", - "TIMESTAMP_MILLIS", - "TIMESTAMP_MICROS", - "UINT_8", - "UINT_16", - "UINT_32", - "UINT_64", - "INT_8", - "INT_16", - "INT_32", - "INT_64", - "JSON", - "BSON", - "INTERVAL" -}; -const std::map _ConvertedType_VALUES_TO_NAMES(::duckdb_apache::thrift::TEnumIterator(22, _kConvertedTypeValues, _kConvertedTypeNames), ::duckdb_apache::thrift::TEnumIterator(-1, NULL, NULL)); - -std::ostream& operator<<(std::ostream& out, const ConvertedType::type& val) { - std::map::const_iterator it = _ConvertedType_VALUES_TO_NAMES.find(val); - if (it != _ConvertedType_VALUES_TO_NAMES.end()) { - out << it->second; - } else { - out << static_cast(val); - } - return out; -} - -int _kFieldRepetitionTypeValues[] = { - FieldRepetitionType::REQUIRED, - FieldRepetitionType::OPTIONAL, - FieldRepetitionType::REPEATED -}; -const char* _kFieldRepetitionTypeNames[] = { - "REQUIRED", - "OPTIONAL", - "REPEATED" -}; -const std::map _FieldRepetitionType_VALUES_TO_NAMES(::duckdb_apache::thrift::TEnumIterator(3, _kFieldRepetitionTypeValues, _kFieldRepetitionTypeNames), ::duckdb_apache::thrift::TEnumIterator(-1, NULL, NULL)); - -std::ostream& operator<<(std::ostream& out, const FieldRepetitionType::type& val) { - std::map::const_iterator it = _FieldRepetitionType_VALUES_TO_NAMES.find(val); - if (it != _FieldRepetitionType_VALUES_TO_NAMES.end()) { - out << it->second; - } else { - out << static_cast(val); - } - return out; -} - -int _kEncodingValues[] = { - Encoding::PLAIN, - Encoding::PLAIN_DICTIONARY, - Encoding::RLE, - Encoding::BIT_PACKED, - Encoding::DELTA_BINARY_PACKED, - Encoding::DELTA_LENGTH_BYTE_ARRAY, - Encoding::DELTA_BYTE_ARRAY, - Encoding::RLE_DICTIONARY -}; -const char* _kEncodingNames[] = { - "PLAIN", - "PLAIN_DICTIONARY", - "RLE", - "BIT_PACKED", - "DELTA_BINARY_PACKED", - "DELTA_LENGTH_BYTE_ARRAY", - "DELTA_BYTE_ARRAY", - "RLE_DICTIONARY" -}; -const std::map _Encoding_VALUES_TO_NAMES(::duckdb_apache::thrift::TEnumIterator(8, _kEncodingValues, _kEncodingNames), ::duckdb_apache::thrift::TEnumIterator(-1, NULL, NULL)); - -std::ostream& operator<<(std::ostream& out, const Encoding::type& val) { - std::map::const_iterator it = _Encoding_VALUES_TO_NAMES.find(val); - if (it != _Encoding_VALUES_TO_NAMES.end()) { - out << it->second; - } else { - out << static_cast(val); - } - return out; -} - -int _kCompressionCodecValues[] = { - CompressionCodec::UNCOMPRESSED, - CompressionCodec::SNAPPY, - CompressionCodec::GZIP, - CompressionCodec::LZO, - CompressionCodec::BROTLI, - CompressionCodec::LZ4, - CompressionCodec::ZSTD -}; -const char* _kCompressionCodecNames[] = { - "UNCOMPRESSED", - "SNAPPY", - "GZIP", - "LZO", - "BROTLI", - "LZ4", - "ZSTD" -}; -const std::map _CompressionCodec_VALUES_TO_NAMES(::duckdb_apache::thrift::TEnumIterator(7, _kCompressionCodecValues, _kCompressionCodecNames), ::duckdb_apache::thrift::TEnumIterator(-1, NULL, NULL)); - -std::ostream& operator<<(std::ostream& out, const CompressionCodec::type& val) { - std::map::const_iterator it = _CompressionCodec_VALUES_TO_NAMES.find(val); - if (it != _CompressionCodec_VALUES_TO_NAMES.end()) { - out << it->second; - } else { - out << static_cast(val); - } - return out; -} - -int _kPageTypeValues[] = { - PageType::DATA_PAGE, - PageType::INDEX_PAGE, - PageType::DICTIONARY_PAGE, - PageType::DATA_PAGE_V2 -}; -const char* _kPageTypeNames[] = { - "DATA_PAGE", - "INDEX_PAGE", - "DICTIONARY_PAGE", - "DATA_PAGE_V2" -}; -const std::map _PageType_VALUES_TO_NAMES(::duckdb_apache::thrift::TEnumIterator(4, _kPageTypeValues, _kPageTypeNames), ::duckdb_apache::thrift::TEnumIterator(-1, NULL, NULL)); - -std::ostream& operator<<(std::ostream& out, const PageType::type& val) { - std::map::const_iterator it = _PageType_VALUES_TO_NAMES.find(val); - if (it != _PageType_VALUES_TO_NAMES.end()) { - out << it->second; - } else { - out << static_cast(val); - } - return out; -} - -int _kBoundaryOrderValues[] = { - BoundaryOrder::UNORDERED, - BoundaryOrder::ASCENDING, - BoundaryOrder::DESCENDING -}; -const char* _kBoundaryOrderNames[] = { - "UNORDERED", - "ASCENDING", - "DESCENDING" -}; -const std::map _BoundaryOrder_VALUES_TO_NAMES(::duckdb_apache::thrift::TEnumIterator(3, _kBoundaryOrderValues, _kBoundaryOrderNames), ::duckdb_apache::thrift::TEnumIterator(-1, NULL, NULL)); - -std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val) { - std::map::const_iterator it = _BoundaryOrder_VALUES_TO_NAMES.find(val); - if (it != _BoundaryOrder_VALUES_TO_NAMES.end()) { - out << it->second; - } else { - out << static_cast(val); - } - return out; +std::ostream &operator<<(std::ostream &out, const Type::type &val) { + switch (val) { + case Type::BOOLEAN: + out << "BOOLEAN"; + return out; + case Type::INT32: + out << "INT32"; + return out; + case Type::INT64: + out << "INT64"; + return out; + case Type::INT96: + out << "INT96"; + return out; + case Type::FLOAT: + out << "FLOAT"; + return out; + case Type::DOUBLE: + out << "DOUBLE"; + return out; + case Type::BYTE_ARRAY: + out << "BYTE_ARRAY"; + return out; + case Type::FIXED_LEN_BYTE_ARRAY: + out << "FIXED_LEN_BYTE_ARRAY"; + return out; + // no default for compiler error on missing enum + } + out << static_cast(val); + return out; +} + +std::ostream &operator<<(std::ostream &out, const ConvertedType::type &val) { + switch (val) { + case ConvertedType::UTF8: + out << "UTF8"; + return out; + case ConvertedType::MAP: + out << "MAP"; + return out; + case ConvertedType::MAP_KEY_VALUE: + out << "MAP_KEY_VALUE"; + return out; + case ConvertedType::LIST: + out << "LIST"; + return out; + case ConvertedType::ENUM: + out << "ENUM"; + return out; + case ConvertedType::DECIMAL: + out << "DECIMAL"; + return out; + case ConvertedType::DATE: + out << "DATE"; + return out; + case ConvertedType::TIME_MILLIS: + out << "TIME_MILLIS"; + return out; + case ConvertedType::TIME_MICROS: + out << "TIME_MICROS"; + return out; + case ConvertedType::TIMESTAMP_MILLIS: + out << "TIMESTAMP_MILLIS"; + return out; + case ConvertedType::TIMESTAMP_MICROS: + out << "TIMESTAMP_MICROS"; + return out; + case ConvertedType::UINT_8: + out << "UINT_8"; + return out; + case ConvertedType::UINT_16: + out << "UINT_16"; + return out; + case ConvertedType::UINT_32: + out << "UINT_32"; + return out; + case ConvertedType::UINT_64: + out << "UINT_64"; + return out; + case ConvertedType::INT_8: + out << "INT_8"; + return out; + case ConvertedType::INT_16: + out << "INT_16"; + return out; + case ConvertedType::INT_32: + out << "INT_32"; + return out; + case ConvertedType::INT_64: + out << "INT_64"; + return out; + case ConvertedType::JSON: + out << "JSON"; + return out; + case ConvertedType::BSON: + out << "BSON"; + return out; + case ConvertedType::INTERVAL: + out << "INTERVAL"; + return out; + // no default for compiler error on missing enum + } + out << static_cast(val); + return out; +} + +std::ostream &operator<<(std::ostream &out, const FieldRepetitionType::type &val) { + switch (val) { + case FieldRepetitionType::REQUIRED: + out << "REQUIRED"; + return out; + case FieldRepetitionType::OPTIONAL: + out << "OPTIONAL"; + return out; + case FieldRepetitionType::REPEATED: + out << "REPEATED"; + return out; + // no default for compiler error on missing enum + } + out << static_cast(val); + return out; +} + +std::ostream &operator<<(std::ostream &out, const Encoding::type &val) { + switch (val) { + case Encoding::PLAIN: + out << "PLAIN"; + return out; + case Encoding::PLAIN_DICTIONARY: + out << "PLAIN_DICTIONARY"; + return out; + case Encoding::RLE: + out << "RLE"; + return out; + case Encoding::BIT_PACKED: + out << "BIT_PACKED"; + return out; + case Encoding::DELTA_BINARY_PACKED: + out << "DELTA_BINARY_PACKED"; + return out; + case Encoding::DELTA_LENGTH_BYTE_ARRAY: + out << "DELTA_LENGTH_BYTE_ARRAY"; + return out; + case Encoding::DELTA_BYTE_ARRAY: + out << "DELTA_BYTE_ARRAY"; + return out; + case Encoding::RLE_DICTIONARY: + out << "RLE_DICTIONARY"; + return out; + case Encoding::BYTE_STREAM_SPLIT: + out << "BYTE_STREAM_SPLIT"; + return out; + // no default for compiler error on missing enum + } + out << static_cast(val); + return out; +} + +std::ostream &operator<<(std::ostream &out, const CompressionCodec::type &val) { + switch (val) { + case CompressionCodec::UNCOMPRESSED: + out << "UNCOMPRESSED"; + return out; + case CompressionCodec::SNAPPY: + out << "SNAPPY"; + return out; + case CompressionCodec::GZIP: + out << "GZIP"; + return out; + case CompressionCodec::LZO: + out << "LZO"; + return out; + case CompressionCodec::BROTLI: + out << "BROTLI"; + return out; + case CompressionCodec::LZ4: + out << "LZ4"; + return out; + case CompressionCodec::ZSTD: + out << "ZSTD"; + return out; + // no default for compiler error on missing enum + } + out << static_cast(val); + return out; +} + +std::ostream &operator<<(std::ostream &out, const PageType::type &val) { + switch (val) { + case PageType::DATA_PAGE: + out << "DATA_PAGE"; + return out; + case PageType::INDEX_PAGE: + out << "INDEX_PAGE"; + return out; + case PageType::DICTIONARY_PAGE: + out << "DICTIONARY_PAGE"; + return out; + case PageType::DATA_PAGE_V2: + out << "DATA_PAGE_V2"; + return out; + // no default for compiler error on missing enum + } + out << static_cast(val); + return out; +} + +std::ostream &operator<<(std::ostream &out, const BoundaryOrder::type &val) { + switch (val) { + case BoundaryOrder::UNORDERED: + out << "UNORDERED"; + return out; + case BoundaryOrder::ASCENDING: + out << "ASCENDING"; + return out; + case BoundaryOrder::DESCENDING: + out << "DESCENDING"; + return out; + // no default for compiler error on missing enum + } + out << static_cast(val); + return out; } @@ -6675,4 +6677,5 @@ void FileCryptoMetaData::printTo(std::ostream& out) const { out << ")"; } -}} // namespace + +}} // namespace \ No newline at end of file diff --git a/src/duckdb/third_party/parquet/parquet_types.h b/src/duckdb/third_party/parquet/parquet_types.h index 3037b1ec..434f9db4 100644 --- a/src/duckdb/third_party/parquet/parquet_types.h +++ b/src/duckdb/third_party/parquet/parquet_types.h @@ -36,8 +36,6 @@ struct Type { }; }; -extern const std::map _Type_VALUES_TO_NAMES; - std::ostream& operator<<(std::ostream& out, const Type::type& val); struct ConvertedType { @@ -67,8 +65,6 @@ struct ConvertedType { }; }; -extern const std::map _ConvertedType_VALUES_TO_NAMES; - std::ostream& operator<<(std::ostream& out, const ConvertedType::type& val); struct FieldRepetitionType { @@ -79,8 +75,6 @@ struct FieldRepetitionType { }; }; -extern const std::map _FieldRepetitionType_VALUES_TO_NAMES; - std::ostream& operator<<(std::ostream& out, const FieldRepetitionType::type& val); struct Encoding { @@ -97,8 +91,6 @@ struct Encoding { }; }; -extern const std::map _Encoding_VALUES_TO_NAMES; - std::ostream& operator<<(std::ostream& out, const Encoding::type& val); struct CompressionCodec { @@ -113,8 +105,6 @@ struct CompressionCodec { }; }; -extern const std::map _CompressionCodec_VALUES_TO_NAMES; - std::ostream& operator<<(std::ostream& out, const CompressionCodec::type& val); struct PageType { @@ -126,8 +116,6 @@ struct PageType { }; }; -extern const std::map _PageType_VALUES_TO_NAMES; - std::ostream& operator<<(std::ostream& out, const PageType::type& val); struct BoundaryOrder { @@ -138,8 +126,6 @@ struct BoundaryOrder { }; }; -extern const std::map _BoundaryOrder_VALUES_TO_NAMES; - std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val); class Statistics; diff --git a/src/duckdb/ub_src_common_arrow_appender.cpp b/src/duckdb/ub_src_common_arrow_appender.cpp index 94af8f11..e3adc18a 100644 --- a/src/duckdb/ub_src_common_arrow_appender.cpp +++ b/src/duckdb/ub_src_common_arrow_appender.cpp @@ -1,9 +1,5 @@ #include "src/common/arrow/appender/bool_data.cpp" -#include "src/common/arrow/appender/list_data.cpp" - -#include "src/common/arrow/appender/map_data.cpp" - #include "src/common/arrow/appender/struct_data.cpp" #include "src/common/arrow/appender/union_data.cpp" diff --git a/src/duckdb/ub_src_function_table_arrow.cpp b/src/duckdb/ub_src_function_table_arrow.cpp index 8cb9298c..3a169777 100644 --- a/src/duckdb/ub_src_function_table_arrow.cpp +++ b/src/duckdb/ub_src_function_table_arrow.cpp @@ -1,2 +1,4 @@ #include "src/function/table/arrow/arrow_duck_schema.cpp" +#include "src/function/table/arrow/arrow_array_scan_state.cpp" + diff --git a/src/duckdb/ub_src_function_table_system.cpp b/src/duckdb/ub_src_function_table_system.cpp index fd325e19..11a8b99a 100644 --- a/src/duckdb/ub_src_function_table_system.cpp +++ b/src/duckdb/ub_src_function_table_system.cpp @@ -38,6 +38,8 @@ #include "src/function/table/system/pragma_table_info.cpp" +#include "src/function/table/system/pragma_user_agent.cpp" + #include "src/function/table/system/test_all_types.cpp" #include "src/function/table/system/test_vector_types.cpp" diff --git a/test/columns.test.ts b/test/columns.test.ts index 55dabce2..52bfdb3d 100644 --- a/test/columns.test.ts +++ b/test/columns.test.ts @@ -236,7 +236,7 @@ describe('Column Types', function() { name: "union", type: { id: "UNION", - sql_type: "UNION(name VARCHAR, age SMALLINT)", + sql_type: "UNION(\"name\" VARCHAR, age SMALLINT)", children: [ { name: "name",