Commit 21405f2

Merge pull request #10 from hannes/main
Update to DuckDB 0.9.1
hannes authored Oct 11, 2023
2 parents: c6fc2fd + 1750ef0
Showing 85 changed files with 763 additions and 471 deletions.
src/duckdb/extension/icu/icu-makedate.cpp (2 changes: 1 addition & 1 deletion)

@@ -23,7 +23,7 @@ struct ICUMakeDate : public ICUDateFunc {
}

// Extract the time zone parts
- auto micros = SetTime(calendar, instant);
+ SetTime(calendar, instant);
const auto era = ExtractField(calendar, UCAL_ERA);
const auto year = ExtractField(calendar, UCAL_YEAR);
const auto mm = ExtractField(calendar, UCAL_MONTH) + 1;
src/duckdb/extension/icu/icu-strptime.cpp (2 changes: 0 additions & 2 deletions)

@@ -96,7 +96,6 @@ struct ICUStrptime : public ICUDateFunc {
auto &info = func_expr.bind_info->Cast<ICUStrptimeBindData>();
CalendarPtr calendar_ptr(info.calendar->clone());
auto calendar = calendar_ptr.get();
- auto &formats = info.formats;

D_ASSERT(fmt_arg.GetVectorType() == VectorType::CONSTANT_VECTOR);

@@ -126,7 +125,6 @@ struct ICUStrptime : public ICUDateFunc {
auto &info = func_expr.bind_info->Cast<ICUStrptimeBindData>();
CalendarPtr calendar_ptr(info.calendar->clone());
auto calendar = calendar_ptr.get();
- auto &formats = info.formats;

D_ASSERT(fmt_arg.GetVectorType() == VectorType::CONSTANT_VECTOR);

src/duckdb/extension/icu/icu_extension.cpp (1 change: 0 additions & 1 deletion)

@@ -223,7 +223,6 @@ static void SetICUCalendar(ClientContext &context, SetScope scope, Value &parame

void IcuExtension::Load(DuckDB &ddb) {
auto &db = *ddb.instance;
- auto &catalog = Catalog::GetSystemCatalog(db);

// iterate over all the collations
int32_t count;
src/duckdb/extension/json/json_functions.cpp (11 changes: 1 addition & 10 deletions)

@@ -189,16 +189,7 @@ vector<TableFunctionSet> JSONFunctions::GetTableFunctions() {

unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context, const string &table_name,
ReplacementScanData *data) {
- auto lower_name = StringUtil::Lower(table_name);
- // remove any compression
- if (StringUtil::EndsWith(lower_name, ".gz")) {
- lower_name = lower_name.substr(0, lower_name.size() - 3);
- } else if (StringUtil::EndsWith(lower_name, ".zst")) {
- lower_name = lower_name.substr(0, lower_name.size() - 4);
- }
- if (!StringUtil::EndsWith(lower_name, ".json") && !StringUtil::Contains(lower_name, ".json?") &&
- !StringUtil::EndsWith(lower_name, ".jsonl") && !StringUtil::Contains(lower_name, ".jsonl?") &&
- !StringUtil::EndsWith(lower_name, ".ndjson") && !StringUtil::Contains(lower_name, ".ndjson?")) {
+ if (!ReplacementScan::CanReplace(table_name, {"json", "jsonl", "ndjson"})) {
return nullptr;
}
auto table_function = make_uniq<TableFunctionRef>();
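
The ten removed lines above are exactly the logic that the new ReplacementScan::CanReplace call centralizes; the Parquet extension further down in this commit gets the same one-line treatment. A minimal sketch of what such a helper plausibly does, assuming the shared implementation mirrors the removed code (CanReplaceSketch is an illustrative name, not DuckDB's API):

#include <cctype>
#include <string>
#include <vector>

// Sketch: lower-case the table name, strip a trailing compression suffix,
// then accept the name if it ends in one of the extensions or contains
// ".<ext>?" (an extension followed by a URL query string).
static bool CanReplaceSketch(const std::string &table_name, const std::vector<std::string> &extensions) {
	std::string lower_name;
	for (unsigned char c : table_name) {
		lower_name += static_cast<char>(std::tolower(c));
	}
	auto ends_with = [](const std::string &s, const std::string &suffix) {
		return s.size() >= suffix.size() && s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
	};
	// remove any compression, as the removed JSON-specific code did
	if (ends_with(lower_name, ".gz")) {
		lower_name = lower_name.substr(0, lower_name.size() - 3);
	} else if (ends_with(lower_name, ".zst")) {
		lower_name = lower_name.substr(0, lower_name.size() - 4);
	}
	for (const auto &ext : extensions) {
		if (ends_with(lower_name, "." + ext) || lower_name.find("." + ext + "?") != std::string::npos) {
			return true;
		}
	}
	return false;
}

Under that reading, CanReplaceSketch("data.json.gz", {"json", "jsonl", "ndjson"}) returns true, just as the removed JSON-specific check would have.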
src/duckdb/extension/json/json_functions/json_create.cpp (41 changes: 27 additions & 14 deletions)

@@ -682,20 +682,33 @@ BoundCastInfo AnyToJSONCastBind(BindCastInput &input, const LogicalType &source,
}

void JSONFunctions::RegisterJSONCreateCastFunctions(CastFunctionSet &casts) {
- auto json_to_any_cost = casts.ImplicitCastCost(LogicalType::ANY, JSONCommon::JSONType());
- casts.RegisterCastFunction(LogicalType::ANY, JSONCommon::JSONType(), AnyToJSONCastBind, json_to_any_cost);
-
- const auto struct_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
- auto struct_to_json_cost = casts.ImplicitCastCost(struct_type, LogicalType::VARCHAR) - 2;
- casts.RegisterCastFunction(struct_type, JSONCommon::JSONType(), AnyToJSONCastBind, struct_to_json_cost);
-
- const auto list_type = LogicalType::LIST(LogicalType::ANY);
- auto list_to_json_cost = casts.ImplicitCastCost(list_type, LogicalType::VARCHAR) - 2;
- casts.RegisterCastFunction(list_type, JSONCommon::JSONType(), AnyToJSONCastBind, list_to_json_cost);
-
- const auto map_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
- auto map_to_json_cost = casts.ImplicitCastCost(map_type, LogicalType::VARCHAR) - 2;
- casts.RegisterCastFunction(map_type, JSONCommon::JSONType(), AnyToJSONCastBind, map_to_json_cost);
+ // Anything can be cast to JSON
+ for (const auto &type : LogicalType::AllTypes()) {
+ LogicalType source_type;
+ switch (type.id()) {
+ case LogicalTypeId::STRUCT:
+ source_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
+ break;
+ case LogicalTypeId::LIST:
+ source_type = LogicalType::LIST(LogicalType::ANY);
+ break;
+ case LogicalTypeId::MAP:
+ source_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
+ break;
+ case LogicalTypeId::UNION:
+ source_type = LogicalType::UNION({{"any", LogicalType::ANY}});
+ break;
+ case LogicalTypeId::VARCHAR:
+ // We skip this one here as it's handled in json_functions.cpp
+ continue;
+ default:
+ source_type = type;
+ }
+ // We prefer going to JSON over going to VARCHAR if a function can do either
+ const auto source_to_json_cost =
+ MaxValue<int64_t>(casts.ImplicitCastCost(source_type, LogicalType::VARCHAR) - 1, 0);
+ casts.RegisterCastFunction(source_type, JSONCommon::JSONType(), AnyToJSONCastBind, source_to_json_cost);
+ }
}

} // namespace duckdb
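
The MaxValue<int64_t>(... - 1, 0) above is what encodes the comment's preference: casting a type to JSON is made exactly one unit cheaper than casting it to VARCHAR, clamped so the cost never goes negative. A small illustration of the arithmetic (the cost numbers are invented for the example, not DuckDB's actual cost table):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Mirrors the registration loop above: the cost to JSON is one less than
// the cost to VARCHAR, clamped at zero.
int64_t CostToJSON(int64_t cost_to_varchar) {
	return std::max<int64_t>(cost_to_varchar - 1, 0);
}

int main() {
	// A function overloaded on VARCHAR and JSON binds the JSON overload,
	// because 148 < 149 (hypothetical costs).
	std::cout << CostToJSON(149) << "\n"; // prints 148
	std::cout << CostToJSON(0) << "\n";   // prints 0: the clamp at work
}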
src/duckdb/extension/json/json_functions/json_transform.cpp (40 changes: 26 additions & 14 deletions)

@@ -898,20 +898,32 @@ BoundCastInfo JSONToAnyCastBind(BindCastInput &input, const LogicalType &source,
}

void JSONFunctions::RegisterJSONTransformCastFunctions(CastFunctionSet &casts) {
- auto json_to_any_cost = casts.ImplicitCastCost(JSONCommon::JSONType(), LogicalType::ANY);
- casts.RegisterCastFunction(JSONCommon::JSONType(), LogicalType::ANY, JSONToAnyCastBind, json_to_any_cost);
-
- const auto struct_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
- auto json_to_struct_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, struct_type) - 2;
- casts.RegisterCastFunction(JSONCommon::JSONType(), struct_type, JSONToAnyCastBind, json_to_struct_cost);
-
- const auto list_type = LogicalType::LIST(LogicalType::ANY);
- auto json_to_list_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, list_type) - 2;
- casts.RegisterCastFunction(JSONCommon::JSONType(), list_type, JSONToAnyCastBind, json_to_list_cost);
-
- const auto map_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
- auto json_to_map_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, map_type) - 2;
- casts.RegisterCastFunction(JSONCommon::JSONType(), map_type, JSONToAnyCastBind, json_to_map_cost);
+ // JSON can be cast to anything
+ for (const auto &type : LogicalType::AllTypes()) {
+ LogicalType target_type;
+ switch (type.id()) {
+ case LogicalTypeId::STRUCT:
+ target_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
+ break;
+ case LogicalTypeId::LIST:
+ target_type = LogicalType::LIST(LogicalType::ANY);
+ break;
+ case LogicalTypeId::MAP:
+ target_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
+ break;
+ case LogicalTypeId::UNION:
+ target_type = LogicalType::UNION({{"any", LogicalType::ANY}});
+ break;
+ case LogicalTypeId::VARCHAR:
+ // We skip this one here as it's handled in json_functions.cpp
+ continue;
+ default:
+ target_type = type;
+ }
+ // Going from JSON to another type has the same cost as going from VARCHAR to that type
+ const auto json_to_target_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, target_type);
+ casts.RegisterCastFunction(JSONCommon::JSONType(), target_type, JSONToAnyCastBind, json_to_target_cost);
+ }
}

} // namespace duckdb
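
Seen next to the json_create.cpp loop above, the two registrations set up a deliberate asymmetry: JSON is slightly preferred as a cast target, while as a cast source it binds exactly like VARCHAR. VARCHAR itself is skipped in both loops because, per the comments, the VARCHAR-JSON casts are registered separately in json_functions.cpp. A compact restatement of the two cost rules (illustrative only, not DuckDB's API):

#include <algorithm>
#include <cstdint>

// T -> JSON (json_create.cpp): one cheaper than T -> VARCHAR, clamped at 0.
int64_t ToJsonCost(int64_t t_to_varchar_cost) {
	return std::max<int64_t>(t_to_varchar_cost - 1, 0);
}

// JSON -> T (json_transform.cpp): identical to VARCHAR -> T.
int64_t FromJsonCost(int64_t varchar_to_t_cost) {
	return varchar_to_t_cost;
}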
src/duckdb/extension/parquet/column_reader.cpp (27 changes: 26 additions & 1 deletion)

@@ -243,6 +243,7 @@ void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChun
void ColumnReader::PrepareRead(parquet_filter_t &filter) {
dict_decoder.reset();
defined_decoder.reset();
+ bss_decoder.reset();
block.reset();
PageHeader page_hdr;
page_hdr.read(protocol);
@@ -443,6 +444,13 @@ void ColumnReader::PrepareDataPage(PageHeader &page_hdr) {
PrepareDeltaByteArray(*block);
break;
}
+ case Encoding::BYTE_STREAM_SPLIT: {
+ // Subtract 1 from length as the block is allocated with 1 extra byte,
+ // but the byte stream split encoder needs to know the correct data size.
+ bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
+ block->inc(block->len);
+ break;
+ }
case Encoding::PLAIN:
// nothing to do here, will be read directly below
break;
@@ -488,7 +496,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr

idx_t null_count = 0;

- if ((dict_decoder || dbp_decoder || rle_decoder) && HasDefines()) {
+ if ((dict_decoder || dbp_decoder || rle_decoder || bss_decoder) && HasDefines()) {
// we need the null count because the dictionary offsets have no entries for nulls
for (idx_t i = 0; i < read_now; i++) {
if (define_out[i + result_offset] != max_define) {
@@ -534,6 +542,23 @@
} else if (byte_array_data) {
// DELTA_BYTE_ARRAY or DELTA_LENGTH_BYTE_ARRAY
DeltaByteArray(define_out, read_now, filter, result_offset, result);
+ } else if (bss_decoder) {
+ auto read_buf = make_shared<ResizeableBuffer>();
+
+ switch (schema.type) {
+ case duckdb_parquet::format::Type::FLOAT:
+ read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count));
+ bss_decoder->GetBatch<float>(read_buf->ptr, read_now - null_count);
+ break;
+ case duckdb_parquet::format::Type::DOUBLE:
+ read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count));
+ bss_decoder->GetBatch<double>(read_buf->ptr, read_now - null_count);
+ break;
+ default:
+ throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
+ }
+
+ Plain(read_buf, define_out, read_now, filter, result_offset, result);
} else {
PlainReference(block, result);
Plain(block, define_out, read_now, filter, result_offset, result);
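
The read_now - null_count sizing in the new branch matters: like the dictionary path, BYTE_STREAM_SPLIT encodes one value per non-null row only, which is why bss_decoder was also added to the null-counting condition in the previous hunk. After the decode, Plain scatters the packed values into the result using the define levels. A standalone sketch of that scatter step, with illustrative names rather than DuckDB's actual helpers:

#include <cstddef>
#include <cstdint>
#include <vector>

// Walk the define levels; rows at max_define are non-null and consume the
// next packed value, while null rows consume nothing (their validity is
// tracked elsewhere, e.g. in a separate validity mask).
void ScatterNonNull(const std::vector<float> &packed, const std::vector<uint8_t> &defines,
                    uint8_t max_define, std::vector<float> &out) {
	std::size_t packed_idx = 0;
	out.assign(defines.size(), 0.0f);
	for (std::size_t row = 0; row < defines.size(); row++) {
		if (defines[row] == max_define) {
			out[row] = packed[packed_idx++];
		}
	}
}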
src/duckdb/extension/parquet/column_writer.cpp (11 changes: 10 additions & 1 deletion)

@@ -796,6 +796,13 @@ struct ParquetTimestampSOperator : public BaseParquetOperator {
}
};

+ struct ParquetTimeTZOperator : public BaseParquetOperator {
+ template <class SRC, class TGT>
+ static TGT Operation(SRC input) {
+ return input.time().micros;
+ }
+ };
+
struct ParquetHugeintOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
@@ -1975,12 +1982,14 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parqu
max_define, can_have_nulls);
case LogicalTypeId::BIGINT:
case LogicalTypeId::TIME:
- case LogicalTypeId::TIME_TZ:
case LogicalTypeId::TIMESTAMP:
case LogicalTypeId::TIMESTAMP_TZ:
case LogicalTypeId::TIMESTAMP_MS:
return make_uniq<StandardColumnWriter<int64_t, int64_t>>(writer, schema_idx, std::move(schema_path), max_repeat,
max_define, can_have_nulls);
+ case LogicalTypeId::TIME_TZ:
+ return make_uniq<StandardColumnWriter<dtime_tz_t, int64_t, ParquetTimeTZOperator>>(
+ writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls);
case LogicalTypeId::HUGEINT:
return make_uniq<StandardColumnWriter<hugeint_t, double, ParquetHugeintOperator>>(
writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls);
src/duckdb/extension/parquet/include/column_reader.hpp (2 changes: 2 additions & 0 deletions)

@@ -9,6 +9,7 @@
#pragma once

#include "duckdb.hpp"
#include "parquet_bss_decoder.hpp"
#include "parquet_dbp_decoder.hpp"
#include "parquet_rle_bp_decoder.hpp"
#include "parquet_statistics.hpp"
@@ -161,6 +162,7 @@ class ColumnReader {
unique_ptr<RleBpDecoder> repeated_decoder;
unique_ptr<DbpDecoder> dbp_decoder;
unique_ptr<RleBpDecoder> rle_decoder;
+ unique_ptr<BssDecoder> bss_decoder;

// dummies for Skip()
parquet_filter_t none_filter;
src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp (49 changes: 49 additions & 0 deletions)

@@ -0,0 +1,49 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_bss_decoder.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once
#include "parquet_types.h"
#include "resizable_buffer.hpp"

namespace duckdb {

/// Decoder for the Byte Stream Split encoding
class BssDecoder {
public:
/// Create a decoder object. buffer/buffer_len is the encoded data.
BssDecoder(data_ptr_t buffer, uint32_t buffer_len) : buffer_(buffer, buffer_len), value_offset_(0) {
}

public:
template <typename T>
void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) {
if (buffer_.len % sizeof(T) != 0) {
std::stringstream error;
error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
<< ") should be a multiple of the type size (" << sizeof(T) << ")";
throw std::runtime_error(error.str());
}
uint32_t num_buffer_values = buffer_.len / sizeof(T);

buffer_.available((value_offset_ + batch_size) * sizeof(T));

for (uint32_t byte_offset = 0; byte_offset < sizeof(T); ++byte_offset) {
data_ptr_t input_bytes = buffer_.ptr + byte_offset * num_buffer_values + value_offset_;
for (uint32_t i = 0; i < batch_size; ++i) {
values_target_ptr[byte_offset + i * sizeof(T)] = *(input_bytes + i);
}
}
value_offset_ += batch_size;
}

private:
ByteBuffer buffer_;
uint32_t value_offset_;
};

} // namespace duckdb
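
Since the hunk header above marks a brand-new file, every line shown is an addition. The encoding itself is simple to state: for N values of size S, byte j of value i is stored at position j * N + i, so all first bytes come first, then all second bytes, and so on, which tends to compress much better for floating-point data. A self-contained round trip of that transposition (the decode loop mirrors GetBatch above, but without DuckDB's buffer types):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Encode: byte j of value i lands at position j * num_values + i.
static std::vector<uint8_t> BssEncode(const std::vector<float> &values) {
	const std::size_t n = values.size();
	std::vector<uint8_t> out(n * sizeof(float));
	for (std::size_t i = 0; i < n; i++) {
		uint8_t bytes[sizeof(float)];
		std::memcpy(bytes, &values[i], sizeof(float));
		for (std::size_t j = 0; j < sizeof(float); j++) {
			out[j * n + i] = bytes[j];
		}
	}
	return out;
}

// Decode: gather value i's bytes back out of the per-byte streams.
static std::vector<float> BssDecode(const std::vector<uint8_t> &buf) {
	const std::size_t n = buf.size() / sizeof(float);
	std::vector<float> values(n);
	for (std::size_t i = 0; i < n; i++) {
		uint8_t bytes[sizeof(float)];
		for (std::size_t j = 0; j < sizeof(float); j++) {
			bytes[j] = buf[j * n + i];
		}
		std::memcpy(&values[i], bytes, sizeof(float));
	}
	return values;
}

int main() {
	const std::vector<float> input = {1.5f, -2.25f, 3.0f};
	assert(BssDecode(BssEncode(input)) == input); // lossless round trip
}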
src/duckdb/extension/parquet/parquet_extension.cpp (7 changes: 3 additions & 4 deletions)

@@ -20,6 +20,8 @@
#include "duckdb/common/enums/file_compression_type.hpp"
#include "duckdb/common/file_system.hpp"
#include "duckdb/common/multi_file_reader.hpp"
#include "duckdb/common/serializer/deserializer.hpp"
#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/common/types/chunk_collection.hpp"
#include "duckdb/function/copy_function.hpp"
#include "duckdb/function/table_function.hpp"
@@ -34,8 +36,6 @@
#include "duckdb/planner/operator/logical_get.hpp"
#include "duckdb/storage/statistics/base_statistics.hpp"
#include "duckdb/storage/table/row_group.hpp"
#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/common/serializer/deserializer.hpp"
#endif

namespace duckdb {
@@ -983,8 +983,7 @@ idx_t ParquetWriteDesiredBatchSize(ClientContext &context, FunctionData &bind_da
//===--------------------------------------------------------------------===//
unique_ptr<TableRef> ParquetScanReplacement(ClientContext &context, const string &table_name,
ReplacementScanData *data) {
- auto lower_name = StringUtil::Lower(table_name);
- if (!StringUtil::EndsWith(lower_name, ".parquet") && !StringUtil::Contains(lower_name, ".parquet?")) {
+ if (!ReplacementScan::CanReplace(table_name, {"parquet"})) {
return nullptr;
}
auto table_function = make_uniq<TableFunctionRef>();
src/duckdb/extension/parquet/parquet_timestamp.cpp (7 changes: 3 additions & 4 deletions)

@@ -66,10 +66,9 @@ dtime_t ParquetIntToTimeNs(const int64_t &raw_time) {
return Time::FromTimeNs(raw_time);
}

- dtime_tz_t ParquetIntToTimeTZ(const int64_t &raw_time) {
- dtime_tz_t result;
- result.bits = raw_time;
- return result;
+ dtime_tz_t ParquetIntToTimeTZ(const int64_t &raw_micros) {
+ dtime_t t(raw_micros);
+ return dtime_tz_t(t, 0);
}

} // namespace duckdb
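
Together with the ParquetTimeTZOperator added in column_writer.cpp above, this gives TIME_TZ a portable, if lossy, round trip: the writer stores only the time-of-day micros (Parquet's TIME type has no per-value UTC offset to carry), and the reader reattaches a zero offset instead of reinterpreting the raw integer as DuckDB's packed bit layout, which is what the removed code did. A sketch of both directions with stand-in types (the 24-bit offset packing is an assumption for illustration, not DuckDB's verbatim layout):

#include <cassert>
#include <cstdint>

struct dtime_t {
	int64_t micros; // microseconds since midnight
};

struct dtime_tz_t {
	static constexpr int OFFSET_BITS = 24; // assumed packing, illustration only
	uint64_t bits;
	dtime_tz_t(dtime_t t, int32_t offset) : bits((uint64_t(t.micros) << OFFSET_BITS) | uint32_t(offset)) {
	}
	dtime_t time() const {
		return dtime_t{static_cast<int64_t>(bits >> OFFSET_BITS)};
	}
};

// Writer side (ParquetTimeTZOperator): store only the time of day.
int64_t WriteTimeTZ(dtime_tz_t input) {
	return input.time().micros;
}

// Reader side (ParquetIntToTimeTZ): treat the stored integer as micros
// and attach a zero UTC offset.
dtime_tz_t ReadTimeTZ(int64_t raw_micros) {
	return dtime_tz_t(dtime_t{raw_micros}, 0);
}

int main() {
	dtime_tz_t noon_plus_two(dtime_t{12 * 60 * 60 * 1000000LL}, 2 * 60 * 60);
	// The offset is dropped on write and comes back as zero on read;
	// the time of day itself survives.
	assert(ReadTimeTZ(WriteTimeTZ(noon_plus_two)).time().micros == noon_plus_two.time().micros);
}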
src/duckdb/src/common/arrow/appender/list_data.cpp (4 changes: 2 additions & 2 deletions)

@@ -69,10 +69,10 @@ void ArrowListData::Finalize(ArrowAppendData &append_data, const LogicalType &ty
result->buffers[1] = append_data.main_buffer.data();

auto &child_type = ListType::GetChildType(type);
- append_data.child_pointers.resize(1);
+ ArrowAppender::AddChildren(append_data, 1);
result->children = append_data.child_pointers.data();
result->n_children = 1;
- append_data.child_pointers[0] = ArrowAppender::FinalizeChild(child_type, *append_data.child_data[0]);
+ append_data.child_arrays[0] = *ArrowAppender::FinalizeChild(child_type, std::move(append_data.child_data[0]));
}

} // namespace duckdb