Commit 21405f2

Merge pull request #10 from hannes/main
Update to DuckDB 0.9.1
hannes authored Oct 11, 2023
2 parents: c6fc2fd + 1750ef0
Showing 85 changed files with 763 additions and 471 deletions.
src/duckdb/extension/icu/icu-makedate.cpp (2 changes: 1 addition & 1 deletion)

@@ -23,7 +23,7 @@ struct ICUMakeDate : public ICUDateFunc {
}

// Extract the time zone parts
- auto micros = SetTime(calendar, instant);
+ SetTime(calendar, instant);
const auto era = ExtractField(calendar, UCAL_ERA);
const auto year = ExtractField(calendar, UCAL_YEAR);
const auto mm = ExtractField(calendar, UCAL_MONTH) + 1;
src/duckdb/extension/icu/icu-strptime.cpp (2 changes: 0 additions & 2 deletions)

@@ -96,7 +96,6 @@ struct ICUStrptime : public ICUDateFunc {
auto &info = func_expr.bind_info->Cast<ICUStrptimeBindData>();
CalendarPtr calendar_ptr(info.calendar->clone());
auto calendar = calendar_ptr.get();
- auto &formats = info.formats;

D_ASSERT(fmt_arg.GetVectorType() == VectorType::CONSTANT_VECTOR);

@@ -126,7 +125,6 @@ struct ICUStrptime : public ICUDateFunc {
auto &info = func_expr.bind_info->Cast<ICUStrptimeBindData>();
CalendarPtr calendar_ptr(info.calendar->clone());
auto calendar = calendar_ptr.get();
- auto &formats = info.formats;

D_ASSERT(fmt_arg.GetVectorType() == VectorType::CONSTANT_VECTOR);

src/duckdb/extension/icu/icu_extension.cpp (1 change: 0 additions & 1 deletion)

@@ -223,7 +223,6 @@ static void SetICUCalendar(ClientContext &context, SetScope scope, Value &parame

void IcuExtension::Load(DuckDB &ddb) {
auto &db = *ddb.instance;
- auto &catalog = Catalog::GetSystemCatalog(db);

// iterate over all the collations
int32_t count;
src/duckdb/extension/json/json_functions.cpp (11 changes: 1 addition & 10 deletions)

@@ -189,16 +189,7 @@ vector<TableFunctionSet> JSONFunctions::GetTableFunctions() {

unique_ptr<TableRef> JSONFunctions::ReadJSONReplacement(ClientContext &context, const string &table_name,
ReplacementScanData *data) {
- auto lower_name = StringUtil::Lower(table_name);
- // remove any compression
- if (StringUtil::EndsWith(lower_name, ".gz")) {
- lower_name = lower_name.substr(0, lower_name.size() - 3);
- } else if (StringUtil::EndsWith(lower_name, ".zst")) {
- lower_name = lower_name.substr(0, lower_name.size() - 4);
- }
- if (!StringUtil::EndsWith(lower_name, ".json") && !StringUtil::Contains(lower_name, ".json?") &&
- !StringUtil::EndsWith(lower_name, ".jsonl") && !StringUtil::Contains(lower_name, ".jsonl?") &&
- !StringUtil::EndsWith(lower_name, ".ndjson") && !StringUtil::Contains(lower_name, ".ndjson?")) {
+ if (!ReplacementScan::CanReplace(table_name, {"json", "jsonl", "ndjson"})) {
return nullptr;
}
auto table_function = make_uniq<TableFunctionRef>();
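
The ten removed lines above are exactly the logic that the new ReplacementScan::CanReplace call centralizes; the Parquet extension further down in this commit gets the same one-line treatment. A minimal sketch of what such a helper plausibly does, assuming the shared implementation mirrors the removed code (CanReplaceSketch is an illustrative name, not DuckDB's API):

#include <cctype>
#include <string>
#include <vector>

// Sketch: lower-case the table name, strip a trailing compression suffix,
// then accept the name if it ends in one of the extensions or contains
// ".<ext>?" (an extension followed by a URL query string).
static bool CanReplaceSketch(const std::string &table_name, const std::vector<std::string> &extensions) {
	std::string lower_name;
	for (unsigned char c : table_name) {
		lower_name += static_cast<char>(std::tolower(c));
	}
	auto ends_with = [](const std::string &s, const std::string &suffix) {
		return s.size() >= suffix.size() && s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
	};
	// remove any compression, as the removed JSON-specific code did
	if (ends_with(lower_name, ".gz")) {
		lower_name = lower_name.substr(0, lower_name.size() - 3);
	} else if (ends_with(lower_name, ".zst")) {
		lower_name = lower_name.substr(0, lower_name.size() - 4);
	}
	for (const auto &ext : extensions) {
		if (ends_with(lower_name, "." + ext) || lower_name.find("." + ext + "?") != std::string::npos) {
			return true;
		}
	}
	return false;
}

Under that reading, CanReplaceSketch("data.json.gz", {"json", "jsonl", "ndjson"}) returns true, just as the removed JSON-specific check would have.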
src/duckdb/extension/json/json_functions/json_create.cpp (41 changes: 27 additions & 14 deletions)

@@ -682,20 +682,33 @@ BoundCastInfo AnyToJSONCastBind(BindCastInput &input, const LogicalType &source,
}

void JSONFunctions::RegisterJSONCreateCastFunctions(CastFunctionSet &casts) {
- auto json_to_any_cost = casts.ImplicitCastCost(LogicalType::ANY, JSONCommon::JSONType());
- casts.RegisterCastFunction(LogicalType::ANY, JSONCommon::JSONType(), AnyToJSONCastBind, json_to_any_cost);
-
- const auto struct_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
- auto struct_to_json_cost = casts.ImplicitCastCost(struct_type, LogicalType::VARCHAR) - 2;
- casts.RegisterCastFunction(struct_type, JSONCommon::JSONType(), AnyToJSONCastBind, struct_to_json_cost);
-
- const auto list_type = LogicalType::LIST(LogicalType::ANY);
- auto list_to_json_cost = casts.ImplicitCastCost(list_type, LogicalType::VARCHAR) - 2;
- casts.RegisterCastFunction(list_type, JSONCommon::JSONType(), AnyToJSONCastBind, list_to_json_cost);
-
- const auto map_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
- auto map_to_json_cost = casts.ImplicitCastCost(map_type, LogicalType::VARCHAR) - 2;
- casts.RegisterCastFunction(map_type, JSONCommon::JSONType(), AnyToJSONCastBind, map_to_json_cost);
+ // Anything can be cast to JSON
+ for (const auto &type : LogicalType::AllTypes()) {
+ LogicalType source_type;
+ switch (type.id()) {
+ case LogicalTypeId::STRUCT:
+ source_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
+ break;
+ case LogicalTypeId::LIST:
+ source_type = LogicalType::LIST(LogicalType::ANY);
+ break;
+ case LogicalTypeId::MAP:
+ source_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
+ break;
+ case LogicalTypeId::UNION:
+ source_type = LogicalType::UNION({{"any", LogicalType::ANY}});
+ break;
+ case LogicalTypeId::VARCHAR:
+ // We skip this one here as it's handled in json_functions.cpp
+ continue;
+ default:
+ source_type = type;
+ }
+ // We prefer going to JSON over going to VARCHAR if a function can do either
+ const auto source_to_json_cost =
+ MaxValue<int64_t>(casts.ImplicitCastCost(source_type, LogicalType::VARCHAR) - 1, 0);
+ casts.RegisterCastFunction(source_type, JSONCommon::JSONType(), AnyToJSONCastBind, source_to_json_cost);
+ }
}

} // namespace duckdb
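
The MaxValue<int64_t>(... - 1, 0) above is what encodes the comment's preference: casting a type to JSON is made exactly one unit cheaper than casting it to VARCHAR, clamped so the cost never goes negative. A small illustration of the arithmetic (the cost numbers are invented for the example, not DuckDB's actual cost table):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Mirrors the registration loop above: the cost to JSON is one less than
// the cost to VARCHAR, clamped at zero.
int64_t CostToJSON(int64_t cost_to_varchar) {
	return std::max<int64_t>(cost_to_varchar - 1, 0);
}

int main() {
	// A function overloaded on VARCHAR and JSON binds the JSON overload,
	// because 148 < 149 (hypothetical costs).
	std::cout << CostToJSON(149) << "\n"; // prints 148
	std::cout << CostToJSON(0) << "\n";   // prints 0: the clamp at work
}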
src/duckdb/extension/json/json_functions/json_transform.cpp (40 changes: 26 additions & 14 deletions)

@@ -898,20 +898,32 @@ BoundCastInfo JSONToAnyCastBind(BindCastInput &input, const LogicalType &source,
}

void JSONFunctions::RegisterJSONTransformCastFunctions(CastFunctionSet &casts) {
- auto json_to_any_cost = casts.ImplicitCastCost(JSONCommon::JSONType(), LogicalType::ANY);
- casts.RegisterCastFunction(JSONCommon::JSONType(), LogicalType::ANY, JSONToAnyCastBind, json_to_any_cost);
-
- const auto struct_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
- auto json_to_struct_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, struct_type) - 2;
- casts.RegisterCastFunction(JSONCommon::JSONType(), struct_type, JSONToAnyCastBind, json_to_struct_cost);
-
- const auto list_type = LogicalType::LIST(LogicalType::ANY);
- auto json_to_list_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, list_type) - 2;
- casts.RegisterCastFunction(JSONCommon::JSONType(), list_type, JSONToAnyCastBind, json_to_list_cost);
-
- const auto map_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
- auto json_to_map_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, map_type) - 2;
- casts.RegisterCastFunction(JSONCommon::JSONType(), map_type, JSONToAnyCastBind, json_to_map_cost);
+ // JSON can be cast to anything
+ for (const auto &type : LogicalType::AllTypes()) {
+ LogicalType target_type;
+ switch (type.id()) {
+ case LogicalTypeId::STRUCT:
+ target_type = LogicalType::STRUCT({{"any", LogicalType::ANY}});
+ break;
+ case LogicalTypeId::LIST:
+ target_type = LogicalType::LIST(LogicalType::ANY);
+ break;
+ case LogicalTypeId::MAP:
+ target_type = LogicalType::MAP(LogicalType::ANY, LogicalType::ANY);
+ break;
+ case LogicalTypeId::UNION:
+ target_type = LogicalType::UNION({{"any", LogicalType::ANY}});
+ break;
+ case LogicalTypeId::VARCHAR:
+ // We skip this one here as it's handled in json_functions.cpp
+ continue;
+ default:
+ target_type = type;
+ }
+ // Going from JSON to another type has the same cost as going from VARCHAR to that type
+ const auto json_to_target_cost = casts.ImplicitCastCost(LogicalType::VARCHAR, target_type);
+ casts.RegisterCastFunction(JSONCommon::JSONType(), target_type, JSONToAnyCastBind, json_to_target_cost);
+ }
}

} // namespace duckdb
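
Seen next to the json_create.cpp loop above, the two registrations set up a deliberate asymmetry: JSON is slightly preferred as a cast target, while as a cast source it binds exactly like VARCHAR. VARCHAR itself is skipped in both loops because, per the comments, the VARCHAR-JSON casts are registered separately in json_functions.cpp. A compact restatement of the two cost rules (illustrative only, not DuckDB's API):

#include <algorithm>
#include <cstdint>

// T -> JSON (json_create.cpp): one cheaper than T -> VARCHAR, clamped at 0.
int64_t ToJsonCost(int64_t t_to_varchar_cost) {
	return std::max<int64_t>(t_to_varchar_cost - 1, 0);
}

// JSON -> T (json_transform.cpp): identical to VARCHAR -> T.
int64_t FromJsonCost(int64_t varchar_to_t_cost) {
	return varchar_to_t_cost;
}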
src/duckdb/extension/parquet/column_reader.cpp (27 changes: 26 additions & 1 deletion)

@@ -243,6 +243,7 @@ void ColumnReader::InitializeRead(idx_t row_group_idx_p, const vector<ColumnChun
void ColumnReader::PrepareRead(parquet_filter_t &filter) {
dict_decoder.reset();
defined_decoder.reset();
+ bss_decoder.reset();
block.reset();
PageHeader page_hdr;
page_hdr.read(protocol);
@@ -443,6 +444,13 @@ void ColumnReader::PrepareDataPage(PageHeader &page_hdr) {
PrepareDeltaByteArray(*block);
break;
}
+ case Encoding::BYTE_STREAM_SPLIT: {
+ // Subtract 1 from length as the block is allocated with 1 extra byte,
+ // but the byte stream split encoder needs to know the correct data size.
+ bss_decoder = make_uniq<BssDecoder>(block->ptr, block->len - 1);
+ block->inc(block->len);
+ break;
+ }
case Encoding::PLAIN:
// nothing to do here, will be read directly below
break;
@@ -488,7 +496,7 @@ idx_t ColumnReader::Read(uint64_t num_values, parquet_filter_t &filter, data_ptr

idx_t null_count = 0;

- if ((dict_decoder || dbp_decoder || rle_decoder) && HasDefines()) {
+ if ((dict_decoder || dbp_decoder || rle_decoder || bss_decoder) && HasDefines()) {
// we need the null count because the dictionary offsets have no entries for nulls
for (idx_t i = 0; i < read_now; i++) {
if (define_out[i + result_offset] != max_define) {
@@ -534,6 +542,23 @@
} else if (byte_array_data) {
// DELTA_BYTE_ARRAY or DELTA_LENGTH_BYTE_ARRAY
DeltaByteArray(define_out, read_now, filter, result_offset, result);
+ } else if (bss_decoder) {
+ auto read_buf = make_shared<ResizeableBuffer>();
+
+ switch (schema.type) {
+ case duckdb_parquet::format::Type::FLOAT:
+ read_buf->resize(reader.allocator, sizeof(float) * (read_now - null_count));
+ bss_decoder->GetBatch<float>(read_buf->ptr, read_now - null_count);
+ break;
+ case duckdb_parquet::format::Type::DOUBLE:
+ read_buf->resize(reader.allocator, sizeof(double) * (read_now - null_count));
+ bss_decoder->GetBatch<double>(read_buf->ptr, read_now - null_count);
+ break;
+ default:
+ throw std::runtime_error("BYTE_STREAM_SPLIT encoding is only supported for FLOAT or DOUBLE data");
+ }
+
+ Plain(read_buf, define_out, read_now, filter, result_offset, result);
} else {
PlainReference(block, result);
Plain(block, define_out, read_now, filter, result_offset, result);
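
The read_now - null_count sizing in the new branch matters: like the dictionary path, BYTE_STREAM_SPLIT encodes one value per non-null row only, which is why bss_decoder was also added to the null-counting condition in the previous hunk. After the decode, Plain scatters the packed values into the result using the define levels. A standalone sketch of that scatter step, with illustrative names rather than DuckDB's actual helpers:

#include <cstddef>
#include <cstdint>
#include <vector>

// Walk the define levels; rows at max_define are non-null and consume the
// next packed value, while null rows consume nothing (their validity is
// tracked elsewhere, e.g. in a separate validity mask).
void ScatterNonNull(const std::vector<float> &packed, const std::vector<uint8_t> &defines,
                    uint8_t max_define, std::vector<float> &out) {
	std::size_t packed_idx = 0;
	out.assign(defines.size(), 0.0f);
	for (std::size_t row = 0; row < defines.size(); row++) {
		if (defines[row] == max_define) {
			out[row] = packed[packed_idx++];
		}
	}
}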
src/duckdb/extension/parquet/column_writer.cpp (11 changes: 10 additions & 1 deletion)

@@ -796,6 +796,13 @@ struct ParquetTimestampSOperator : public BaseParquetOperator {
}
};

+ struct ParquetTimeTZOperator : public BaseParquetOperator {
+ template <class SRC, class TGT>
+ static TGT Operation(SRC input) {
+ return input.time().micros;
+ }
+ };
+
struct ParquetHugeintOperator {
template <class SRC, class TGT>
static TGT Operation(SRC input) {
@@ -1975,12 +1982,14 @@ unique_ptr<ColumnWriter> ColumnWriter::CreateWriterRecursive(vector<duckdb_parqu
max_define, can_have_nulls);
case LogicalTypeId::BIGINT:
case LogicalTypeId::TIME:
- case LogicalTypeId::TIME_TZ:
case LogicalTypeId::TIMESTAMP:
case LogicalTypeId::TIMESTAMP_TZ:
case LogicalTypeId::TIMESTAMP_MS:
return make_uniq<StandardColumnWriter<int64_t, int64_t>>(writer, schema_idx, std::move(schema_path), max_repeat,
max_define, can_have_nulls);
+ case LogicalTypeId::TIME_TZ:
+ return make_uniq<StandardColumnWriter<dtime_tz_t, int64_t, ParquetTimeTZOperator>>(
+ writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls);
case LogicalTypeId::HUGEINT:
return make_uniq<StandardColumnWriter<hugeint_t, double, ParquetHugeintOperator>>(
writer, schema_idx, std::move(schema_path), max_repeat, max_define, can_have_nulls);
src/duckdb/extension/parquet/include/column_reader.hpp (2 changes: 2 additions & 0 deletions)

@@ -9,6 +9,7 @@
#pragma once

#include "duckdb.hpp"
#include "parquet_bss_decoder.hpp"
#include "parquet_dbp_decoder.hpp"
#include "parquet_rle_bp_decoder.hpp"
#include "parquet_statistics.hpp"
@@ -161,6 +162,7 @@ class ColumnReader {
unique_ptr<RleBpDecoder> repeated_decoder;
unique_ptr<DbpDecoder> dbp_decoder;
unique_ptr<RleBpDecoder> rle_decoder;
+ unique_ptr<BssDecoder> bss_decoder;

// dummies for Skip()
parquet_filter_t none_filter;
src/duckdb/extension/parquet/include/parquet_bss_decoder.hpp (49 changes: 49 additions & 0 deletions)

@@ -0,0 +1,49 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// parquet_bss_decoder.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once
#include "parquet_types.h"
#include "resizable_buffer.hpp"

namespace duckdb {

/// Decoder for the Byte Stream Split encoding
class BssDecoder {
public:
/// Create a decoder object. buffer/buffer_len is the encoded data.
BssDecoder(data_ptr_t buffer, uint32_t buffer_len) : buffer_(buffer, buffer_len), value_offset_(0) {
}

public:
template <typename T>
void GetBatch(data_ptr_t values_target_ptr, uint32_t batch_size) {
if (buffer_.len % sizeof(T) != 0) {
std::stringstream error;
error << "Data buffer size for the BYTE_STREAM_SPLIT encoding (" << buffer_.len
<< ") should be a multiple of the type size (" << sizeof(T) << ")";
throw std::runtime_error(error.str());
}
uint32_t num_buffer_values = buffer_.len / sizeof(T);

buffer_.available((value_offset_ + batch_size) * sizeof(T));

for (uint32_t byte_offset = 0; byte_offset < sizeof(T); ++byte_offset) {
data_ptr_t input_bytes = buffer_.ptr + byte_offset * num_buffer_values + value_offset_;
for (uint32_t i = 0; i < batch_size; ++i) {
values_target_ptr[byte_offset + i * sizeof(T)] = *(input_bytes + i);
}
}
value_offset_ += batch_size;
}

private:
ByteBuffer buffer_;
uint32_t value_offset_;
};

} // namespace duckdb
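
Since the hunk header above marks a brand-new file, every line shown is an addition. The encoding itself is simple to state: for N values of size S, byte j of value i is stored at position j * N + i, so all first bytes come first, then all second bytes, and so on, which tends to compress much better for floating-point data. A self-contained round trip of that transposition (the decode loop mirrors GetBatch above, but without DuckDB's buffer types):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Encode: byte j of value i lands at position j * num_values + i.
static std::vector<uint8_t> BssEncode(const std::vector<float> &values) {
	const std::size_t n = values.size();
	std::vector<uint8_t> out(n * sizeof(float));
	for (std::size_t i = 0; i < n; i++) {
		uint8_t bytes[sizeof(float)];
		std::memcpy(bytes, &values[i], sizeof(float));
		for (std::size_t j = 0; j < sizeof(float); j++) {
			out[j * n + i] = bytes[j];
		}
	}
	return out;
}

// Decode: gather value i's bytes back out of the per-byte streams.
static std::vector<float> BssDecode(const std::vector<uint8_t> &buf) {
	const std::size_t n = buf.size() / sizeof(float);
	std::vector<float> values(n);
	for (std::size_t i = 0; i < n; i++) {
		uint8_t bytes[sizeof(float)];
		for (std::size_t j = 0; j < sizeof(float); j++) {
			bytes[j] = buf[j * n + i];
		}
		std::memcpy(&values[i], bytes, sizeof(float));
	}
	return values;
}

int main() {
	const std::vector<float> input = {1.5f, -2.25f, 3.0f};
	assert(BssDecode(BssEncode(input)) == input); // lossless round trip
}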
src/duckdb/extension/parquet/parquet_extension.cpp (7 changes: 3 additions & 4 deletions)

@@ -20,6 +20,8 @@
#include "duckdb/common/enums/file_compression_type.hpp"
#include "duckdb/common/file_system.hpp"
#include "duckdb/common/multi_file_reader.hpp"
#include "duckdb/common/serializer/deserializer.hpp"
#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/common/types/chunk_collection.hpp"
#include "duckdb/function/copy_function.hpp"
#include "duckdb/function/table_function.hpp"
@@ -34,8 +36,6 @@
#include "duckdb/planner/operator/logical_get.hpp"
#include "duckdb/storage/statistics/base_statistics.hpp"
#include "duckdb/storage/table/row_group.hpp"
#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/common/serializer/deserializer.hpp"
#endif

namespace duckdb {
@@ -983,8 +983,7 @@ idx_t ParquetWriteDesiredBatchSize(ClientContext &context, FunctionData &bind_da
//===--------------------------------------------------------------------===//
unique_ptr<TableRef> ParquetScanReplacement(ClientContext &context, const string &table_name,
ReplacementScanData *data) {
- auto lower_name = StringUtil::Lower(table_name);
- if (!StringUtil::EndsWith(lower_name, ".parquet") && !StringUtil::Contains(lower_name, ".parquet?")) {
+ if (!ReplacementScan::CanReplace(table_name, {"parquet"})) {
return nullptr;
}
auto table_function = make_uniq<TableFunctionRef>();
src/duckdb/extension/parquet/parquet_timestamp.cpp (7 changes: 3 additions & 4 deletions)

@@ -66,10 +66,9 @@ dtime_t ParquetIntToTimeNs(const int64_t &raw_time) {
return Time::FromTimeNs(raw_time);
}

- dtime_tz_t ParquetIntToTimeTZ(const int64_t &raw_time) {
- dtime_tz_t result;
- result.bits = raw_time;
- return result;
+ dtime_tz_t ParquetIntToTimeTZ(const int64_t &raw_micros) {
+ dtime_t t(raw_micros);
+ return dtime_tz_t(t, 0);
}

} // namespace duckdb
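
Together with the ParquetTimeTZOperator added in column_writer.cpp above, this gives TIME_TZ a portable, if lossy, round trip: the writer stores only the time-of-day micros (Parquet's TIME type has no per-value UTC offset to carry), and the reader reattaches a zero offset instead of reinterpreting the raw integer as DuckDB's packed bit layout, which is what the removed code did. A sketch of both directions with stand-in types (the 24-bit offset packing is an assumption for illustration, not DuckDB's verbatim layout):

#include <cassert>
#include <cstdint>

struct dtime_t {
	int64_t micros; // microseconds since midnight
};

struct dtime_tz_t {
	static constexpr int OFFSET_BITS = 24; // assumed packing, illustration only
	uint64_t bits;
	dtime_tz_t(dtime_t t, int32_t offset) : bits((uint64_t(t.micros) << OFFSET_BITS) | uint32_t(offset)) {
	}
	dtime_t time() const {
		return dtime_t{static_cast<int64_t>(bits >> OFFSET_BITS)};
	}
};

// Writer side (ParquetTimeTZOperator): store only the time of day.
int64_t WriteTimeTZ(dtime_tz_t input) {
	return input.time().micros;
}

// Reader side (ParquetIntToTimeTZ): treat the stored integer as micros
// and attach a zero UTC offset.
dtime_tz_t ReadTimeTZ(int64_t raw_micros) {
	return dtime_tz_t(dtime_t{raw_micros}, 0);
}

int main() {
	dtime_tz_t noon_plus_two(dtime_t{12 * 60 * 60 * 1000000LL}, 2 * 60 * 60);
	// The offset is dropped on write and comes back as zero on read;
	// the time of day itself survives.
	assert(ReadTimeTZ(WriteTimeTZ(noon_plus_two)).time().micros == noon_plus_two.time().micros);
}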
src/duckdb/src/common/arrow/appender/list_data.cpp (4 changes: 2 additions & 2 deletions)

@@ -69,10 +69,10 @@ void ArrowListData::Finalize(ArrowAppendData &append_data, const LogicalType &ty
result->buffers[1] = append_data.main_buffer.data();

auto &child_type = ListType::GetChildType(type);
- append_data.child_pointers.resize(1);
+ ArrowAppender::AddChildren(append_data, 1);
result->children = append_data.child_pointers.data();
result->n_children = 1;
- append_data.child_pointers[0] = ArrowAppender::FinalizeChild(child_type, *append_data.child_data[0]);
+ append_data.child_arrays[0] = *ArrowAppender::FinalizeChild(child_type, std::move(append_data.child_data[0]));
}

} // namespace duckdb