Update vendored DuckDB sources to 6024b25
1 parent 6024b25
commit 6561074
Showing 162 changed files with 2,947 additions and 1,064 deletions.
142 changes: 142 additions & 0 deletions
src/duckdb/src/common/arrow/arrow_merge_event.cpp
@@ -0,0 +1,142 @@
#include "duckdb/common/arrow/arrow_merge_event.hpp"
#include "duckdb/storage/storage_info.hpp"

namespace duckdb {

//===--------------------------------------------------------------------===//
// Arrow Batch Task
//===--------------------------------------------------------------------===//

ArrowBatchTask::ArrowBatchTask(ArrowQueryResult &result, vector<idx_t> record_batch_indices, Executor &executor,
                               shared_ptr<Event> event_p, BatchCollectionChunkScanState scan_state,
                               vector<string> names, idx_t batch_size)
    : ExecutorTask(executor, event_p), result(result), record_batch_indices(std::move(record_batch_indices)),
      event(std::move(event_p)), batch_size(batch_size), names(std::move(names)), scan_state(std::move(scan_state)) {
}

void ArrowBatchTask::ProduceRecordBatches() {
	auto &arrays = result.Arrays();
	auto arrow_options = executor.context.GetClientProperties();
	for (auto &index : record_batch_indices) {
		auto &array = arrays[index];
		D_ASSERT(array);
		// Fetch up to batch_size tuples from the scan state into this task's Arrow array
		idx_t count = ArrowUtil::FetchChunk(scan_state, arrow_options, batch_size, &array->arrow_array);
		(void)count; // only used by the assertion below
		D_ASSERT(count != 0);
	}
}

TaskExecutionResult ArrowBatchTask::ExecuteTask(TaskExecutionMode mode) {
	ProduceRecordBatches();
	event->FinishTask();
	return TaskExecutionResult::TASK_FINISHED;
}

//===--------------------------------------------------------------------===//
// Arrow Merge Event
//===--------------------------------------------------------------------===//

ArrowMergeEvent::ArrowMergeEvent(ArrowQueryResult &result, BatchedDataCollection &batches, Pipeline &pipeline_p)
    : BasePipelineEvent(pipeline_p), result(result), batches(batches) {
	record_batch_size = result.BatchSize();
}

namespace {

struct BatchesForTask {
	idx_t tuple_count;
	BatchedChunkIteratorRange batches;
};

struct BatchesToTaskTransformer {
public:
	explicit BatchesToTaskTransformer(BatchedDataCollection &batches) : batches(batches), batch_index(0) {
		batch_count = batches.BatchCount();
	}
	idx_t GetIndex() const {
		return batch_index;
	}
	bool TryGetNextBatchSize(idx_t &tuple_count) {
		if (batch_index >= batch_count) {
			return false;
		}
		auto internal_index = batches.IndexToBatchIndex(batch_index++);
		auto tuples_in_batch = batches.BatchSize(internal_index);
		tuple_count = tuples_in_batch;
		return true;
	}

public:
	BatchedDataCollection &batches;
	idx_t batch_index;
	idx_t batch_count;
};

} // namespace

void ArrowMergeEvent::Schedule() {
	vector<shared_ptr<Task>> tasks;

	BatchesToTaskTransformer transformer(batches);
	vector<BatchesForTask> task_data;
	bool finished = false;
	// First we convert our list of batches into units of Storage::ROW_GROUP_SIZE tuples each
	while (!finished) {
		idx_t tuples_for_task = 0;
		idx_t start_index = transformer.GetIndex();
		idx_t end_index = start_index;
		while (tuples_for_task < Storage::ROW_GROUP_SIZE) {
			idx_t batch_size;
			if (!transformer.TryGetNextBatchSize(batch_size)) {
				finished = true;
				break;
			}
			end_index++;
			tuples_for_task += batch_size;
		}
		if (start_index == end_index) {
			break;
		}
		BatchesForTask batches_for_task;
		batches_for_task.tuple_count = tuples_for_task;
		batches_for_task.batches = batches.BatchRange(start_index, end_index);
		task_data.push_back(batches_for_task);
	}

	// Now we produce tasks from these units
	// Every task is given a scan_state created from the range of batches
	// and a vector of indices indicating the arrays (record batches) they should populate
	idx_t record_batch_index = 0;
	for (auto &data : task_data) {
		const auto tuples = data.tuple_count;

		auto full_batches = tuples / record_batch_size;
		auto remainder = tuples % record_batch_size;
		// Round up: a partial remainder still needs a record batch of its own
		auto total_batches = full_batches + !!remainder;

		vector<idx_t> record_batch_indices(total_batches);
		for (idx_t i = 0; i < total_batches; i++) {
			record_batch_indices[i] = record_batch_index++;
		}

		BatchCollectionChunkScanState scan_state(batches, data.batches, pipeline->executor.context);
		tasks.push_back(make_uniq<ArrowBatchTask>(result, std::move(record_batch_indices), pipeline->executor,
		                                          shared_from_this(), std::move(scan_state), result.names,
		                                          record_batch_size));
	}

	// Allocate the list of record batches inside the query result
	{
		vector<unique_ptr<ArrowArrayWrapper>> arrays;
		arrays.resize(record_batch_index);
		for (idx_t i = 0; i < record_batch_index; i++) {
			arrays[i] = make_uniq<ArrowArrayWrapper>();
		}
		result.SetArrowData(std::move(arrays));
	}
	D_ASSERT(!tasks.empty());
	SetTasks(std::move(tasks));
}

} // namespace duckdb
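The grouping arithmetic in Schedule() is worth spelling out: batches are accumulated greedily until a task holds at least Storage::ROW_GROUP_SIZE tuples, and each task's tuple count is then split into record batches by ceiling division (full_batches + !!remainder). Below is a minimal standalone sketch of that logic, not DuckDB code: row_group_size, record_batch_size, and batch_sizes are hypothetical stand-ins (DuckDB's row group size is commonly 122880, and the record batch size comes from ArrowQueryResult::BatchSize()).

// batching_sketch.cpp: simplified model of ArrowMergeEvent::Schedule's grouping.
// All sizes are hypothetical stand-ins, not DuckDB constants.
#include <cstdint>
#include <cstdio>
#include <vector>

using idx_t = uint64_t;

int main() {
	const idx_t row_group_size = 122880;  // stand-in for Storage::ROW_GROUP_SIZE
	const idx_t record_batch_size = 2048; // stand-in for ArrowQueryResult::BatchSize()
	std::vector<idx_t> batch_sizes = {100000, 50000, 200000, 30000}; // hypothetical batches

	idx_t batch_index = 0;
	while (batch_index < batch_sizes.size()) {
		// Greedily accumulate batches until the task holds >= row_group_size tuples
		idx_t tuples_for_task = 0;
		idx_t start_index = batch_index;
		while (tuples_for_task < row_group_size && batch_index < batch_sizes.size()) {
			tuples_for_task += batch_sizes[batch_index++];
		}
		// Ceiling division: a partial remainder gets one extra record batch
		idx_t full_batches = tuples_for_task / record_batch_size;
		idx_t remainder = tuples_for_task % record_batch_size;
		idx_t total_batches = full_batches + (remainder != 0 ? 1 : 0);
		std::printf("task over batches [%llu, %llu): %llu tuples -> %llu record batches\n",
		            (unsigned long long)start_index, (unsigned long long)batch_index,
		            (unsigned long long)tuples_for_task, (unsigned long long)total_batches);
	}
	return 0;
}

With these numbers the first task covers batches [0, 2) with 150,000 tuples and produces 74 record batches (73 full plus one remainder), which mirrors how Schedule() pre-assigns array indices before any task runs.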
37 changes: 37 additions & 0 deletions
src/duckdb/src/common/arrow/physical_arrow_batch_collector.cpp
@@ -0,0 +1,37 @@
#include "duckdb/common/arrow/physical_arrow_batch_collector.hpp"
#include "duckdb/common/types/batched_data_collection.hpp"
#include "duckdb/common/arrow/arrow_query_result.hpp"
#include "duckdb/common/arrow/arrow_merge_event.hpp"
#include "duckdb/main/client_context.hpp"
#include "duckdb/common/arrow/physical_arrow_collector.hpp"

namespace duckdb {

unique_ptr<GlobalSinkState> PhysicalArrowBatchCollector::GetGlobalSinkState(ClientContext &context) const {
	return make_uniq<ArrowBatchGlobalState>(context, *this);
}

SinkFinalizeType PhysicalArrowBatchCollector::Finalize(Pipeline &pipeline, Event &event, ClientContext &context,
                                                       OperatorSinkFinalizeInput &input) const {
	auto &gstate = input.global_state.Cast<ArrowBatchGlobalState>();

	auto total_tuple_count = gstate.data.Count();
	if (total_tuple_count == 0) {
		// Empty sink: create the result with no record batches to convert
		gstate.result = make_uniq<ArrowQueryResult>(statement_type, properties, names, types,
		                                            context.GetClientProperties(), record_batch_size);
		return SinkFinalizeType::READY;
	}

	// Create the final query result up front
	gstate.result = make_uniq<ArrowQueryResult>(statement_type, properties, names, types, context.GetClientProperties(),
	                                            record_batch_size);
	// Spawn an event that will populate the conversion result
	auto &arrow_result = gstate.result->Cast<ArrowQueryResult>();
	auto new_event = make_shared_ptr<ArrowMergeEvent>(arrow_result, gstate.data, pipeline);
	event.InsertEvent(std::move(new_event));

	return SinkFinalizeType::READY;
}

} // namespace duckdb
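Finalize() above has two branches: an empty sink still produces an ArrowQueryResult (with no arrays to fill), while a non-empty one creates the result and schedules an ArrowMergeEvent to populate it in parallel. A simplified, self-contained model of that control flow follows; all types and names below are hypothetical stand-ins, not DuckDB API.

// finalize_sketch.cpp: simplified model of PhysicalArrowBatchCollector::Finalize.
// All types and names are hypothetical stand-ins, not DuckDB API.
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>

struct QueryResultModel {
	std::vector<std::unique_ptr<int>> arrays; // stand-in for Arrow array wrappers
};

struct MergeEventModel {
	// In DuckDB this would pre-allocate the arrays and schedule ArrowBatchTasks
	void Schedule(QueryResultModel &result, uint64_t tuple_count, uint64_t record_batch_size) {
		uint64_t total = tuple_count / record_batch_size + (tuple_count % record_batch_size != 0);
		for (uint64_t i = 0; i < total; i++) {
			result.arrays.push_back(std::make_unique<int>(0)); // one slot per record batch
		}
		std::printf("scheduled conversion into %llu record batches\n", (unsigned long long)total);
	}
};

void Finalize(QueryResultModel &result, uint64_t total_tuple_count, uint64_t record_batch_size) {
	if (total_tuple_count == 0) {
		// Empty sink: the result is created with no arrays and no event is scheduled
		std::printf("empty result\n");
		return;
	}
	MergeEventModel event;
	event.Schedule(result, total_tuple_count, record_batch_size);
}

int main() {
	QueryResultModel result;
	Finalize(result, 300000, 2048); // hypothetical sizes: 147 record batches
	Finalize(result, 0, 2048);      // empty path
	return 0;
}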