Skip to content

Commit

Permalink
Update vendored DuckDB sources to 741684c
Browse files Browse the repository at this point in the history
  • Loading branch information
duckdblabs-bot committed Oct 16, 2024
1 parent 741684c commit 31a6506
Show file tree
Hide file tree
Showing 24 changed files with 139 additions and 73 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -119,15 +119,15 @@ void CSVBufferManager::ResetBuffer(const idx_t buffer_idx) {
}
}

idx_t CSVBufferManager::GetBufferSize() {
idx_t CSVBufferManager::GetBufferSize() const {
return buffer_size;
}

idx_t CSVBufferManager::BufferCount() {
idx_t CSVBufferManager::BufferCount() const {
return cached_buffers.size();
}

bool CSVBufferManager::Done() {
bool CSVBufferManager::Done() const {
return done;
}

Expand All @@ -144,7 +144,7 @@ void CSVBufferManager::ResetBufferManager() {
}
}

string CSVBufferManager::GetFilePath() {
string CSVBufferManager::GetFilePath() const {
return file_path;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "duckdb/execution/operator/csv_scanner/base_scanner.hpp"

#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/skip_scanner.hpp"

namespace duckdb {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,53 @@ bool CSVSchema::Empty() const {
return columns.empty();
}

bool CSVSchema::SchemasMatch(string &error_message, vector<string> &names, vector<LogicalType> &types,
const string &cur_file_path) {
D_ASSERT(names.size() == types.size());
bool CSVSchema::SchemasMatch(string &error_message, SnifferResult &sniffer_result, const string &cur_file_path,
bool is_minimal_sniffer) const {
D_ASSERT(sniffer_result.names.size() == sniffer_result.return_types.size());
bool match = true;
unordered_map<string, TypeIdxPair> current_schema;
for (idx_t i = 0; i < names.size(); i++) {

for (idx_t i = 0; i < sniffer_result.names.size(); i++) {
// Populate our little schema
current_schema[names[i]] = {types[i], i};
current_schema[sniffer_result.names[i]] = {sniffer_result.return_types[i], i};
}
if (is_minimal_sniffer) {
auto min_sniffer = static_cast<AdaptiveSnifferResult &>(sniffer_result);
if (!min_sniffer.more_than_one_row) {
bool min_sniff_match = true;
// If we don't have more than one row, either the names must match or the types must match.
for (auto &column : columns) {
if (current_schema.find(column.name) == current_schema.end()) {
min_sniff_match = false;
break;
}
}
if (min_sniff_match) {
return true;
}
// Otherwise, the types must match.
min_sniff_match = true;
if (sniffer_result.return_types.size() == columns.size()) {
idx_t return_type_idx = 0;
for (auto &column : columns) {
if (column.type != sniffer_result.return_types[return_type_idx++]) {
min_sniff_match = false;
break;
}
}
} else {
min_sniff_match = false;
}
if (min_sniff_match) {
// If we got here, we have the right types but the wrong names, so let's fix the names
idx_t sniff_name_idx = 0;
for (auto &column : columns) {
sniffer_result.names[sniff_name_idx++] = column.name;
}
return true;
}
}
// If we got to this point, the minimal sniffer doesn't match, we throw an error.
}
// Here we check if the schema of a given file matched our original schema
// We consider it's not a match if:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/common/types/value.hpp"

namespace duckdb {
Expand Down Expand Up @@ -88,15 +88,14 @@ void CSVSniffer::SetResultOptions() {
options.dialect_options.rows_until_header = best_candidate->GetStateMachine().dialect_options.rows_until_header;
}

SnifferResult CSVSniffer::MinimalSniff() {
AdaptiveSnifferResult CSVSniffer::MinimalSniff() {
if (set_columns.IsSet()) {
// Nothing to see here
return SnifferResult(*set_columns.types, *set_columns.names);
return AdaptiveSnifferResult(*set_columns.types, *set_columns.names, true);
}
// Return Types detected
vector<LogicalType> return_types;
// Column Names detected
vector<string> names;

buffer_manager->sniffing = true;
constexpr idx_t result_size = 2;
Expand All @@ -106,7 +105,8 @@ SnifferResult CSVSniffer::MinimalSniff() {
ColumnCountScanner count_scanner(buffer_manager, state_machine, error_handler, result_size);
auto &sniffed_column_counts = count_scanner.ParseChunk();
if (sniffed_column_counts.result_position == 0) {
return {{}, {}};
// The file is empty, so we just return
return {{}, {}, false};
}

state_machine->dialect_options.num_cols = sniffed_column_counts[0].number_of_columns;
Expand All @@ -130,20 +130,20 @@ SnifferResult CSVSniffer::MinimalSniff() {

// Possibly Gather Header
vector<HeaderValue> potential_header;
if (start_row != 0) {
for (idx_t col_idx = 0; col_idx < data_chunk.ColumnCount(); col_idx++) {
auto &cur_vector = data_chunk.data[col_idx];
auto vector_data = FlatVector::GetData<string_t>(cur_vector);
auto &validity = FlatVector::Validity(cur_vector);
HeaderValue val;
if (validity.RowIsValid(0)) {
val = HeaderValue(vector_data[0]);
}
potential_header.emplace_back(val);

for (idx_t col_idx = 0; col_idx < data_chunk.ColumnCount(); col_idx++) {
auto &cur_vector = data_chunk.data[col_idx];
auto vector_data = FlatVector::GetData<string_t>(cur_vector);
auto &validity = FlatVector::Validity(cur_vector);
HeaderValue val;
if (validity.RowIsValid(0)) {
val = HeaderValue(vector_data[0]);
}
potential_header.emplace_back(val);
}
names = DetectHeaderInternal(buffer_manager->context, potential_header, *state_machine, set_columns,
best_sql_types_candidates_per_column_idx, options, *error_handler);

vector<string> names = DetectHeaderInternal(buffer_manager->context, potential_header, *state_machine, set_columns,
best_sql_types_candidates_per_column_idx, options, *error_handler);

for (idx_t column_idx = 0; column_idx < best_sql_types_candidates_per_column_idx.size(); column_idx++) {
LogicalType d_type = best_sql_types_candidates_per_column_idx[column_idx].back();
Expand All @@ -153,34 +153,33 @@ SnifferResult CSVSniffer::MinimalSniff() {
detected_types.push_back(d_type);
}

return {detected_types, names};
return {detected_types, names, sniffed_column_counts.result_position > 1};
}

SnifferResult CSVSniffer::AdaptiveSniff(CSVSchema &file_schema) {
SnifferResult CSVSniffer::AdaptiveSniff(const CSVSchema &file_schema) {
auto min_sniff_res = MinimalSniff();
bool run_full = error_handler->AnyErrors() || detection_error_handler->AnyErrors();
// Check if we are happy with the result or if we need to do more sniffing
if (!error_handler->AnyErrors() && !detection_error_handler->AnyErrors()) {
// If we got no errors, we also run full if schemas do not match.
if (!set_columns.IsSet() && !options.file_options.AnySet()) {
string error;
run_full =
!file_schema.SchemasMatch(error, min_sniff_res.names, min_sniff_res.return_types, options.file_path);
run_full = !file_schema.SchemasMatch(error, min_sniff_res, options.file_path, true);
}
}
if (run_full) {
// We run full sniffer
auto full_sniffer = SniffCSV();
if (!set_columns.IsSet() && !options.file_options.AnySet()) {
string error;
if (!file_schema.SchemasMatch(error, full_sniffer.names, full_sniffer.return_types, options.file_path) &&
if (!file_schema.SchemasMatch(error, full_sniffer, options.file_path, false) &&
!options.ignore_errors.GetValue()) {
throw InvalidInputException(error);
}
}
return full_sniffer;
}
return min_sniff_res;
return min_sniff_res.ToSnifferResult();
}
SnifferResult CSVSniffer::SniffCSV(bool force_match) {
buffer_manager->sniffing = true;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "duckdb/common/shared_ptr.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/main/client_data.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "duckdb/common/types/cast_helpers.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"

#include "utf8proc.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "duckdb/common/operator/integer_cast_operator.hpp"
#include "duckdb/common/string.hpp"
#include "duckdb/common/types/time.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"

namespace duckdb {
struct TryCastFloatingOperator {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_casting.hpp"

namespace duckdb {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"

namespace duckdb {
void CSVSniffer::ReplaceTypes() {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "duckdb/execution/operator/csv_scanner/csv_state_machine.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "utf8proc_wrapper.hpp"
#include "duckdb/main/error_manager.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "duckdb/execution/operator/csv_scanner/csv_state_machine.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"

namespace duckdb {

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp"

#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/skip_scanner.hpp"
#include "duckdb/function/table/read_csv.hpp"

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "duckdb/execution/operator/csv_scanner/global_csv_state.hpp"

#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/scanner_boundary.hpp"
#include "duckdb/execution/operator/csv_scanner/skip_scanner.hpp"
#include "duckdb/execution/operator/persistent/csv_rejects_table.hpp"
Expand Down
2 changes: 1 addition & 1 deletion src/duckdb/src/function/table/copy_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "duckdb/common/types/column/column_data_collection.hpp"
#include "duckdb/common/types/string_type.hpp"
#include "duckdb/common/vector_operations/vector_operations.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/function/copy_function.hpp"
#include "duckdb/function/scalar/string_functions.hpp"
#include "duckdb/function/table/read_csv.hpp"
Expand Down
2 changes: 1 addition & 1 deletion src/duckdb/src/function/table/read_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include "duckdb/common/union_by_name.hpp"
#include "duckdb/execution/operator/csv_scanner/global_csv_state.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_error.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/persistent/csv_rejects_table.hpp"
#include "duckdb/function/function_set.hpp"
#include "duckdb/main/client_context.hpp"
Expand Down
2 changes: 1 addition & 1 deletion src/duckdb/src/function/table/sniff_csv.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "duckdb/function/built_in_functions.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_reader_options.hpp"
#include "duckdb/common/types/data_chunk.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/csv_sniffer.hpp"
#include "duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp"
#include "duckdb/function/table_function.hpp"
#include "duckdb/main/client_context.hpp"
Expand Down
6 changes: 3 additions & 3 deletions src/duckdb/src/function/table/version/pragma_version.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef DUCKDB_PATCH_VERSION
#define DUCKDB_PATCH_VERSION "2"
#define DUCKDB_PATCH_VERSION "3-dev13"
#endif
#ifndef DUCKDB_MINOR_VERSION
#define DUCKDB_MINOR_VERSION 1
Expand All @@ -8,10 +8,10 @@
#define DUCKDB_MAJOR_VERSION 1
#endif
#ifndef DUCKDB_VERSION
#define DUCKDB_VERSION "v1.1.2"
#define DUCKDB_VERSION "v1.1.3-dev13"
#endif
#ifndef DUCKDB_SOURCE_ID
#define DUCKDB_SOURCE_ID "f680b7d08f"
#define DUCKDB_SOURCE_ID "8d3f8f8195"
#endif
#include "duckdb/function/table/system_functions.hpp"
#include "duckdb/main/database.hpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,15 @@ class CSVBufferManager {

void UnpinBuffer(const idx_t cache_idx);
//! Returns the buffer size set for this CSV buffer manager
idx_t GetBufferSize();
idx_t GetBufferSize() const;
//! Returns the number of buffers in the cached_buffers cache
idx_t BufferCount();
idx_t BufferCount() const;
//! If this buffer manager is done. In the context of a buffer manager it means that it read all buffers at least
//! once.
bool Done();
bool Done() const;

void ResetBufferManager();
string GetFilePath();
string GetFilePath() const;

ClientContext &context;
idx_t skip_rows = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once

#include "duckdb/common/types.hpp"
#include "duckdb/execution/operator/csv_scanner/sniffer/sniff_result.hpp"

namespace duckdb {
//! Basic CSV Column Info
Expand All @@ -23,8 +24,8 @@ struct CSVColumnInfo {
struct CSVSchema {
void Initialize(vector<string> &names, vector<LogicalType> &types, const string &file_path);
bool Empty() const;
bool SchemasMatch(string &error_message, vector<string> &names, vector<LogicalType> &types,
const string &cur_file_path);
bool SchemasMatch(string &error_message, SnifferResult &sniffer_result, const string &cur_file_path,
bool is_minimal_sniffer) const;

private:
static bool CanWeCastIt(LogicalTypeId source, LogicalTypeId destination);
Expand Down
Loading

0 comments on commit 31a6506

Please sign in to comment.