From d6ec36945701e293b480d901255673ae451e1f89 Mon Sep 17 00:00:00 2001 From: Norman Foerster Date: Thu, 13 Jun 2024 20:28:23 +0200 Subject: [PATCH 01/13] azure support --- src/functions/delta_scan.cpp | 119 +++++++++++++++++++++++++++-------- 1 file changed, 94 insertions(+), 25 deletions(-) diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index dd2a027..41c38cf 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -69,27 +69,43 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p ffi::EngineBuilder* builder; // For "regular" paths we early out with the default builder config - if (!StringUtil::StartsWith(path, "s3://")) { + if (!StringUtil::StartsWith(path, "s3://") && !StringUtil::StartsWith(path, "azure://")) { auto interface_builder_res = ffi::get_engine_builder(KernelUtils::ToDeltaString(path), DuckDBEngineError::AllocateError); return KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + path); } - auto end_of_container = path.find('/',5); + string bucket; + string path_in_bucket; + string secret_type; - if(end_of_container == string::npos) { - throw IOException("Invalid s3 url passed to delta scan: %s", path); + if (StringUtil::StartsWith(path, "s3://")) { + auto end_of_container = path.find('/',5); + + if(end_of_container == string::npos) { + throw IOException("Invalid s3 url passed to delta scan: %s", path); + } + bucket = path.substr(5, end_of_container-5); + path_in_bucket = path.substr(end_of_container); + secret_type = "s3"; + } else if (StringUtil::StartsWith(path, "azure://")) { + auto end_of_container = path.find('/',8); + + if(end_of_container == string::npos) { + throw IOException("Invalid azure url passed to delta scan: %s", path); + } + bucket = path.substr(8, end_of_container-8); + path_in_bucket = path.substr(end_of_container); + secret_type = "azure"; } - auto bucket = path.substr(5, end_of_container-5); - auto path_in_bucket = path.substr(end_of_container); auto interface_builder_res = ffi::get_engine_builder(KernelUtils::ToDeltaString(path), DuckDBEngineError::AllocateError); builder = KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + path); - // For S3 paths we need to trim the url, set the container, and fetch a potential secret + // For S3 or Azure paths we need to trim the url, set the container, and fetch a potential secret auto &secret_manager = SecretManager::Get(context); auto transaction = CatalogTransaction::GetSystemCatalogTransaction(context); - auto secret_match = secret_manager.LookupSecret(transaction, path, "s3"); + auto secret_match = secret_manager.LookupSecret(transaction, path, secret_type); // No secret: nothing left to do here! if (!secret_match.HasMatch()) { @@ -97,26 +113,79 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p } const auto &kv_secret = dynamic_cast(*secret_match.secret_entry->secret); - auto key_id = kv_secret.TryGetValue("key_id").ToString(); - auto secret = kv_secret.TryGetValue("secret").ToString(); - auto session_token = kv_secret.TryGetValue("session_token").ToString(); - auto region = kv_secret.TryGetValue("region").ToString(); - if (key_id.empty() && secret.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("skip_signature"), KernelUtils::ToDeltaString("true")); - } + // Here you would need to add the logic for setting the builder options for Azure + // This is just a placeholder and will need to be replaced with the actual logic + if (secret_type == "s3") { + auto key_id = kv_secret.TryGetValue("key_id").ToString(); + auto secret = kv_secret.TryGetValue("secret").ToString(); + auto session_token = kv_secret.TryGetValue("session_token").ToString(); + auto region = kv_secret.TryGetValue("region").ToString(); - if (!key_id.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_access_key_id"), KernelUtils::ToDeltaString(key_id)); - } - if (!secret.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_secret_access_key"), KernelUtils::ToDeltaString(secret)); - } - if (!session_token.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_session_token"), KernelUtils::ToDeltaString(session_token)); - } - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_region"), KernelUtils::ToDeltaString(region)); + if (key_id.empty() && secret.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("skip_signature"), KernelUtils::ToDeltaString("true")); + } + + if (!key_id.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_access_key_id"), KernelUtils::ToDeltaString(key_id)); + } + if (!secret.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_secret_access_key"), KernelUtils::ToDeltaString(secret)); + } + if (!session_token.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_session_token"), KernelUtils::ToDeltaString(session_token)); + } + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_region"), KernelUtils::ToDeltaString(region)); + + } else if (secret_type == "azure") { + + auto connection_string = kv_secret.TryGetValue("connection_string").ToString(); + auto account_name = kv_secret.TryGetValue("account_name").ToString(); + auto account_key = kv_secret.TryGetValue("account_key").ToString(); + auto client_id = kv_secret.TryGetValue("client_id").ToString(); + auto client_secret = kv_secret.TryGetValue("client_secret").ToString(); + auto tenant_id = kv_secret.TryGetValue("tenant_id").ToString(); + auto azure_client_certificate_path = kv_secret.TryGetValue("certificate_path").ToString(); + auto sas_token = kv_secret.TryGetValue("sas_token").ToString(); + auto http_proxy = kv_secret.TryGetValue("http_proxy").ToString(); + auto proxy_user_name = kv_secret.TryGetValue("proxy_user_name").ToString(); + auto proxy_password = kv_secret.TryGetValue("proxy_password").ToString(); + + if (!connection_string.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_storage_connection_string"), KernelUtils::ToDeltaString(connection_string)); + } + if (!account_name.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_storage_account_name"), KernelUtils::ToDeltaString(account_name)); + } + if (!account_key.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_storage_account_key"), KernelUtils::ToDeltaString(account_key)); + } + if (!client_id.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_id"), KernelUtils::ToDeltaString(client_id)); + } + if (!client_secret.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_secret"), KernelUtils::ToDeltaString(client_secret)); + } + if (!tenant_id.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_tenant_id"), KernelUtils::ToDeltaString(tenant_id)); + } + if (!azure_client_certificate_path.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_certificate_path"), KernelUtils::ToDeltaString(azure_client_certificate_path)); + } + if (!sas_token.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_sas_token"), KernelUtils::ToDeltaString(sas_token)); + } + if (!http_proxy.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("http_proxy"), KernelUtils::ToDeltaString(http_proxy)); + } + if (!proxy_user_name.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("proxy_user_name"), KernelUtils::ToDeltaString(proxy_user_name)); + } + if (!proxy_password.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("proxy_password"), KernelUtils::ToDeltaString(proxy_password)); + } + } return builder; } From a2ddb6c7d65cbae8e7466a4d35c315b2c34b8799 Mon Sep 17 00:00:00 2001 From: Norman Foerster Date: Thu, 13 Jun 2024 22:02:59 +0200 Subject: [PATCH 02/13] azure test impl --- src/functions/delta_scan.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index 41c38cf..1b7d894 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -69,7 +69,7 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p ffi::EngineBuilder* builder; // For "regular" paths we early out with the default builder config - if (!StringUtil::StartsWith(path, "s3://") && !StringUtil::StartsWith(path, "azure://")) { + if (!StringUtil::StartsWith(path, "s3://") && !StringUtil::StartsWith(path, "azure://") && !StringUtil::StartsWith(path, "az://") && !StringUtil::StartsWith(path, "abfss://")) { auto interface_builder_res = ffi::get_engine_builder(KernelUtils::ToDeltaString(path), DuckDBEngineError::AllocateError); return KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + path); } @@ -90,6 +90,24 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p } else if (StringUtil::StartsWith(path, "azure://")) { auto end_of_container = path.find('/',8); + if(end_of_container == string::npos) { + throw IOException("Invalid azure url passed to delta scan: %s", path); + } + bucket = path.substr(8, end_of_container-8); + path_in_bucket = path.substr(end_of_container); + secret_type = "azure"; + } else if (StringUtil::StartsWith(path, "az://")) { + auto end_of_container = path.find('/',5); + + if(end_of_container == string::npos) { + throw IOException("Invalid azure url passed to delta scan: %s", path); + } + bucket = path.substr(5, end_of_container-5); + path_in_bucket = path.substr(end_of_container); + secret_type = "azure"; + } else if (StringUtil::StartsWith(path, "abfss://")) { + auto end_of_container = path.find('/',8); + if(end_of_container == string::npos) { throw IOException("Invalid azure url passed to delta scan: %s", path); } From efd4db01aeb81dec4900d9957a67360eb83fbd18 Mon Sep 17 00:00:00 2001 From: Norman Foerster Date: Fri, 14 Jun 2024 11:11:15 +0200 Subject: [PATCH 03/13] update azure values for azure extension --- src/functions/delta_scan.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index 1b7d894..05b958e 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -159,12 +159,12 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p auto connection_string = kv_secret.TryGetValue("connection_string").ToString(); auto account_name = kv_secret.TryGetValue("account_name").ToString(); - auto account_key = kv_secret.TryGetValue("account_key").ToString(); + auto endpoint = kv_secret.TryGetValue("endpoint").ToString(); + auto credential_chain = kv_secret.TryGetValue("credential_chain").ToString(); auto client_id = kv_secret.TryGetValue("client_id").ToString(); auto client_secret = kv_secret.TryGetValue("client_secret").ToString(); auto tenant_id = kv_secret.TryGetValue("tenant_id").ToString(); - auto azure_client_certificate_path = kv_secret.TryGetValue("certificate_path").ToString(); - auto sas_token = kv_secret.TryGetValue("sas_token").ToString(); + auto certificate_path = kv_secret.TryGetValue("certificate_path").ToString(); auto http_proxy = kv_secret.TryGetValue("http_proxy").ToString(); auto proxy_user_name = kv_secret.TryGetValue("proxy_user_name").ToString(); auto proxy_password = kv_secret.TryGetValue("proxy_password").ToString(); @@ -175,8 +175,11 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p if (!account_name.empty()) { ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_storage_account_name"), KernelUtils::ToDeltaString(account_name)); } - if (!account_key.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_storage_account_key"), KernelUtils::ToDeltaString(account_key)); + if (!endpoint.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_endpoint"), KernelUtils::ToDeltaString(endpoint)); + } + if (!credential_chain.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_credential_chain"), KernelUtils::ToDeltaString(credential_chain)); } if (!client_id.empty()) { ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_id"), KernelUtils::ToDeltaString(client_id)); @@ -187,11 +190,8 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p if (!tenant_id.empty()) { ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_tenant_id"), KernelUtils::ToDeltaString(tenant_id)); } - if (!azure_client_certificate_path.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_certificate_path"), KernelUtils::ToDeltaString(azure_client_certificate_path)); - } - if (!sas_token.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_sas_token"), KernelUtils::ToDeltaString(sas_token)); + if (!certificate_path.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_certificate_path"), KernelUtils::ToDeltaString(certificate_path)); } if (!http_proxy.empty()) { ffi::set_builder_option(builder, KernelUtils::ToDeltaString("http_proxy"), KernelUtils::ToDeltaString(http_proxy)); From 90455e5f55f4caf682a81156c7ccdae2e47f7471 Mon Sep 17 00:00:00 2001 From: Norman Foerster Date: Tue, 25 Jun 2024 10:28:47 +0200 Subject: [PATCH 04/13] working azure setting --- .gitignore | 4 ++ extension_config.cmake | 3 + src/functions/delta_scan.cpp | 87 ++++++++++++++++++++--------- test/sql/generated/azure.emulator.x | 25 +++++++++ vcpkg.json | 3 + 5 files changed, 96 insertions(+), 26 deletions(-) create mode 100644 test/sql/generated/azure.emulator.x diff --git a/.gitignore b/.gitignore index 2cf38b5..31bc287 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,7 @@ testext test/python/__pycache__/ .Rhistory data/generated +__azurite*__.json +__blobstorage__ +.venv +.vscode \ No newline at end of file diff --git a/extension_config.cmake b/extension_config.cmake index 46e7a27..6cfa12c 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -9,6 +9,9 @@ duckdb_extension_load(delta # Build the httpfs extension to test with s3/http duckdb_extension_load(httpfs) +# Build the azure extension to test with azure +duckdb_extension_load(azure) + # Build the tpch and tpcds extension for testing/benchmarking duckdb_extension_load(tpch) duckdb_extension_load(tpcds) diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index 05b958e..fb3643c 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -18,6 +18,7 @@ #include #include +#include namespace duckdb { @@ -65,11 +66,23 @@ static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::Kernel ffi::visit_scan_data(engine_data, selection_vec, engine_context, visit_callback); } +std::string parseFromConnectionString(const std::string& connectionString, const std::string& key) { + std::regex pattern(key + "=([^;]+);"); + std::smatch matches; + if (std::regex_search(connectionString, matches, pattern) && matches.size() > 1) { + // The second match ([1]) contains the access key + return matches[1].str(); + } else { + // If no access key is found, return an empty string or handle as needed + return ""; + } +} + static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &path) { ffi::EngineBuilder* builder; // For "regular" paths we early out with the default builder config - if (!StringUtil::StartsWith(path, "s3://") && !StringUtil::StartsWith(path, "azure://") && !StringUtil::StartsWith(path, "az://") && !StringUtil::StartsWith(path, "abfss://")) { + if (!StringUtil::StartsWith(path, "s3://") && !StringUtil::StartsWith(path, "azure://") && !StringUtil::StartsWith(path, "az://") && !StringUtil::StartsWith(path, "abfs://") && !StringUtil::StartsWith(path, "abfss://")) { auto interface_builder_res = ffi::get_engine_builder(KernelUtils::ToDeltaString(path), DuckDBEngineError::AllocateError); return KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + path); } @@ -87,7 +100,7 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p bucket = path.substr(5, end_of_container-5); path_in_bucket = path.substr(end_of_container); secret_type = "s3"; - } else if (StringUtil::StartsWith(path, "azure://")) { + } else if ((StringUtil::StartsWith(path, "azure://")) || (StringUtil::StartsWith(path, "abfss://"))) { auto end_of_container = path.find('/',8); if(end_of_container == string::npos) { @@ -105,8 +118,8 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p bucket = path.substr(5, end_of_container-5); path_in_bucket = path.substr(end_of_container); secret_type = "azure"; - } else if (StringUtil::StartsWith(path, "abfss://")) { - auto end_of_container = path.find('/',8); + } else if (StringUtil::StartsWith(path, "abfs://")) { + auto end_of_container = path.find('/',7); if(end_of_container == string::npos) { throw IOException("Invalid azure url passed to delta scan: %s", path); @@ -157,10 +170,12 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p } else if (secret_type == "azure") { + // azure seems to be super complicated as we need to cover duckdb azure plugin and delta RS builder + // and both require different settings + auto connection_string = kv_secret.TryGetValue("connection_string").ToString(); auto account_name = kv_secret.TryGetValue("account_name").ToString(); auto endpoint = kv_secret.TryGetValue("endpoint").ToString(); - auto credential_chain = kv_secret.TryGetValue("credential_chain").ToString(); auto client_id = kv_secret.TryGetValue("client_id").ToString(); auto client_secret = kv_secret.TryGetValue("client_secret").ToString(); auto tenant_id = kv_secret.TryGetValue("tenant_id").ToString(); @@ -168,40 +183,60 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p auto http_proxy = kv_secret.TryGetValue("http_proxy").ToString(); auto proxy_user_name = kv_secret.TryGetValue("proxy_user_name").ToString(); auto proxy_password = kv_secret.TryGetValue("proxy_password").ToString(); + auto chain = kv_secret.TryGetValue("chain").ToString(); + + if (account_name == "devstoreaccount1" || connection_string.find("devstoreaccount1") != string::npos) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_emulator"), KernelUtils::ToDeltaString("true")); //needed for delta RS builder + } - if (!connection_string.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_storage_connection_string"), KernelUtils::ToDeltaString(connection_string)); + if (!connection_string.empty() && connection_string != "NULL") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_storage_connection_string"), KernelUtils::ToDeltaString(connection_string)); //needed for duckdb azure plugin + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("connection_string"), KernelUtils::ToDeltaString(connection_string)); //needed for duckdb azure plugin + account_name = parseFromConnectionString(connection_string, "AccountName"); + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("access_key"), KernelUtils::ToDeltaString(parseFromConnectionString(connection_string, "AccountKey"))); //needed for delta RS builder } - if (!account_name.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_storage_account_name"), KernelUtils::ToDeltaString(account_name)); + if (!account_name.empty() && account_name != "NULL") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_account_name"), KernelUtils::ToDeltaString(account_name)); //needed for duckdb azure plugin + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("account_name"), KernelUtils::ToDeltaString(account_name)); //needed for delta RS builder } - if (!endpoint.empty()) { + if (!endpoint.empty() && endpoint != "NULL") { ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_endpoint"), KernelUtils::ToDeltaString(endpoint)); + } else { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_endpoint"), KernelUtils::ToDeltaString("https://" + account_name + ".blob.core.windows.net/")); //needed? Does that work with dfs files system? } - if (!credential_chain.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_credential_chain"), KernelUtils::ToDeltaString(credential_chain)); + if (!chain.empty() && chain != "NULL") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("provider"), KernelUtils::ToDeltaString("credential_chain")); //needed for duckdb azure plugin + + if (chain.find("cli") != std::string::npos) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_azure_cli"), KernelUtils::ToDeltaString("true")); //dont know if that is the right way, but we need to tell delta RS builder to authenticate with azure cli + } + + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_credential_chain"), KernelUtils::ToDeltaString(chain)); //needed for duckdb azure plugin, dont know if all three are necessary + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("credential_chain"), KernelUtils::ToDeltaString(chain)); //needed for duckdb azure plugin, dont know if all three are necessary + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("chain"), KernelUtils::ToDeltaString(chain)); //needed for duckdb azure plugin, dont know if all three are necessary } - if (!client_id.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_id"), KernelUtils::ToDeltaString(client_id)); + if (!client_id.empty() && client_id != "NULL") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_id"), KernelUtils::ToDeltaString(client_id)); //untested } - if (!client_secret.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_secret"), KernelUtils::ToDeltaString(client_secret)); + if (!client_secret.empty() && client_secret != "NULL") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_secret"), KernelUtils::ToDeltaString(client_secret)); //untested } - if (!tenant_id.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_tenant_id"), KernelUtils::ToDeltaString(tenant_id)); + if (!tenant_id.empty() && tenant_id != "NULL") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_tenant_id"), KernelUtils::ToDeltaString(tenant_id)); //needed for duckdb azure plugin } - if (!certificate_path.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_certificate_path"), KernelUtils::ToDeltaString(certificate_path)); + if (!certificate_path.empty() && certificate_path != "NULL") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_certificate_path"), KernelUtils::ToDeltaString(certificate_path)); //untested } - if (!http_proxy.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("http_proxy"), KernelUtils::ToDeltaString(http_proxy)); + if (!http_proxy.empty() && http_proxy != "NULL") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("http_proxy"), KernelUtils::ToDeltaString(http_proxy)); //untested } - if (!proxy_user_name.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("proxy_user_name"), KernelUtils::ToDeltaString(proxy_user_name)); + if (!proxy_user_name.empty() && proxy_user_name != "NULL") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("proxy_user_name"), KernelUtils::ToDeltaString(proxy_user_name)); //untested } - if (!proxy_password.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("proxy_password"), KernelUtils::ToDeltaString(proxy_password)); + if (!proxy_password.empty() && proxy_password != "NULL") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("proxy_password"), KernelUtils::ToDeltaString(proxy_password)); //untested } + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("container_name"), KernelUtils::ToDeltaString(bucket)); // needed ? } return builder; diff --git a/test/sql/generated/azure.emulator.x b/test/sql/generated/azure.emulator.x new file mode 100644 index 0000000..86b67ec --- /dev/null +++ b/test/sql/generated/azure.emulator.x @@ -0,0 +1,25 @@ +# name: test/sql/generated/azure.emulator +# description: test delta scan on azure emulator data using secret +# group: [delta_generated] + +require parquet + +require httpfs + +require azure + +require delta + +require-env GENERATED_AZURE_DATA_AVAILABLE + +statement ok +CREATE SECRET azure_1 (TYPE AZURE, CONNECTION_STRING 'AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1') + +# Run modified tpch q06 against the remote data +query I rowsort q1 +SELECT + * +FROM + delta_scan('az://test-bucket-ceiveran/delta_testing/lineitem_sf0_01/delta_lake/') +LIMIT 100 +---- \ No newline at end of file diff --git a/vcpkg.json b/vcpkg.json index 85936bf..0cefd94 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -1,5 +1,8 @@ { "dependencies": [ + "azure-identity-cpp", + "azure-storage-blobs-cpp", + "azure-storage-files-datalake-cpp", "openssl" ] } \ No newline at end of file From 4688190ce50e0adadee097a46a28aee5b3cdd5cc Mon Sep 17 00:00:00 2001 From: Norman Foerster Date: Tue, 25 Jun 2024 14:03:17 +0200 Subject: [PATCH 05/13] load azure functionality from duckdb azure plugin --- .gitmodules | 3 +++ duckdb | 2 +- duckdb_azure | 1 + extension-ci-tools | 2 +- extension_config.cmake | 4 +++- 5 files changed, 9 insertions(+), 3 deletions(-) create mode 160000 duckdb_azure diff --git a/.gitmodules b/.gitmodules index a55d71e..5131848 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,3 +6,6 @@ path = extension-ci-tools url = git@github.com:duckdb/extension-ci-tools.git branch = main +[submodule "duckdb_azure"] + path = duckdb_azure + url = https://github.com/duckdb/duckdb_azure.git diff --git a/duckdb b/duckdb index 1f98600..7b8efd3 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 1f98600c2cf8722a6d2f2d805bb4af5e701319fc +Subproject commit 7b8efd3d0fab38ec9dae467861a317af3f1d7f3e diff --git a/duckdb_azure b/duckdb_azure new file mode 160000 index 0000000..097bb13 --- /dev/null +++ b/duckdb_azure @@ -0,0 +1 @@ +Subproject commit 097bb13aadb186ca43ae9b5dc6a21c20e56ad4dd diff --git a/extension-ci-tools b/extension-ci-tools index c0cc931..71b8a60 160000 --- a/extension-ci-tools +++ b/extension-ci-tools @@ -1 +1 @@ -Subproject commit c0cc9319492bfa38344c2f28bd35f2304c74cdde +Subproject commit 71b8a603ea24b1ac8a2cff134aca28163576548f diff --git a/extension_config.cmake b/extension_config.cmake index 6cfa12c..369abd0 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -10,7 +10,9 @@ duckdb_extension_load(delta duckdb_extension_load(httpfs) # Build the azure extension to test with azure -duckdb_extension_load(azure) +duckdb_extension_load(azure + SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/duckdb_azure +) # Build the tpch and tpcds extension for testing/benchmarking duckdb_extension_load(tpch) From 49810dac2fb9fbf8ef856a7b58e39c14981fb427 Mon Sep 17 00:00:00 2001 From: Norman Foerster Date: Tue, 25 Jun 2024 14:58:45 +0200 Subject: [PATCH 06/13] added tests --- .../{azure.emulator.x => azure.emulator.test} | 0 .../azure_emulator_with_partitions.test | 25 +++++++++++++++++++ 2 files changed, 25 insertions(+) rename test/sql/generated/{azure.emulator.x => azure.emulator.test} (100%) create mode 100644 test/sql/generated/azure_emulator_with_partitions.test diff --git a/test/sql/generated/azure.emulator.x b/test/sql/generated/azure.emulator.test similarity index 100% rename from test/sql/generated/azure.emulator.x rename to test/sql/generated/azure.emulator.test diff --git a/test/sql/generated/azure_emulator_with_partitions.test b/test/sql/generated/azure_emulator_with_partitions.test new file mode 100644 index 0000000..78946b4 --- /dev/null +++ b/test/sql/generated/azure_emulator_with_partitions.test @@ -0,0 +1,25 @@ +# name: test/sql/generated/azure.emulator +# description: test delta scan on azure emulator data using secret +# group: [delta_generated] + +require parquet + +require httpfs + +require azure + +require delta + +require-env GENERATED_AZURE_DATA_AVAILABLE + +statement ok +CREATE SECRET azure_1 (TYPE AZURE, CONNECTION_STRING 'AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1') + +# Run modified tpch q06 against the remote data +query I rowsort q1 +SELECT + * +FROM + delta_scan('az://test-bucket-ceiveran/delta_testing/simple_partitioned/delta_lake/') +where part=1 +---- \ No newline at end of file From aa0b52b14d459bece34efb305d39ca61a17c7ebc Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Thu, 27 Jun 2024 12:10:00 +0200 Subject: [PATCH 07/13] add testing for azure --- .github/workflows/CloudTesting.yml | 80 ++++++++++++ .github/workflows/LocalTesting.yml | 66 ++++++++++ .gitmodules | 5 +- duckdb_azure | 1 - extension_config.cmake | 4 +- scripts/upload_test_files_to_azurite.sh | 21 ++++ src/functions/delta_scan.cpp | 114 +++++++++--------- test/sql/cloud/azure/cli_auth.test | 37 ++++++ .../cloud/azure/hierarchical_namespace.test | 42 +++++++ test/sql/cloud/azure/spn_auth.test | 38 ++++++ test/sql/cloud/azure/unauthenticated.test | 47 ++++++++ test/sql/cloud/azurite/azurite.test | 31 +++++ 12 files changed, 426 insertions(+), 60 deletions(-) create mode 100644 .github/workflows/CloudTesting.yml create mode 100644 .github/workflows/LocalTesting.yml delete mode 160000 duckdb_azure create mode 100755 scripts/upload_test_files_to_azurite.sh create mode 100644 test/sql/cloud/azure/cli_auth.test create mode 100644 test/sql/cloud/azure/hierarchical_namespace.test create mode 100644 test/sql/cloud/azure/spn_auth.test create mode 100644 test/sql/cloud/azure/unauthenticated.test create mode 100644 test/sql/cloud/azurite/azurite.test diff --git a/.github/workflows/CloudTesting.yml b/.github/workflows/CloudTesting.yml new file mode 100644 index 0000000..f75a37d --- /dev/null +++ b/.github/workflows/CloudTesting.yml @@ -0,0 +1,80 @@ +name: Cloud functional tests +on: [push, repository_dispatch] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} + cancel-in-progress: true +defaults: + run: + shell: bash + +jobs: + azure-tests-linux: + name: Azure tests (Linux) + runs-on: ubuntu-latest + env: + VCPKG_TARGET_TRIPLET: x64-linux + VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake + GEN: Ninja + DUCKDB_PLATFORM: linux_amd64 + + steps: + - name: Install required ubuntu packages + run: | + sudo apt-get update -y -qq + sudo apt-get install -y -qq software-properties-common + sudo add-apt-repository ppa:git-core/ppa + sudo apt-get update -y -qq + sudo apt-get install -y -qq ninja-build make gcc-multilib g++-multilib zip unzip build-essential checkinstall curl libz-dev openssh-client + + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + submodules: 'true' + + - name: Setup Ccache + uses: hendrikmuhs/ccache-action@v1.2.11 # Note: pinned due to GLIBC incompatibility in later releases + with: + key: ${{ github.job }}-${{ matrix.duckdb_arch }} + + - name: Setup vcpkg + uses: lukka/run-vcpkg@v11.1 + with: + vcpkgGitCommitId: a1a1cbc975abf909a6c8985a6a2b8fe20bbd9bd6 + + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + + - name: Build extension + env: + GEN: ninja + run: | + make release + + - name: Test with Service Principal (SPN) in env vars + env: + AZURE_CLIENT_ID: ${{secrets.AZURE_CLIENT_ID}} + AZURE_CLIENT_SECRET: ${{secrets.AZURE_CLIENT_SECRET}} + AZURE_TENANT_ID: ${{secrets.AZURE_TENANT_ID}} + AZURE_STORAGE_ACCOUNT: ${{secrets.AZURE_STORAGE_ACCOUNT}} + run: | + python3 duckdb/scripts/run_tests_one_by_one.py ./build/release/test/unittest "*test/sql/cloud/*" + + - name: Test with SPN logged in in azure-cli + env: + AZURE_STORAGE_ACCOUNT: ${{secrets.AZURE_STORAGE_ACCOUNT}} + DUCKDB_AZ_CLI_LOGGED_IN: 1 + run: | + az login --service-principal -u ${{secrets.AZURE_CLIENT_ID}} -p ${{secrets.AZURE_CLIENT_SECRET}} --tenant ${{secrets.AZURE_TENANT_ID}} + python3 duckdb/scripts/run_tests_one_by_one.py ./build/release/test/unittest "*test/sql/cloud/*" + + - name: Log out azure-cli + if: always() + run: | + az logout + + - name: Tests that focus on public non-authenticated requests + env: + AZURE_STORAGE_ACCOUNT: ${{secrets.AZURE_STORAGE_ACCOUNT}} + DUCKDB_AZURE_PUBLIC_CONTAINER_AVAILABLE: 1 + run: | + python3 duckdb/scripts/run_tests_one_by_one.py ./build/release/test/unittest "*test/sql/cloud/*" \ No newline at end of file diff --git a/.github/workflows/LocalTesting.yml b/.github/workflows/LocalTesting.yml new file mode 100644 index 0000000..23b31ac --- /dev/null +++ b/.github/workflows/LocalTesting.yml @@ -0,0 +1,66 @@ +name: Local functional tests +on: [push, pull_request,repository_dispatch] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} + cancel-in-progress: true +defaults: + run: + shell: bash + +jobs: + azurite-tests-linux: + name: Azurite (local azure test server) tests (Linux) + runs-on: ubuntu-latest + container: 'quay.io/pypa/manylinux2014_x86_64' + env: + VCPKG_TARGET_TRIPLET: 'x64-linux' + GEN: Ninja + VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake + AZURE_STORAGE_CONNECTION_STRING: 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;' + AZURE_STORAGE_ACCOUNT: devstoreaccount1 + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + submodules: 'true' + + - name: install Azure test service + run: | + yum install -y nodejs npm + npm install -g azurite + echo -e "[azure-cli]\nname=Azure CLI\nbaseurl=https://packages.microsoft.com/yumrepos/azure-cli\nenabled=1\ngpgcheck=1\ngpgkey=https://packages.microsoft.com/keys/microsoft.asc" | tee /etc/yum.repos.d/azure-cli.repo + yum install -y azure-cli + + - name: Setup ManyLinux2014 + run: | + ./duckdb/scripts/setup_manylinux2014.sh general aws-cli ccache ssh openssl python_alias + + - name: Setup vcpkg + uses: lukka/run-vcpkg@v11.1 + with: + vcpkgGitCommitId: a1a1cbc975abf909a6c8985a6a2b8fe20bbd9bd6 + + # Build extension + - name: Build extension + env: + GEN: ninja + run: | + make release + + - name: Launch & populate Azure test service + run: | + azurite > azurite_log.txt 2>&1 & + sleep 10 + ./scripts/upload_test_files_to_azurite.sh + + - name: Test extension + run: | + make test + + - name: Azure test server log + if: always() + shell: bash + run: | + echo "## azurite" + cat azurite_log.txt \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index 5131848..cd15846 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,7 +5,4 @@ [submodule "extension-ci-tools"] path = extension-ci-tools url = git@github.com:duckdb/extension-ci-tools.git - branch = main -[submodule "duckdb_azure"] - path = duckdb_azure - url = https://github.com/duckdb/duckdb_azure.git + branch = main \ No newline at end of file diff --git a/duckdb_azure b/duckdb_azure deleted file mode 160000 index 097bb13..0000000 --- a/duckdb_azure +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 097bb13aadb186ca43ae9b5dc6a21c20e56ad4dd diff --git a/extension_config.cmake b/extension_config.cmake index 369abd0..16571c2 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -11,7 +11,9 @@ duckdb_extension_load(httpfs) # Build the azure extension to test with azure duckdb_extension_load(azure - SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/duckdb_azure + LOAD_TESTS + GIT_URL https://github.com/duckdb/duckdb_azure + GIT_TAG 49b63dc8cd166952a0a34dfd54e6cfe5b823e05e ) # Build the tpch and tpcds extension for testing/benchmarking diff --git a/scripts/upload_test_files_to_azurite.sh b/scripts/upload_test_files_to_azurite.sh new file mode 100755 index 0000000..f1ae34e --- /dev/null +++ b/scripts/upload_test_files_to_azurite.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Default Azurite connection string (see: https://github.com/Azure/Azurite) +conn_string="DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;" + +# Create container +az storage container create -n delta-testing-private --connection-string "${conn_string}" +az storage container create -n delta-testing-public --connection-string "${conn_string}" --public-access blob + +copy_file() { + local from="${1}" + local to="${2}" + az storage blob upload --file "${from}" --name "${to}" --container-name "delta-testing-private" --connection-string "${conn_string}" + az storage blob upload --file "${from}" --name "${to}" --container-name "delta-testing-public" --connection-string "${conn_string}" +} + +cd ./build/debug/rust/src/delta_kernel/acceptance/tests/dat/out/reader_tests/generated +while read filepath; do + remote_filepath=dat/"$(echo "${filepath}" | cut -c 3-)" + copy_file "${filepath}" "${remote_filepath}" +done < <(find . -type f) \ No newline at end of file diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index 40dd143..3929c57 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -66,16 +66,25 @@ static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::Kernel ffi::visit_scan_data(engine_data, selection_vec, engine_context, visit_callback); } -std::string parseFromConnectionString(const std::string& connectionString, const std::string& key) { - std::regex pattern(key + "=([^;]+);"); +string ParseAccountNameFromEndpoint(const string& endpoint) { + if (!StringUtil::StartsWith(endpoint, "https://")) { + return ""; + } + auto result = endpoint.find('.', 8); + if (result == endpoint.npos) { + return ""; + } + return endpoint.substr(8,result-8); +} + +string parseFromConnectionString(const string& connectionString, const string& key) { + std::regex pattern(key + "=([^;]+)(?=;|$)"); std::smatch matches; if (std::regex_search(connectionString, matches, pattern) && matches.size() > 1) { // The second match ([1]) contains the access key return matches[1].str(); - } else { - // If no access key is found, return an empty string or handle as needed - return ""; } + return ""; } static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &path) { @@ -169,75 +178,72 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_region"), KernelUtils::ToDeltaString(region)); } else if (secret_type == "azure") { - // azure seems to be super complicated as we need to cover duckdb azure plugin and delta RS builder // and both require different settings - auto connection_string = kv_secret.TryGetValue("connection_string").ToString(); auto account_name = kv_secret.TryGetValue("account_name").ToString(); auto endpoint = kv_secret.TryGetValue("endpoint").ToString(); auto client_id = kv_secret.TryGetValue("client_id").ToString(); auto client_secret = kv_secret.TryGetValue("client_secret").ToString(); auto tenant_id = kv_secret.TryGetValue("tenant_id").ToString(); - auto certificate_path = kv_secret.TryGetValue("certificate_path").ToString(); - auto http_proxy = kv_secret.TryGetValue("http_proxy").ToString(); - auto proxy_user_name = kv_secret.TryGetValue("proxy_user_name").ToString(); - auto proxy_password = kv_secret.TryGetValue("proxy_password").ToString(); auto chain = kv_secret.TryGetValue("chain").ToString(); + auto provider = kv_secret.GetProvider(); - if (account_name == "devstoreaccount1" || connection_string.find("devstoreaccount1") != string::npos) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_emulator"), KernelUtils::ToDeltaString("true")); //needed for delta RS builder - } + if (provider == "credential_chain") { + // Authentication option 1a: using the cli authentication + if (chain.find("cli") != std::string::npos) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_azure_cli"), KernelUtils::ToDeltaString("true")); + } + // Authentication option 1b: non-cli credential chains will just "hope for the best" technically since we are using the default + // credential chain provider duckDB and delta-kernel-rs should find the same auth + } else if (!connection_string.empty() && connection_string != "NULL") { - if (!connection_string.empty() && connection_string != "NULL") { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_storage_connection_string"), KernelUtils::ToDeltaString(connection_string)); //needed for duckdb azure plugin - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("connection_string"), KernelUtils::ToDeltaString(connection_string)); //needed for duckdb azure plugin + // Authentication option 2: a connection string based on account key + auto account_key = parseFromConnectionString(connection_string, "AccountKey"); account_name = parseFromConnectionString(connection_string, "AccountName"); - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("access_key"), KernelUtils::ToDeltaString(parseFromConnectionString(connection_string, "AccountKey"))); //needed for delta RS builder + // Authentication option 2: a connection string based on account key + if (!account_name.empty() && !account_key.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("account_key"), + KernelUtils::ToDeltaString(account_key)); + } else { + // Authentication option 2b: a connection string based on SAS token + endpoint = parseFromConnectionString(connection_string, "BlobEndpoint"); + if (account_name.empty()) { + account_name = ParseAccountNameFromEndpoint(endpoint); + } + auto sas_token = parseFromConnectionString(connection_string, "SharedAccessSignature"); + if (!sas_token.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("sas_token"), + KernelUtils::ToDeltaString(sas_token)); + } + } + } else if (provider == "service_principal") { + if (!client_id.empty() && client_id != "NULL") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_id"), KernelUtils::ToDeltaString(client_id)); + } + if (!client_secret.empty() && client_secret != "NULL") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_secret"), KernelUtils::ToDeltaString(client_secret)); + } + if (!tenant_id.empty() && tenant_id != "NULL") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_tenant_id"), KernelUtils::ToDeltaString(tenant_id)); + } + } else { + // Authentication option 3: no authentication, just an account name + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_skip_signature"), KernelUtils::ToDeltaString("true")); + } + // Set the use_emulator option for when the azurite test server is used + if (account_name == "devstoreaccount1" || connection_string.find("devstoreaccount1") != string::npos) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_emulator"), KernelUtils::ToDeltaString("true")); } if (!account_name.empty() && account_name != "NULL") { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_account_name"), KernelUtils::ToDeltaString(account_name)); //needed for duckdb azure plugin ffi::set_builder_option(builder, KernelUtils::ToDeltaString("account_name"), KernelUtils::ToDeltaString(account_name)); //needed for delta RS builder } if (!endpoint.empty() && endpoint != "NULL") { ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_endpoint"), KernelUtils::ToDeltaString(endpoint)); } else { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_endpoint"), KernelUtils::ToDeltaString("https://" + account_name + ".blob.core.windows.net/")); //needed? Does that work with dfs files system? - } - if (!chain.empty() && chain != "NULL") { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("provider"), KernelUtils::ToDeltaString("credential_chain")); //needed for duckdb azure plugin - - if (chain.find("cli") != std::string::npos) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_azure_cli"), KernelUtils::ToDeltaString("true")); //dont know if that is the right way, but we need to tell delta RS builder to authenticate with azure cli - } - - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_credential_chain"), KernelUtils::ToDeltaString(chain)); //needed for duckdb azure plugin, dont know if all three are necessary - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("credential_chain"), KernelUtils::ToDeltaString(chain)); //needed for duckdb azure plugin, dont know if all three are necessary - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("chain"), KernelUtils::ToDeltaString(chain)); //needed for duckdb azure plugin, dont know if all three are necessary + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_endpoint"), KernelUtils::ToDeltaString("https://" + account_name + ".blob.core.windows.net/")); } - if (!client_id.empty() && client_id != "NULL") { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_id"), KernelUtils::ToDeltaString(client_id)); //untested - } - if (!client_secret.empty() && client_secret != "NULL") { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_secret"), KernelUtils::ToDeltaString(client_secret)); //untested - } - if (!tenant_id.empty() && tenant_id != "NULL") { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_tenant_id"), KernelUtils::ToDeltaString(tenant_id)); //needed for duckdb azure plugin - } - if (!certificate_path.empty() && certificate_path != "NULL") { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_certificate_path"), KernelUtils::ToDeltaString(certificate_path)); //untested - } - if (!http_proxy.empty() && http_proxy != "NULL") { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("http_proxy"), KernelUtils::ToDeltaString(http_proxy)); //untested - } - if (!proxy_user_name.empty() && proxy_user_name != "NULL") { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("proxy_user_name"), KernelUtils::ToDeltaString(proxy_user_name)); //untested - } - if (!proxy_password.empty() && proxy_password != "NULL") { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("proxy_password"), KernelUtils::ToDeltaString(proxy_password)); //untested - } - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("container_name"), KernelUtils::ToDeltaString(bucket)); // needed ? - + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("container_name"), KernelUtils::ToDeltaString(bucket)); } return builder; } diff --git a/test/sql/cloud/azure/cli_auth.test b/test/sql/cloud/azure/cli_auth.test new file mode 100644 index 0000000..fffa36a --- /dev/null +++ b/test/sql/cloud/azure/cli_auth.test @@ -0,0 +1,37 @@ +# name: test/sql/cloud/basic.test +# description: confirm queried data is correct +# group: [azure] + +require azure + +require parquet + +require delta + +require-env DUCKDB_AZ_CLI_LOGGED_IN + +require-env AZURE_STORAGE_ACCOUNT + +statement ok +set allow_persistent_secrets=false + +statement ok +CREATE SECRET az1 ( + TYPE AZURE, + PROVIDER CREDENTIAL_CHAIN, + CHAIN 'cli', + ACCOUNT_NAME '${AZURE_STORAGE_ACCOUNT}' +) + +mode output_result + +# Run a remote DAT test +query I rowsort all_primitive_types +SELECT * +FROM delta_scan('azure://delta-testing-private/dat/all_primitive_types/delta') +---- + +query I rowsort all_primitive_types +SELECT * +FROM parquet_scan('azure://delta-testing-private/dat/all_primitive_types/expected/latest/**/*.parquet') +---- diff --git a/test/sql/cloud/azure/hierarchical_namespace.test b/test/sql/cloud/azure/hierarchical_namespace.test new file mode 100644 index 0000000..470a325 --- /dev/null +++ b/test/sql/cloud/azure/hierarchical_namespace.test @@ -0,0 +1,42 @@ +# name: test/sql/hierarchical_namespace.test +# description: test azure extension with ADLS GEN2 storage +# group: [azure] + +# Require statement will ensure this test is run with this extension loaded +require azure + +require parquet + +require delta + +require-env AZURE_TENANT_ID + +require-env AZURE_CLIENT_ID + +require-env AZURE_CLIENT_SECRET + +require-env AZURE_STORAGE_ACCOUNT + +statement ok +set allow_persistent_secrets=false + +statement ok +CREATE SECRET spn ( + TYPE AZURE, + PROVIDER SERVICE_PRINCIPAL, + TENANT_ID '${AZURE_TENANT_ID}', + CLIENT_ID '${AZURE_CLIENT_ID}', + CLIENT_SECRET '${AZURE_CLIENT_SECRET}', + ACCOUNT_NAME '${AZURE_STORAGE_ACCOUNT}' +); + +# Run a remote DAT test on abfss +query I +SELECT int32 +FROM delta_scan('abfss://delta-testing-private/dat/all_primitive_types/delta') +---- +0 +1 +2 +3 +4 diff --git a/test/sql/cloud/azure/spn_auth.test b/test/sql/cloud/azure/spn_auth.test new file mode 100644 index 0000000..11ed035 --- /dev/null +++ b/test/sql/cloud/azure/spn_auth.test @@ -0,0 +1,38 @@ +# name: test/sql/cloud/spn_auth.test +# description: test azure extension with service principal authentication +# group: [azure] + +require azure + +require parquet + +require delta + +require-env AZURE_CLIENT_ID + +require-env AZURE_CLIENT_SECRET + +require-env AZURE_TENANT_ID + +require-env AZURE_STORAGE_ACCOUNT + +statement ok +CREATE SECRET spn ( + TYPE AZURE, + PROVIDER SERVICE_PRINCIPAL, + TENANT_ID '${AZURE_TENANT_ID}', + CLIENT_ID '${AZURE_CLIENT_ID}', + CLIENT_SECRET '${AZURE_CLIENT_SECRET}', + ACCOUNT_NAME '${AZURE_STORAGE_ACCOUNT}' +); + +# Run a remote DAT test +query I rowsort all_primitive_types +SELECT * +FROM delta_scan('azure://delta-testing-private/dat/all_primitive_types/delta') +---- + +query I rowsort all_primitive_types +SELECT * +FROM parquet_scan('azure://delta-testing-private/dat/all_primitive_types/expected/latest/**/*.parquet') +---- diff --git a/test/sql/cloud/azure/unauthenticated.test b/test/sql/cloud/azure/unauthenticated.test new file mode 100644 index 0000000..84c1f5f --- /dev/null +++ b/test/sql/cloud/azure/unauthenticated.test @@ -0,0 +1,47 @@ +# name: test/sql/cloud/unauthenticated.test +# description: test unauthenticated queries +# group: [azure] + +require azure + +require parquet + +require delta + +require-env DUCKDB_AZURE_PUBLIC_CONTAINER_AVAILABLE + +require-env AZURE_STORAGE_ACCOUNT + +statement ok +set allow_persistent_secrets=false + +# TODO: this doesn't work yet +mode skip + +query I +SELECT int32 +FROM delta_scan('azure://${AZURE_STORAGE_ACCOUNT}.blob.core.windows.net/dat/all_primitive_types/delta') +---- +0 +1 +2 +3 +4 + +mode unskip + +# Using a secret to set the account name, we can omit the fully qualified url +statement ok +CREATE SECRET s1 (TYPE AZURE, ACCOUNT_NAME '${AZURE_STORAGE_ACCOUNT}') + +query I +SELECT int32 +FROM delta_scan('azure://delta-testing-public/dat/all_primitive_types/delta') +---- +0 +1 +2 +3 +4 + + diff --git a/test/sql/cloud/azurite/azurite.test b/test/sql/cloud/azurite/azurite.test new file mode 100644 index 0000000..169615b --- /dev/null +++ b/test/sql/cloud/azurite/azurite.test @@ -0,0 +1,31 @@ +# name: test/sql/cloud/azurite/azurite.test +# description: test with azurite test server +# group: [azure] + +# Require statement will ensure this test is run with this extension loaded +require azure + +require parquet + +require delta + +require-env AZURE_STORAGE_CONNECTION_STRING + +# Set connection string from env var +statement ok +CREATE SECRET (TYPE AZURE, CONNECTION_STRING '${AZURE_STORAGE_CONNECTION_STRING}'); + +# We need a connection string to do requests +foreach prefix azure:// az:// + +query I +SELECT int32 +FROM delta_scan('${prefix}delta-testing-private/dat/all_primitive_types/delta') +---- +0 +1 +2 +3 +4 + +endloop From d33690e61901336f9e586008a1db7f3fdc1f6d6f Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Thu, 27 Jun 2024 14:34:00 +0200 Subject: [PATCH 08/13] add rust to local test --- .github/workflows/LocalTesting.yml | 3 +++ test/sql/generated/azure.emulator.test | 25 ------------------- .../azure_emulator_with_partitions.test | 25 ------------------- 3 files changed, 3 insertions(+), 50 deletions(-) delete mode 100644 test/sql/generated/azure.emulator.test delete mode 100644 test/sql/generated/azure_emulator_with_partitions.test diff --git a/.github/workflows/LocalTesting.yml b/.github/workflows/LocalTesting.yml index 23b31ac..a08f0c3 100644 --- a/.github/workflows/LocalTesting.yml +++ b/.github/workflows/LocalTesting.yml @@ -41,6 +41,9 @@ jobs: with: vcpkgGitCommitId: a1a1cbc975abf909a6c8985a6a2b8fe20bbd9bd6 + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + # Build extension - name: Build extension env: diff --git a/test/sql/generated/azure.emulator.test b/test/sql/generated/azure.emulator.test deleted file mode 100644 index 86b67ec..0000000 --- a/test/sql/generated/azure.emulator.test +++ /dev/null @@ -1,25 +0,0 @@ -# name: test/sql/generated/azure.emulator -# description: test delta scan on azure emulator data using secret -# group: [delta_generated] - -require parquet - -require httpfs - -require azure - -require delta - -require-env GENERATED_AZURE_DATA_AVAILABLE - -statement ok -CREATE SECRET azure_1 (TYPE AZURE, CONNECTION_STRING 'AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1') - -# Run modified tpch q06 against the remote data -query I rowsort q1 -SELECT - * -FROM - delta_scan('az://test-bucket-ceiveran/delta_testing/lineitem_sf0_01/delta_lake/') -LIMIT 100 ----- \ No newline at end of file diff --git a/test/sql/generated/azure_emulator_with_partitions.test b/test/sql/generated/azure_emulator_with_partitions.test deleted file mode 100644 index 78946b4..0000000 --- a/test/sql/generated/azure_emulator_with_partitions.test +++ /dev/null @@ -1,25 +0,0 @@ -# name: test/sql/generated/azure.emulator -# description: test delta scan on azure emulator data using secret -# group: [delta_generated] - -require parquet - -require httpfs - -require azure - -require delta - -require-env GENERATED_AZURE_DATA_AVAILABLE - -statement ok -CREATE SECRET azure_1 (TYPE AZURE, CONNECTION_STRING 'AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;DefaultEndpointsProtocol=http;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;TableEndpoint=http://127.0.0.1:10002/devstoreaccount1') - -# Run modified tpch q06 against the remote data -query I rowsort q1 -SELECT - * -FROM - delta_scan('az://test-bucket-ceiveran/delta_testing/simple_partitioned/delta_lake/') -where part=1 ----- \ No newline at end of file From 06798f3a9ad8369e8401fbc54dd8328a1addc3b0 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Thu, 27 Jun 2024 14:42:32 +0200 Subject: [PATCH 09/13] small refactor --- .github/workflows/GeneratedTests.yml | 53 ---------------------------- .github/workflows/LocalTesting.yml | 51 +++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 57 deletions(-) delete mode 100644 .github/workflows/GeneratedTests.yml diff --git a/.github/workflows/GeneratedTests.yml b/.github/workflows/GeneratedTests.yml deleted file mode 100644 index bd106a5..0000000 --- a/.github/workflows/GeneratedTests.yml +++ /dev/null @@ -1,53 +0,0 @@ -# -# This workflow calls the main distribution pipeline from DuckDB to build, test and (optionally) release the extension -# -name: GeneratedTests -on: - push: - pull_request: - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }} - cancel-in-progress: true - -jobs: - generated-tests-linux: - name: Generated Tests (Linux) - runs-on: ubuntu-latest - env: - GEN: ninja - VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - submodules: 'true' - - - name: Install - shell: bash - run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build - - - name: Setup Ccache - uses: hendrikmuhs/ccache-action@main - with: - key: ${{ github.job }} - - - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Setup vcpkg - uses: lukka/run-vcpkg@v11.1 - with: - vcpkgGitCommitId: a1a1cbc975abf909a6c8985a6a2b8fe20bbd9bd6 - - - name: Build - shell: bash - run: make generate-data - - - name: Test - shell: bash - run: | - GENERATED_DATA_AVAILABLE=1 make test \ No newline at end of file diff --git a/.github/workflows/LocalTesting.yml b/.github/workflows/LocalTesting.yml index a08f0c3..95a7a09 100644 --- a/.github/workflows/LocalTesting.yml +++ b/.github/workflows/LocalTesting.yml @@ -36,14 +36,17 @@ jobs: run: | ./duckdb/scripts/setup_manylinux2014.sh general aws-cli ccache ssh openssl python_alias + - name: Setup Rust for manylinux (dtolnay/rust-toolchain doesn't work due to curl being old here) + if: ${{ matrix.duckdb_arch == 'linux_amd64_gcc4' }} + run: | + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + echo "$HOME/.cargo/bin" >> $GITHUB_PATH + - name: Setup vcpkg uses: lukka/run-vcpkg@v11.1 with: vcpkgGitCommitId: a1a1cbc975abf909a6c8985a6a2b8fe20bbd9bd6 - - name: Setup Rust - uses: dtolnay/rust-toolchain@stable - # Build extension - name: Build extension env: @@ -66,4 +69,44 @@ jobs: shell: bash run: | echo "## azurite" - cat azurite_log.txt \ No newline at end of file + cat azurite_log.txt + + generated-tests-linux: + name: Generated Tests (Linux) + runs-on: ubuntu-latest + env: + GEN: ninja + VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + submodules: 'true' + + - name: Install + shell: bash + run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build + + - name: Setup Ccache + uses: hendrikmuhs/ccache-action@main + with: + key: ${{ github.job }} + + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Setup vcpkg + uses: lukka/run-vcpkg@v11.1 + with: + vcpkgGitCommitId: a1a1cbc975abf909a6c8985a6a2b8fe20bbd9bd6 + + - name: Build + shell: bash + run: make generate-data + + - name: Test + shell: bash + run: | + GENERATED_DATA_AVAILABLE=1 make test \ No newline at end of file From 7c296837d93b8c21780d98ba287fe2a1e5361e7c Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Thu, 27 Jun 2024 15:33:46 +0200 Subject: [PATCH 10/13] add missing openssl dep --- .github/workflows/LocalTesting.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/LocalTesting.yml b/.github/workflows/LocalTesting.yml index 95a7a09..d74c7dd 100644 --- a/.github/workflows/LocalTesting.yml +++ b/.github/workflows/LocalTesting.yml @@ -34,7 +34,7 @@ jobs: - name: Setup ManyLinux2014 run: | - ./duckdb/scripts/setup_manylinux2014.sh general aws-cli ccache ssh openssl python_alias + ./duckdb/scripts/setup_manylinux2014.sh general aws-cli ccache ssh python_alias openssl - name: Setup Rust for manylinux (dtolnay/rust-toolchain doesn't work due to curl being old here) if: ${{ matrix.duckdb_arch == 'linux_amd64_gcc4' }} @@ -47,6 +47,17 @@ jobs: with: vcpkgGitCommitId: a1a1cbc975abf909a6c8985a6a2b8fe20bbd9bd6 + - name: Handle OpenSSL dependency for rust build + run: | + echo "OPENSSL_ROOT_DIR=`pwd`/build/release/vcpkg_installed/${{ matrix.vcpkg_triplet }}" >> $GITHUB_ENV + echo "OPENSSL_DIR=`pwd`/build/release/vcpkg_installed/${{ matrix.vcpkg_triplet }}" >> $GITHUB_ENV + echo "OPENSSL_USE_STATIC_LIBS=true" >> $GITHUB_ENV + + - name: Set Openssl dir + if: inputs.openssl_path != '' + shell: bash + run: | + # Build extension - name: Build extension env: From 20ad323e144e09d421bc35dd78fec04cdf293974 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Thu, 27 Jun 2024 17:38:56 +0200 Subject: [PATCH 11/13] correct openssl path --- .github/workflows/LocalTesting.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/LocalTesting.yml b/.github/workflows/LocalTesting.yml index d74c7dd..0b2ea85 100644 --- a/.github/workflows/LocalTesting.yml +++ b/.github/workflows/LocalTesting.yml @@ -49,15 +49,10 @@ jobs: - name: Handle OpenSSL dependency for rust build run: | - echo "OPENSSL_ROOT_DIR=`pwd`/build/release/vcpkg_installed/${{ matrix.vcpkg_triplet }}" >> $GITHUB_ENV - echo "OPENSSL_DIR=`pwd`/build/release/vcpkg_installed/${{ matrix.vcpkg_triplet }}" >> $GITHUB_ENV + echo "OPENSSL_ROOT_DIR=`pwd`/build/release/vcpkg_installed/x64-linux" >> $GITHUB_ENV + echo "OPENSSL_DIR=`pwd`/build/release/vcpkg_installed/x64-linux" >> $GITHUB_ENV echo "OPENSSL_USE_STATIC_LIBS=true" >> $GITHUB_ENV - - name: Set Openssl dir - if: inputs.openssl_path != '' - shell: bash - run: | - # Build extension - name: Build extension env: From 9b0b86fd343fc59d3d96dfaa8f1689f4cddc8caa Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Thu, 27 Jun 2024 18:16:28 +0200 Subject: [PATCH 12/13] actually run rust install --- .github/workflows/LocalTesting.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/LocalTesting.yml b/.github/workflows/LocalTesting.yml index 0b2ea85..ecdc23c 100644 --- a/.github/workflows/LocalTesting.yml +++ b/.github/workflows/LocalTesting.yml @@ -37,7 +37,6 @@ jobs: ./duckdb/scripts/setup_manylinux2014.sh general aws-cli ccache ssh python_alias openssl - name: Setup Rust for manylinux (dtolnay/rust-toolchain doesn't work due to curl being old here) - if: ${{ matrix.duckdb_arch == 'linux_amd64_gcc4' }} run: | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y echo "$HOME/.cargo/bin" >> $GITHUB_PATH From d4454da51b22320bd7b0fa53539e46b95b5e39cf Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Fri, 28 Jun 2024 09:35:27 +0200 Subject: [PATCH 13/13] fix upload script --- scripts/upload_test_files_to_azurite.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/upload_test_files_to_azurite.sh b/scripts/upload_test_files_to_azurite.sh index f1ae34e..f3631ba 100755 --- a/scripts/upload_test_files_to_azurite.sh +++ b/scripts/upload_test_files_to_azurite.sh @@ -14,7 +14,7 @@ copy_file() { az storage blob upload --file "${from}" --name "${to}" --container-name "delta-testing-public" --connection-string "${conn_string}" } -cd ./build/debug/rust/src/delta_kernel/acceptance/tests/dat/out/reader_tests/generated +cd ./build/release/rust/src/delta_kernel/acceptance/tests/dat/out/reader_tests/generated && while read filepath; do remote_filepath=dat/"$(echo "${filepath}" | cut -c 3-)" copy_file "${filepath}" "${remote_filepath}"