From f2c97d85313bb4a8bb1cd9cdd4c0e0e8d8d18cc6 Mon Sep 17 00:00:00 2001 From: Gareth Western Date: Sun, 18 Aug 2024 23:45:52 +0200 Subject: [PATCH] fix: update url parser to include alternative abfss syntax --- src/azure_parsed_url.cpp | 46 +++++++++++++++------- test/sql/cloud/hierarchical_namespace.test | 6 +++ 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/azure_parsed_url.cpp b/src/azure_parsed_url.cpp index 83e114e..2eb2be0 100644 --- a/src/azure_parsed_url.cpp +++ b/src/azure_parsed_url.cpp @@ -7,7 +7,8 @@ namespace duckdb { AzureParsedUrl ParseUrl(const std::string &url) { constexpr auto invalid_url_format = "The URL %s does not match the expected formats: (azure|az):///[] or the fully qualified one: " - "(azure|az)://.//[]"; + "(abfss|azure|az)://.//[] " + "or abfss://@./[]"; bool is_fully_qualified; std::string container, storage_account_name, endpoint, prefix, path; @@ -22,26 +23,41 @@ AzureParsedUrl ParseUrl(const std::string &url) { // they will be no more changes to path format. const auto dot_pos = url.find('.', prefix_end_pos); const auto slash_pos = url.find('/', prefix_end_pos); + const auto at_pos = url.find('@', prefix_end_pos); if (slash_pos == std::string::npos) { throw duckdb::IOException(invalid_url_format, url); } if (dot_pos != std::string::npos && dot_pos < slash_pos) { - // syntax is (azure|az)://.//[] - const auto container_slash_pos = url.find('/', dot_pos); - if (container_slash_pos == string::npos) { - throw IOException(invalid_url_format, url); - } - const auto path_slash_pos = url.find('/', container_slash_pos + 1); - if (path_slash_pos == string::npos) { - throw IOException(invalid_url_format, url); - } - is_fully_qualified = true; - storage_account_name = url.substr(prefix_end_pos, dot_pos - prefix_end_pos); - endpoint = url.substr(dot_pos + 1, container_slash_pos - dot_pos - 1); - container = url.substr(container_slash_pos + 1, path_slash_pos - container_slash_pos - 1); - path = url.substr(path_slash_pos + 1); + + if (url.rfind(AzureDfsStorageFileSystem::PATH_PREFIX, 0) == 0 && + at_pos != std::string::npos) { + // syntax is abfss://@./[] + const auto path_slash_pos = url.find('/', prefix_end_pos + 1); + if (path_slash_pos == string::npos) { + throw IOException(invalid_url_format, url); + } + + container = url.substr(prefix_end_pos, at_pos - prefix_end_pos); + storage_account_name = url.substr(at_pos + 1, dot_pos - at_pos - 1); + endpoint = url.substr(dot_pos + 1, path_slash_pos - dot_pos - 1); + path = url.substr(path_slash_pos + 1); + } else { + // syntax is (abfss|azure|az)://.//[] + const auto container_slash_pos = url.find('/', dot_pos); + if (container_slash_pos == string::npos) { + throw IOException(invalid_url_format, url); + } + const auto path_slash_pos = url.find('/', container_slash_pos + 1); + if (path_slash_pos == string::npos) { + throw IOException(invalid_url_format, url); + } + storage_account_name = url.substr(prefix_end_pos, dot_pos - prefix_end_pos); + endpoint = url.substr(dot_pos + 1, container_slash_pos - dot_pos - 1); + container = url.substr(container_slash_pos + 1, path_slash_pos - container_slash_pos - 1); + path = url.substr(path_slash_pos + 1); + } } else { // syntax is (azure|az):///[] // Storage account name will be retrieve from the variables or the secret information diff --git a/test/sql/cloud/hierarchical_namespace.test b/test/sql/cloud/hierarchical_namespace.test index 279e0fb..a2d3dac 100644 --- a/test/sql/cloud/hierarchical_namespace.test +++ b/test/sql/cloud/hierarchical_namespace.test @@ -69,6 +69,12 @@ SELECT count(*) FROM 'abfss://${AZURE_STORAGE_ACCOUNT}.dfs.core.windows.net/test ---- 2317 +# Check fully qualified name abfss alternative syntax +query I +SELECT count(*) FROM 'abfss://testing-private@${AZURE_STORAGE_ACCOUNT}.dfs.core.windows.net/partitioned/l_receipmonth=*/l_shipmode=TRUCK/*.csv'; +---- +2317 + # Enable http info for the explain analyze statement statement ok SET azure_http_stats = true;