From f274a2e73e92f3ce84ff7c20682d50776cb89bf6 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 20:45:45 +0000 Subject: [PATCH 01/26] Add Path and NetworkAuth structs which describe how to access a resource including network auth --- components/core/src/clp_s/InputConfig.cpp | 88 ++++++++++++++++++ components/core/src/clp_s/InputConfig.hpp | 105 ++++++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 components/core/src/clp_s/InputConfig.cpp create mode 100644 components/core/src/clp_s/InputConfig.hpp diff --git a/components/core/src/clp_s/InputConfig.cpp b/components/core/src/clp_s/InputConfig.cpp new file mode 100644 index 000000000..260caefd4 --- /dev/null +++ b/components/core/src/clp_s/InputConfig.cpp @@ -0,0 +1,88 @@ +#include "InputConfig.hpp" + +#include +#include +#include +#include + +#include "Utils.hpp" + +namespace clp_s { +auto get_source_for_path(std::string_view const path) -> InputSource { + try { + return std::filesystem::exists(path) ? InputSource::Filesystem : InputSource::Network; + } catch (std::exception const& e) { + return InputSource::Network; + } +} + +auto get_path_object_for_raw_path(std::string_view const path) -> Path { + return Path{.source{get_source_for_path(path)}, .path{std::string{path}}}; +} + +auto get_input_files_for_raw_path(std::string_view const path, std::vector& files) -> bool { + return get_input_files_for_path(get_path_object_for_raw_path(path), files); +} + +auto get_input_files_for_path(Path const& path, std::vector& files) -> bool { + if (InputSource::Network == path.source) { + files.emplace_back(path); + return true; + } + + if (false == std::filesystem::is_directory(path.path)) { + files.emplace_back(path); + return true; + } + + std::vector file_paths; + if (false == FileUtils::find_all_files_in_directory(path.path, file_paths)) { + return false; + } + + for (auto& file : file_paths) { + files.emplace_back(Path{.source{InputSource::Filesystem}, .path{std::move(file)}}); + } + return true; +} + +auto get_input_archives_for_raw_path(std::string_view const path, std::vector& archives) + -> bool { + return get_input_archives_for_path(get_path_object_for_raw_path(path), archives); +} + +auto get_input_archives_for_path(Path const& path, std::vector& archives) -> bool { + if (InputSource::Network == path.source) { + archives.emplace_back(path); + return true; + } + + if (false == std::filesystem::is_directory(path.path)) { + archives.emplace_back(path); + return true; + } + + std::vector archive_paths; + if (false == FileUtils::find_all_archives_in_directory(path.path, archive_paths)) { + return false; + } + + for (auto& archive : archive_paths) { + archives.emplace_back(Path{.source{InputSource::Filesystem}, .path{std::move(archive)}}); + } + return true; +} + +auto get_archive_id_from_path(Path const& archive_path, std::string& archive_id) -> bool { + switch (archive_path.source) { + case InputSource::Network: + return UriUtils::get_last_uri_component(archive_path.path, archive_id); + case InputSource::Filesystem: + return FileUtils::get_last_non_empty_path_component(archive_path.path, archive_id); + default: + return false; + } + return true; +} + +} // namespace clp_s diff --git a/components/core/src/clp_s/InputConfig.hpp b/components/core/src/clp_s/InputConfig.hpp new file mode 100644 index 000000000..c63acb3ba --- /dev/null +++ b/components/core/src/clp_s/InputConfig.hpp @@ -0,0 +1,105 @@ +#ifndef CLP_S_INPUTCONFIG_HPP +#define CLP_S_INPUTCONFIG_HPP + +#include +#include +#include +#include +#include + +namespace clp_s { +// Constants used for input configuration +constexpr char cAwsAccessKeyIdEnvVar[] = "AWS_ACCESS_KEY_ID"; +constexpr char cAwsSecretAccessKeyEnvVar[] = "AWS_SECRET_ACCESS_KEY"; + +/** + * Enum class definining the source of some resource. + */ +enum class InputSource : uint8_t { + Filesystem, + Network +}; + +/** + * Enum class defining the auth that needs to be performed to access some resource. + */ +enum class AuthMethod : uint8_t { + None, + S3PresignedUrlV4 +}; + +/** + * Struct encapsulating information needed to authenticate network requests. + */ +struct NetworkAuthOption { + AuthMethod method{AuthMethod::None}; +}; + +/** + * Struct describing a path to some resource as well as its source. + */ +struct Path { + InputSource source{InputSource::Filesystem}; + std::string path; +}; + +/** + * Determines the input source for a given raw path or url. + * @param path + * @return the InputSource for the given path + */ +[[nodiscard]] auto get_source_for_path(std::string_view const path) -> InputSource; + +/** + * Determines the input source for a given raw path or url and converts the path into a Path object. + * @param path + * @return a Path object representing the raw path or url + */ +[[nodiscard]] auto get_path_object_for_raw_path(std::string_view const path) -> Path; + +/** + * Recursively records all file paths from the given raw path, including the path itself. + * @param path + * @param files Returned paths + * @return true on success, false otherwise + */ +auto get_input_files_for_raw_path(std::string_view const path, std::vector& files) -> bool; + +/** + * Recursively records all file paths that are children of the the given Path, including the Path + * itself. + * @param option + * @param files Returned paths + * @return true on success, false otherwise + */ +[[nodiscard]] auto get_input_files_for_path(Path const& path, std::vector& files) -> bool; + +/** + * Records all archives that are children of the given raw path, including the path itself. + * @param path + * @param archives Returned archives + * @return true on success, false otherwise + */ +[[nodiscard]] auto +get_input_archives_for_raw_path(std::string_view const path, std::vector& archives) -> bool; + +/** + * Records all archives from the given Path, including the Path itself. + * @param path + * @param archives Returned archives + * @return true on success, false otherwise + */ +[[nodiscard]] auto +get_input_archives_for_path(Path const& path, std::vector& archives) -> bool; + +/** + * Determines the archive id of a given archive based on a path to that archive. + * @param path + * @param archive_id Returned archive id + * @return true on success, false otherwise + */ +[[nodiscard]] auto +get_archive_id_from_path(Path const& archive_path, std::string& archive_id) -> bool; +} // namespace clp_s + +#endif // CLP_S_INPUTCONFIG_HPP From d0807c551ebd64510017eca5cc675c943f731bf6 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 20:46:10 +0000 Subject: [PATCH 02/26] Remove boost::filesystem usage from ZstdDecompressor --- components/core/src/clp_s/ZstdDecompressor.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/components/core/src/clp_s/ZstdDecompressor.cpp b/components/core/src/clp_s/ZstdDecompressor.cpp index 87d3ae8fa..c6c7f99e7 100644 --- a/components/core/src/clp_s/ZstdDecompressor.cpp +++ b/components/core/src/clp_s/ZstdDecompressor.cpp @@ -3,8 +3,9 @@ #include "ZstdDecompressor.hpp" #include +#include -#include +#include #include namespace clp_s { @@ -202,14 +203,13 @@ ErrorCode ZstdDecompressor::open(std::string const& compressed_file_path) { m_input_type = InputType::MemoryMappedCompressedFile; // Create memory mapping for compressed_file_path, use boost read only memory mapped file - boost::system::error_code boost_error_code; - size_t compressed_file_size - = boost::filesystem::file_size(compressed_file_path, boost_error_code); - if (boost_error_code) { + std::error_code error_code; + size_t compressed_file_size = std::filesystem::file_size(compressed_file_path, error_code); + if (error_code) { SPDLOG_ERROR( "ZstdDecompressor: Unable to obtain file size for '{}' - {}.", compressed_file_path.c_str(), - boost_error_code.message().c_str() + error_code.message().c_str() ); return ErrorCodeFailure; } From cfbfa9489d11f2b8340900617ac757d0c0c8afad Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 20:46:49 +0000 Subject: [PATCH 03/26] Add utility to create a reader for a resource given Path and NetworkAuth --- components/core/src/clp_s/ReaderUtils.cpp | 77 +++++++++++++++++++---- components/core/src/clp_s/ReaderUtils.hpp | 12 ++-- 2 files changed, 71 insertions(+), 18 deletions(-) diff --git a/components/core/src/clp_s/ReaderUtils.cpp b/components/core/src/clp_s/ReaderUtils.cpp index a2ab5a34a..01359b806 100644 --- a/components/core/src/clp_s/ReaderUtils.cpp +++ b/components/core/src/clp_s/ReaderUtils.cpp @@ -1,6 +1,14 @@ #include "ReaderUtils.hpp" +#include + +#include "../clp/aws/AwsAuthenticationSigner.hpp" +#include "../clp/FileReader.hpp" +#include "../clp/NetworkReader.hpp" +#include "../clp/ReaderInterface.hpp" +#include "../clp/spdlog_with_specializations.hpp" #include "archive_constants.hpp" +#include "Utils.hpp" namespace clp_s { std::shared_ptr ReaderUtils::read_schema_tree(std::string const& archives_dir) { @@ -142,22 +150,67 @@ std::shared_ptr ReaderUtils::read_schemas(std::string co return schemas_pointer; } -std::vector ReaderUtils::get_archives(std::string const& archives_dir) { - std::vector archive_paths; - - if (false == boost::filesystem::is_directory(archives_dir)) { - throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__); +namespace { +std::shared_ptr try_create_file_reader(std::string_view const file_path) { + try { + return std::make_shared(std::string{file_path}); + } catch (clp::FileReader::OperationFailed const& e) { + SPDLOG_ERROR("Failed to open file for reading - {} - {}", file_path, e.what()); + return nullptr; } +} - boost::filesystem::directory_iterator iter(archives_dir); - boost::filesystem::directory_iterator end; - for (; iter != end; ++iter) { - if (boost::filesystem::is_directory(iter->path())) { - archive_paths.push_back(iter->path().string()); +bool try_sign_url(std::string& url) { + clp::aws::AwsAuthenticationSigner signer{ + std::getenv(cAwsAccessKeyIdEnvVar), + std::getenv(cAwsSecretAccessKeyEnvVar) + }; + + try { + clp::aws::S3Url s3_url{url}; + if (auto const rc = signer.generate_presigned_url(s3_url, url); + clp::ErrorCode::ErrorCode_Success != rc) + { + return false; } + } catch (std::exception const& e) { + return false; } - - return archive_paths; + return true; } +std::shared_ptr +try_create_network_reader(std::string_view const url, NetworkAuthOption const& auth) { + std::string request_url{url}; + switch (auth.method) { + case AuthMethod::S3PresignedUrlV4: + if (false == try_sign_url(request_url)) { + return nullptr; + } + break; + case AuthMethod::None: + break; + default: + return nullptr; + } + + try { + return std::make_shared(request_url); + } catch (clp::NetworkReader::OperationFailed const& e) { + SPDLOG_ERROR("Failed to open url for reading - {}", e.what()); + return nullptr; + } +} +} // namespace + +std::shared_ptr +ReaderUtils::try_create_reader(Path const& path, NetworkAuthOption const& network_auth) { + if (InputSource::Filesystem == path.source) { + return try_create_file_reader(path.path); + } else if (InputSource::Network == path.source) { + return try_create_network_reader(path.path, network_auth); + } else { + return nullptr; + } +} } // namespace clp_s diff --git a/components/core/src/clp_s/ReaderUtils.hpp b/components/core/src/clp_s/ReaderUtils.hpp index caa509d6a..68d1c6e2c 100644 --- a/components/core/src/clp_s/ReaderUtils.hpp +++ b/components/core/src/clp_s/ReaderUtils.hpp @@ -1,7 +1,11 @@ #ifndef CLP_S_READERUTILS_HPP #define CLP_S_READERUTILS_HPP +#include + +#include "../clp/ReaderInterface.hpp" #include "DictionaryReader.hpp" +#include "InputConfig.hpp" #include "Schema.hpp" #include "SchemaReader.hpp" #include "SchemaTree.hpp" @@ -66,12 +70,8 @@ class ReaderUtils { std::string const& archive_path ); - /** - * Gets the list of archives in the given archive directory - * @param archives_dir - * @return the list of archives - */ - static std::vector get_archives(std::string const& archives_dir); + static std::shared_ptr + try_create_reader(Path const& path, NetworkAuthOption const& network_auth); private: /** From 97f1fe7f539ef6e3a4d57d580374ae66c7e4ec0c Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 20:47:45 +0000 Subject: [PATCH 04/26] Add utilities for finding all files or archives in a directory, and add utilities for finding archive Id given a Path --- components/core/src/clp_s/Utils.cpp | 135 ++++++++++++++++++++++++---- components/core/src/clp_s/Utils.hpp | 48 ++++++++-- 2 files changed, 159 insertions(+), 24 deletions(-) diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp index acee48851..41b8d24e1 100644 --- a/components/core/src/clp_s/Utils.cpp +++ b/components/core/src/clp_s/Utils.cpp @@ -1,42 +1,53 @@ #include "Utils.hpp" -#include +#include +#include +#include + +#include +#include #include +#include "archive_constants.hpp" +#include "Utils.hpp" + using std::string; using std::string_view; namespace clp_s { -bool FileUtils::find_all_files(std::string const& path, std::vector& file_paths) { +bool FileUtils::find_all_files_in_directory( + std::string const& path, + std::vector& file_paths +) { try { - if (false == boost::filesystem::is_directory(path)) { + if (false == std::filesystem::is_directory(path)) { // path is a file file_paths.push_back(path); return true; } - if (boost::filesystem::is_empty(path)) { + if (std::filesystem::is_empty(path)) { // path is an empty directory return true; } // Iterate directory - boost::filesystem::recursive_directory_iterator iter( + std::filesystem::recursive_directory_iterator iter( path, - boost::filesystem::directory_options::follow_directory_symlink + std::filesystem::directory_options::follow_directory_symlink ); - boost::filesystem::recursive_directory_iterator end; + std::filesystem::recursive_directory_iterator end; for (; iter != end; ++iter) { // Check if current entry is an empty directory or a file - if (boost::filesystem::is_directory(iter->path())) { - if (boost::filesystem::is_empty(iter->path())) { + if (std::filesystem::is_directory(iter->path())) { + if (std::filesystem::is_empty(iter->path())) { iter.disable_recursion_pending(); } } else { file_paths.push_back(iter->path().string()); } } - } catch (boost::filesystem::filesystem_error& exception) { + } catch (std::exception const& exception) { SPDLOG_ERROR( "Failed to find files/directories at '{}' - {}.", path.c_str(), @@ -48,16 +59,106 @@ bool FileUtils::find_all_files(std::string const& path, std::vector return true; } -bool FileUtils::validate_path(std::vector const& paths) { - bool all_paths_exist = true; - for (auto const& path : paths) { - if (false == boost::filesystem::exists(path)) { - SPDLOG_ERROR("'{}' does not exist.", path.c_str()); - all_paths_exist = false; +namespace { +/** + * Determines if a directory is a multi-file archive. + * @param path + * @return true if this directory is a multi-file archive, false otherwise + */ +bool directory_is_multi_file_archive(std::string_view const path) { + for (auto const& entry : std::filesystem::directory_iterator{path}) { + if (entry.is_directory()) { + return false; + } + + std::string file_name; + if (false == FileUtils::get_last_non_empty_path_component(entry.path().string(), file_name)) + { + return false; + } + auto formatted_name = fmt::format("/{}", file_name); + if (constants::cArchiveTimestampDictFile == formatted_name + || constants::cArchiveSchemaTreeFile == formatted_name + || constants::cArchiveSchemaMapFile == formatted_name + || constants::cArchiveVarDictFile == formatted_name + || constants::cArchiveLogDictFile == formatted_name + || constants::cArchiveArrayDictFile == formatted_name + || constants::cArchiveTableMetadataFile == formatted_name + || constants::cArchiveTablesFile == formatted_name) + { + continue; + } else { + try { + auto segment_file_number = std::stoi(file_name); + continue; + } catch (std::exception const& e) { + return false; + } + } + } + return true; +} +} // namespace + +bool FileUtils::find_all_archives_in_directory( + std::string_view const path, + std::vector& archive_paths +) { + try { + if (false == std::filesystem::is_directory(path)) { + return false; + } + } catch (std::exception const& e) { + return false; + } + + if (directory_is_multi_file_archive(path)) { + archive_paths.emplace_back(path); + return true; + } + + for (auto const& entry : std::filesystem::directory_iterator{path}) { + archive_paths.emplace_back(entry.path().string()); + } + return true; +} + +bool FileUtils::get_last_non_empty_path_component(std::string_view const path, std::string& name) { + std::filesystem::path fs_path; + try { + fs_path = std::filesystem::path{path}.lexically_normal(); + } catch (std::exception const& e) { + return false; + } + + if (fs_path.has_filename() && false == fs_path.filename().string().empty()) { + name = fs_path.filename().string(); + return true; + } + + while (fs_path.has_parent_path()) { + fs_path = fs_path.parent_path(); + if (fs_path.has_filename() && false == fs_path.filename().string().empty()) { + name = fs_path.filename().string(); + return true; } } - return all_paths_exist; + return false; +} + +bool UriUtils::get_last_uri_component(std::string_view const uri, std::string& name) { + auto parsed_result = boost::urls::parse_uri(uri); + if (false == parsed_result.has_value()) { + return false; + } + auto parsed_uri = parsed_result.value(); + auto path_segments_view = parsed_uri.segments(); + if (path_segments_view.empty()) { + return false; + } + name = path_segments_view.back(); + return true; } bool StringUtils::get_bounds_of_next_var(string const& msg, size_t& begin_pos, size_t& end_pos) { diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index 553f7e608..f57107ecf 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -5,8 +5,8 @@ #include #include #include - -#include +#include +#include namespace clp_s { class FileUtils { @@ -17,14 +17,48 @@ class FileUtils { * @param file_paths * @return true if successful, false otherwise */ - static bool find_all_files(std::string const& path, std::vector& file_paths); + static bool + find_all_files_in_directory(std::string const& path, std::vector& file_paths); /** - * Validate if all paths exist - * @param paths - * @return true if all paths exist, false otherwise + * Find all archives in a directory, including the directory itself + * @param path + * @param archive_paths + * @return true if successful, false otherwise + */ + static bool find_all_archives_in_directory( + std::string_view const path, + std::vector& archive_paths + ); + + /** + * Gets the last non-empty component of a path, accounting for trailing forward slashes. + * + * For example: + * ./foo/bar.baz -> bar.baz + * ./foo/bar.baz/ -> bar.baz + * + * @param path + * @param name Returned component name + * @return true on success, false otherwise + */ + static bool get_last_non_empty_path_component(std::string_view const path, std::string& name); +}; + +class UriUtils { +public: + /** + * Gets the last component of a uri. + * + * For example: + * https://www.something.org/abc-xyz -> abc-xyz + * https://www.something.org/aaa/bbb/abc-xyz?something=something -> abc-xyz + * + * @param uri + * @param name Returned component name + * @return true on success, false otherwise */ - static bool validate_path(std::vector const& paths); + static bool get_last_uri_component(std::string_view const uri, std::string& name); }; class StringUtils { From b33e3ae07f9e90949f4a8178e88490c563a574f1 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 20:48:28 +0000 Subject: [PATCH 05/26] Update CMakeLists to pull in boost::urls --- components/core/CMakeLists.txt | 4 +++- components/core/src/clp_s/CMakeLists.txt | 27 +++++++++++++++++++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index ce74f04cc..0f9473f87 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -257,6 +257,8 @@ set(SOURCE_FILES_clp_s_unitTest src/clp_s/FileReader.hpp src/clp_s/FileWriter.cpp src/clp_s/FileWriter.hpp + src/clp_s/InputConfig.cpp + src/clp_s/InputConfig.hpp src/clp_s/JsonConstructor.cpp src/clp_s/JsonConstructor.hpp src/clp_s/JsonFileIterator.cpp @@ -592,7 +594,7 @@ target_include_directories(unitTest target_link_libraries(unitTest PRIVATE absl::flat_hash_map - Boost::filesystem Boost::iostreams Boost::program_options Boost::regex + Boost::filesystem Boost::iostreams Boost::program_options Boost::regex Boost::url ${CURL_LIBRARIES} fmt::fmt kql diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index 1656a5d59..8384e518e 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -2,30 +2,51 @@ add_subdirectory(search/kql) set( CLP_SOURCES + ../clp/aws/AwsAuthenticationSigner.cpp + ../clp/aws/AwsAuthenticationSigner.hpp + ../clp/BoundedReader.cpp + ../clp/BoundedReader.hpp + ../clp/CurlDownloadHandler.cpp + ../clp/CurlDownloadHandler.hpp + ../clp/CurlEasyHandle.hpp + ../clp/CurlGlobalInstance.cpp + ../clp/CurlGlobalInstance.hpp + ../clp/CurlOperationFailed.hpp + ../clp/CurlStringList.hpp ../clp/cli_utils.cpp ../clp/cli_utils.hpp ../clp/database_utils.cpp ../clp/database_utils.hpp ../clp/Defs.h ../clp/ErrorCode.hpp + ../clp/FileReader.cpp + ../clp/FileReader.hpp ../clp/GlobalMetadataDB.hpp ../clp/GlobalMetadataDBConfig.cpp ../clp/GlobalMetadataDBConfig.hpp ../clp/GlobalMySQLMetadataDB.cpp ../clp/GlobalMySQLMetadataDB.hpp + ../clp/hash_utils.cpp + ../clp/hash_utils.hpp ../clp/MySQLDB.cpp ../clp/MySQLDB.hpp ../clp/MySQLParamBindings.cpp ../clp/MySQLParamBindings.hpp ../clp/MySQLPreparedStatement.cpp ../clp/MySQLPreparedStatement.hpp + ../clp/NetworkReader.cpp + ../clp/NetworkReader.hpp ../clp/networking/socket_utils.cpp ../clp/networking/socket_utils.hpp ../clp/ReaderInterface.cpp ../clp/ReaderInterface.hpp + ../clp/spdlog_with_specializations.hpp ../clp/streaming_archive/ArchiveMetadata.cpp ../clp/streaming_archive/ArchiveMetadata.hpp + ../clp/Thread.cpp + ../clp/Thread.hpp ../clp/TraceableException.hpp + ../clp/type_utils.hpp ../clp/WriterInterface.cpp ../clp/WriterInterface.hpp ) @@ -58,6 +79,8 @@ set( FileReader.hpp FileWriter.cpp FileWriter.hpp + InputConfig.cpp + InputConfig.hpp JsonConstructor.cpp JsonConstructor.hpp JsonFileIterator.cpp @@ -195,12 +218,14 @@ target_link_libraries( clp-s PRIVATE absl::flat_hash_map - Boost::filesystem Boost::iostreams Boost::program_options + Boost::iostreams Boost::program_options Boost::regex Boost::url + ${CURL_LIBRARIES} clp::string_utils kql MariaDBClient::MariaDBClient ${MONGOCXX_TARGET} msgpack-cxx + OpenSSL::Crypto simdjson spdlog::spdlog yaml-cpp::yaml-cpp From 528c0bd3d604a92602f3c8ca40ac785df5f6c045 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 20:49:53 +0000 Subject: [PATCH 06/26] Update ArchiveReader to accept Path and NetworkAuth --- components/core/src/clp_s/ArchiveReader.cpp | 20 +++++++++++++++----- components/core/src/clp_s/ArchiveReader.hpp | 9 ++++----- components/core/src/clp_s/ArchiveWriter.hpp | 1 - 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/components/core/src/clp_s/ArchiveReader.cpp b/components/core/src/clp_s/ArchiveReader.cpp index 7c68b301d..487beca34 100644 --- a/components/core/src/clp_s/ArchiveReader.cpp +++ b/components/core/src/clp_s/ArchiveReader.cpp @@ -4,20 +4,30 @@ #include #include "archive_constants.hpp" +#include "InputConfig.hpp" #include "ReaderUtils.hpp" using std::string_view; namespace clp_s { -void ArchiveReader::open(string_view archives_dir, string_view archive_id) { +void ArchiveReader::open(Path const& archive_path, NetworkAuthOption const& network_auth) { if (m_is_open) { throw OperationFailed(ErrorCodeNotReady, __FILENAME__, __LINE__); } m_is_open = true; - m_archive_id = archive_id; - std::filesystem::path archive_path{archives_dir}; - archive_path /= m_archive_id; - auto const archive_path_str = archive_path.string(); + + if (false == get_archive_id_from_path(archive_path, m_archive_id)) { + throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__); + } + + if (InputSource::Filesystem != archive_path.source) { + throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__); + } + + if (false == std::filesystem::is_directory(archive_path.path)) { + throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__); + } + auto const archive_path_str = archive_path.path; m_var_dict = ReaderUtils::get_variable_dictionary_reader(archive_path_str); m_log_dict = ReaderUtils::get_log_type_dictionary_reader(archive_path_str); diff --git a/components/core/src/clp_s/ArchiveReader.hpp b/components/core/src/clp_s/ArchiveReader.hpp index 6b437dfd2..9e492720b 100644 --- a/components/core/src/clp_s/ArchiveReader.hpp +++ b/components/core/src/clp_s/ArchiveReader.hpp @@ -7,9 +7,8 @@ #include #include -#include - #include "DictionaryReader.hpp" +#include "InputConfig.hpp" #include "PackedStreamReader.hpp" #include "ReaderUtils.hpp" #include "SchemaReader.hpp" @@ -32,10 +31,10 @@ class ArchiveReader { /** * Opens an archive for reading. - * @param archives_dir - * @param archive_id + * @param archive_path + * @param network_auth */ - void open(std::string_view archives_dir, std::string_view archive_id); + void open(Path const& archive_path, NetworkAuthOption const& network_auth); /** * Reads the dictionaries and metadata. diff --git a/components/core/src/clp_s/ArchiveWriter.hpp b/components/core/src/clp_s/ArchiveWriter.hpp index 82a0122bc..a76d15daf 100644 --- a/components/core/src/clp_s/ArchiveWriter.hpp +++ b/components/core/src/clp_s/ArchiveWriter.hpp @@ -4,7 +4,6 @@ #include #include -#include #include #include From 6cfdf46abfea08f8ad4fa3b66933e470b2b660a6 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 20:50:23 +0000 Subject: [PATCH 07/26] Accept clp::ReaderInterface in JsonFileIterator --- .../core/src/clp_s/JsonFileIterator.cpp | 19 +++++-------------- .../core/src/clp_s/JsonFileIterator.hpp | 11 +++-------- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp index ad6d16cd0..a0a003d9f 100644 --- a/components/core/src/clp_s/JsonFileIterator.cpp +++ b/components/core/src/clp_s/JsonFileIterator.cpp @@ -7,28 +7,19 @@ namespace clp_s { JsonFileIterator::JsonFileIterator( - std::string const& file_name, + clp::ReaderInterface& reader, size_t max_document_size, size_t buf_size ) : m_buf_size(buf_size), m_max_document_size(max_document_size), - m_buf(new char[buf_size + simdjson::SIMDJSON_PADDING]) { - try { - m_reader.open(file_name); - } catch (FileReader::OperationFailed& e) { - SPDLOG_ERROR("Failed to open {} for reading - {}", file_name, e.what()); - return; - } - + m_buf(new char[buf_size + simdjson::SIMDJSON_PADDING]), + m_reader(reader) { read_new_json(); } JsonFileIterator::~JsonFileIterator() { delete[] m_buf; - if (m_reader.is_open()) { - m_reader.close(); - } } bool JsonFileIterator::read_new_json() { @@ -59,9 +50,9 @@ bool JsonFileIterator::read_new_json() { m_buf_occupied += size_read; m_bytes_read += size_read; - if (ErrorCodeEndOfFile == file_error) { + if (clp::ErrorCode::ErrorCode_EndOfFile == file_error) { m_eof = true; - } else if (ErrorCodeSuccess != file_error) { + } else if (clp::ErrorCode::ErrorCode_Success != file_error) { m_error_code = simdjson::error_code::IO_ERROR; return false; } diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp index b8db3f4f2..fc054dd9d 100644 --- a/components/core/src/clp_s/JsonFileIterator.hpp +++ b/components/core/src/clp_s/JsonFileIterator.hpp @@ -3,6 +3,7 @@ #include +#include "../clp/ReaderInterface.hpp" #include "FileReader.hpp" namespace clp_s { @@ -22,7 +23,7 @@ class JsonFileIterator { * @param buf_size the initial buffer size */ explicit JsonFileIterator( - std::string const& file_name, + clp::ReaderInterface& reader, size_t max_document_size, size_t buf_size = 1024 * 1024 /*1MB default*/ ); @@ -35,12 +36,6 @@ class JsonFileIterator { */ [[nodiscard]] bool get_json(simdjson::ondemand::document_stream::iterator& it); - /** - * Checks if the file is open - * @return true if the file opened successfully - */ - [[nodiscard]] bool is_open() const { return m_reader.is_open(); } - /** * @return number of truncated bytes after json documents */ @@ -86,7 +81,7 @@ class JsonFileIterator { size_t m_buf_occupied{0}; size_t m_max_document_size{0}; char* m_buf{nullptr}; - FileReader m_reader; + clp::ReaderInterface& m_reader; simdjson::ondemand::parser m_parser; simdjson::ondemand::document_stream m_stream; bool m_eof{false}; From 63d8bdc4210bee1c59e2c327c3669317b3159172 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 20:50:42 +0000 Subject: [PATCH 08/26] Update JsonParser to accept Path and NetworkAuth --- components/core/src/clp_s/JsonParser.cpp | 32 +++++++++++------------- components/core/src/clp_s/JsonParser.hpp | 9 +++++-- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index d14a221b3..6fb86f7cf 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -1,13 +1,16 @@ #include "JsonParser.hpp" #include +#include #include #include #include +#include "../clp/ReaderInterface.hpp" #include "archive_constants.hpp" #include "JsonFileIterator.hpp" +#include "JsonParser.hpp" namespace clp_s { JsonParser::JsonParser(JsonParserOption const& option) @@ -16,11 +19,9 @@ JsonParser::JsonParser(JsonParserOption const& option) m_max_document_size(option.max_document_size), m_timestamp_key(option.timestamp_key), m_structurize_arrays(option.structurize_arrays), - m_record_log_order(option.record_log_order) { - if (false == FileUtils::validate_path(option.file_paths)) { - exit(1); - } - + m_record_log_order(option.record_log_order), + m_input_paths(option.input_paths), + m_network_auth(option.network_auth) { if (false == m_timestamp_key.empty()) { if (false == clp_s::StringUtils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column)) @@ -30,10 +31,6 @@ JsonParser::JsonParser(JsonParserOption const& option) } } - for (auto& file_path : option.file_paths) { - FileUtils::find_all_files(file_path, m_file_paths); - } - m_archive_options.archives_dir = option.archives_dir; m_archive_options.compression_level = option.compression_level; m_archive_options.print_archive_stats = option.print_archive_stats; @@ -427,18 +424,19 @@ void JsonParser::parse_line(ondemand::value line, int32_t parent_node_id, std::s } bool JsonParser::parse() { - for (auto& file_path : m_file_paths) { - JsonFileIterator json_file_iterator(file_path, m_max_document_size); - if (false == json_file_iterator.is_open()) { + for (auto const& path : m_input_paths) { + auto reader{ReaderUtils::try_create_reader(path, m_network_auth)}; + if (nullptr == reader) { m_archive_writer->close(); return false; } + JsonFileIterator json_file_iterator(*reader, m_max_document_size); if (simdjson::error_code::SUCCESS != json_file_iterator.get_error()) { SPDLOG_ERROR( "Encountered error - {} - while trying to parse {} after parsing 0 bytes", simdjson::error_message(json_file_iterator.get_error()), - file_path + path.path ); m_archive_writer->close(); return false; @@ -472,7 +470,7 @@ bool JsonParser::parse() { SPDLOG_ERROR( "Encountered non-json-object while trying to parse {} after parsing {} " "bytes", - file_path, + path.path, bytes_consumed_up_to_prev_record ); m_archive_writer->close(); @@ -497,7 +495,7 @@ bool JsonParser::parse() { SPDLOG_ERROR( "Encountered error - {} - while trying to parse {} after parsing {} bytes", error.what(), - file_path, + path.path, bytes_consumed_up_to_prev_record ); m_archive_writer->close(); @@ -531,7 +529,7 @@ bool JsonParser::parse() { SPDLOG_ERROR( "Encountered error - {} - while trying to parse {} after parsing {} bytes", simdjson::error_message(json_file_iterator.get_error()), - file_path, + path.path, bytes_consumed_up_to_prev_record ); m_archive_writer->close(); @@ -541,7 +539,7 @@ bool JsonParser::parse() { SPDLOG_WARN( "Truncated JSON ({} bytes) at end of file {}", json_file_iterator.truncated_bytes(), - file_path.c_str() + path.path ); } } diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index c05ab9d60..2f4cacbbd 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -2,6 +2,7 @@ #define CLP_S_JSONPARSER_HPP #include +#include #include #include #include @@ -16,7 +17,9 @@ #include "DictionaryWriter.hpp" #include "FileReader.hpp" #include "FileWriter.hpp" +#include "InputConfig.hpp" #include "ParsedMessage.hpp" +#include "ReaderUtils.hpp" #include "Schema.hpp" #include "SchemaMap.hpp" #include "SchemaTree.hpp" @@ -29,7 +32,7 @@ using namespace simdjson; namespace clp_s { struct JsonParserOption { - std::vector file_paths; + std::vector input_paths; CommandLineArguments::FileType input_file_type{CommandLineArguments::FileType::Json}; std::string timestamp_key; std::string archives_dir; @@ -42,6 +45,7 @@ struct JsonParserOption { bool record_log_order{true}; bool single_file_archive{false}; std::shared_ptr metadata_db; + NetworkAuthOption network_auth{}; }; class JsonParser { @@ -108,7 +112,8 @@ class JsonParser { int32_t add_metadata_field(std::string_view const field_name, NodeType type); int m_num_messages; - std::vector m_file_paths; + std::vector m_input_paths; + NetworkAuthOption m_network_auth{}; Schema m_current_schema; ParsedMessage m_current_parsed_message; From e8f5b370502c523cb44b8837823f3f9669c2dcd6 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 20:51:02 +0000 Subject: [PATCH 09/26] Update JsonConstructor to accept Path and NetworkAuth --- components/core/src/clp_s/JsonConstructor.cpp | 17 +++-------------- components/core/src/clp_s/JsonConstructor.hpp | 7 ++++--- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/components/core/src/clp_s/JsonConstructor.cpp b/components/core/src/clp_s/JsonConstructor.cpp index 8886f2074..f1363549d 100644 --- a/components/core/src/clp_s/JsonConstructor.cpp +++ b/components/core/src/clp_s/JsonConstructor.cpp @@ -31,22 +31,11 @@ JsonConstructor::JsonConstructor(JsonConstructorOption const& option) : m_option ) ); } - - std::filesystem::path archive_path{m_option.archives_dir}; - archive_path /= m_option.archive_id; - if (false == std::filesystem::is_directory(archive_path)) { - throw OperationFailed( - ErrorCodeFailure, - __FILENAME__, - __LINE__, - fmt::format("'{}' is not a directory", archive_path.c_str()) - ); - } } void JsonConstructor::store() { m_archive_reader = std::make_unique(); - m_archive_reader->open(m_option.archives_dir, m_option.archive_id); + m_archive_reader->open(m_option.archive_path, m_option.network_auth); m_archive_reader->read_dictionaries_and_metadata(); if (m_option.ordered && false == m_archive_reader->has_log_order()) { @@ -84,7 +73,7 @@ void JsonConstructor::construct_in_order() { int64_t first_idx{}; int64_t last_idx{}; size_t chunk_size{}; - auto src_path = std::filesystem::path(m_option.output_dir) / m_option.archive_id; + auto src_path = std::filesystem::path(m_option.output_dir) / m_archive_reader->get_archive_id(); FileWriter writer; writer.open(src_path, FileWriter::OpenMode::CreateForWriting); @@ -123,7 +112,7 @@ void JsonConstructor::construct_in_order() { ), bsoncxx::builder::basic::kvp( constants::results_cache::decompression::cStreamId, - m_option.archive_id + std::string{m_archive_reader->get_archive_id()} ), bsoncxx::builder::basic::kvp( constants::results_cache::decompression::cBeginMsgIx, diff --git a/components/core/src/clp_s/JsonConstructor.hpp b/components/core/src/clp_s/JsonConstructor.hpp index 3d9228a02..533d335b4 100644 --- a/components/core/src/clp_s/JsonConstructor.hpp +++ b/components/core/src/clp_s/JsonConstructor.hpp @@ -11,6 +11,7 @@ #include "DictionaryReader.hpp" #include "ErrorCode.hpp" #include "FileWriter.hpp" +#include "InputConfig.hpp" #include "SchemaReader.hpp" #include "SchemaTree.hpp" #include "TraceableException.hpp" @@ -26,12 +27,12 @@ struct MetadataDbOption { }; struct JsonConstructorOption { - std::string archives_dir; - std::string archive_id; + Path archive_path{}; + NetworkAuthOption network_auth{}; std::string output_dir; bool ordered{false}; size_t target_ordered_chunk_size{}; - std::optional metadata_db; + std::optional metadata_db{std::nullopt}; }; class JsonConstructor { From 29049b3a5dc698fc90a5e340f93b7425ab3852ed Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 20:51:31 +0000 Subject: [PATCH 10/26] Remove unnecessary dependency from kql build target --- components/core/src/clp_s/search/kql/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp_s/search/kql/CMakeLists.txt b/components/core/src/clp_s/search/kql/CMakeLists.txt index ee36ee124..c2bf0fb5c 100644 --- a/components/core/src/clp_s/search/kql/CMakeLists.txt +++ b/components/core/src/clp_s/search/kql/CMakeLists.txt @@ -25,4 +25,4 @@ add_library( ) target_compile_features(kql PRIVATE cxx_std_20) target_include_directories(kql PRIVATE ${ANTLR_KqlParser_OUTPUT_DIR}) -target_link_libraries(kql PRIVATE antlr4_static Boost::filesystem) +target_link_libraries(kql PRIVATE antlr4_static) From df0144a8d99ca368aad387c7758b7783f81bdd95 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 20:51:51 +0000 Subject: [PATCH 11/26] Update clp_s end to end tests to account for new interface --- components/core/tests/test-clp_s-end_to_end.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/components/core/tests/test-clp_s-end_to_end.cpp b/components/core/tests/test-clp_s-end_to_end.cpp index 3f138b472..a9e66aa1b 100644 --- a/components/core/tests/test-clp_s-end_to_end.cpp +++ b/components/core/tests/test-clp_s-end_to_end.cpp @@ -9,6 +9,7 @@ #include #include +#include "../src/clp_s/InputConfig.hpp" #include "../src/clp_s/JsonConstructor.hpp" #include "../src/clp_s/JsonParser.hpp" @@ -70,7 +71,9 @@ void compress(bool structurize_arrays) { REQUIRE((std::filesystem::is_directory(cTestEndToEndArchiveDirectory))); clp_s::JsonParserOption parser_option{}; - parser_option.file_paths.push_back(get_test_input_local_path()); + parser_option.input_paths.emplace_back( + clp_s::Path{.source{clp_s::InputSource::Filesystem}, .path{get_test_input_local_path()}} + ); parser_option.archives_dir = cTestEndToEndArchiveDirectory; parser_option.target_encoded_size = cDefaultTargetEncodedSize; parser_option.max_document_size = cDefaultMaxDocumentSize; @@ -94,17 +97,19 @@ auto extract() -> std::filesystem::path { REQUIRE(std::filesystem::is_directory(cTestEndToEndOutputDirectory)); clp_s::JsonConstructorOption constructor_option{}; - constructor_option.archives_dir = cTestEndToEndArchiveDirectory; constructor_option.output_dir = cTestEndToEndOutputDirectory; constructor_option.ordered = cDefaultOrdered; constructor_option.target_ordered_chunk_size = cDefaultTargetOrderedChunkSize; - for (auto const& entry : std::filesystem::directory_iterator(constructor_option.archives_dir)) { + for (auto const& entry : std::filesystem::directory_iterator(cTestEndToEndArchiveDirectory)) { if (false == entry.is_directory()) { // Skip non-directories continue; } - constructor_option.archive_id = entry.path().filename(); + constructor_option.archive_path = clp_s::Path{ + .source{clp_s::InputSource::Filesystem}, + .path{entry.path().string()} + }; clp_s::JsonConstructor constructor{constructor_option}; constructor.store(); } From cc75ffea03db1b7b621b66509c7e1c6705164d5d Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 20:52:20 +0000 Subject: [PATCH 12/26] Update command line arguments and simplify clp-s.cpp --- .../core/src/clp_s/CommandLineArguments.cpp | 145 ++++++++++++++---- .../core/src/clp_s/CommandLineArguments.hpp | 13 +- components/core/src/clp_s/clp-s.cpp | 72 +++------ 3 files changed, 143 insertions(+), 87 deletions(-) diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index c7fb9487e..6515ebdd0 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -1,8 +1,10 @@ #include "CommandLineArguments.hpp" +#include #include #include +#include #include #include "../clp/cli_utils.hpp" @@ -131,8 +133,11 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { throw std::invalid_argument(std::string("Unknown action '") + command_input + "'"); } + constexpr std::string_view cNoAuth{"none"}; + constexpr std::string_view cS3Auth{"s3"}; if (Command::Compress == m_command) { po::options_description compression_positional_options; + std::vector input_paths; // clang-format off compression_positional_options.add_options()( "archives-dir", @@ -140,7 +145,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { "output directory" )( "input-paths", - po::value>(&m_file_paths)->value_name("PATHS"), + po::value>(&input_paths)->value_name("PATHS"), "input paths" ); // clang-format on @@ -151,6 +156,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { constexpr std::string_view cJsonFileType{"json"}; constexpr std::string_view cKeyValueIrFileType{"kv-ir"}; std::string file_type{cJsonFileType}; + std::string auth{cNoAuth}; // clang-format off compression_options.add_options()( "compression-level", @@ -209,6 +215,14 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { "file-type", po::value(&file_type)->value_name("FILE_TYPE")->default_value(file_type), "The type of file being compressed (json or kv-ir)" + )( + "auth", + po::value(&auth) + ->value_name("AUTH_TYPE") + ->default_value(auth), + "Type of authentication required for network requests (s3 | none). Authentication" + " with s3 requires the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment" + " variables." ); // clang-format on @@ -252,13 +266,19 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { } if (false == input_path_list_file_path.empty()) { - if (false == read_paths_from_file(input_path_list_file_path, m_file_paths)) { + if (false == read_paths_from_file(input_path_list_file_path, input_paths)) { SPDLOG_ERROR("Failed to read paths from {}", input_path_list_file_path); return ParsingResult::Failure; } } - if (m_file_paths.empty()) { + for (auto const& path : input_paths) { + if (false == get_input_files_for_raw_path(path, m_input_paths)) { + throw std::invalid_argument(fmt::format("Invalid input path \"{}\".", path)); + } + } + + if (m_input_paths.empty()) { throw std::invalid_argument("No input paths specified."); } @@ -278,6 +298,13 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { throw std::invalid_argument("Unknown FILE_TYPE: " + file_type); } + if (cS3Auth == auth) { + m_network_auth.method = AuthMethod::S3PresignedUrlV4; + } else if (cNoAuth != auth) { + throw std::invalid_argument(fmt::format("Invalid authentication type \"{}\"", auth) + ); + } + // Parse and validate global metadata DB config if (false == metadata_db_config_file_path.empty()) { clp::GlobalMetadataDBConfig metadata_db_config; @@ -302,11 +329,12 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { } } else if ((char)Command::Extract == command_input) { po::options_description extraction_options; + std::string archive_path; // clang-format off extraction_options.add_options()( - "archives-dir", - po::value(&m_archives_dir), - "The directory containing the archives" + "archive-path", + po::value(&archive_path), + "Path to a directory containing archives, or the path to a single archive" )( "output-dir", po::value(&m_output_dir), @@ -314,15 +342,9 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { ); // clang-format on - po::options_description input_options("Input Options"); - input_options.add_options()( - "archive-id", - po::value(&m_archive_id)->value_name("ID"), - "ID of the archive to decompress" - ); - extraction_options.add(input_options); - po::options_description decompression_options("Decompression Options"); + std::string auth{cNoAuth}; + std::string archive_id; // clang-format off decompression_options.add_options()( "ordered", @@ -335,6 +357,19 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { ->value_name("SIZE"), "Chunk size (B) for each output file when decompressing records in log order." " When set to 0, no chunking is performed." + )( + "archive-id", + po::value(&archive_id)->value_name("ID"), + "Limit decompression to the archive with the given ID in a subdirectory of" + " archive-path" + )( + "auth", + po::value(&auth) + ->value_name("AUTH_TYPE") + ->default_value(auth), + "Type of authentication required for network requests (s3 | none). Authentication" + " with s3 requires the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment" + " variables." ); // clang-format on extraction_options.add(decompression_options); @@ -354,7 +389,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { extraction_options.add(output_metadata_options); po::positional_options_description positional_options; - positional_options.add("archives-dir", 1); + positional_options.add("archive-path", 1); positional_options.add("output-dir", 1); std::vector unrecognized_options @@ -382,15 +417,38 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { po::options_description visible_options; visible_options.add(general_options); - visible_options.add(input_options); visible_options.add(decompression_options); visible_options.add(output_metadata_options); std::cerr << visible_options << std::endl; return ParsingResult::InfoCommand; } - if (m_archives_dir.empty()) { - throw std::invalid_argument("No archives directory specified"); + if (archive_path.empty()) { + throw std::invalid_argument("No archive path specified"); + } + + if (false == archive_id.empty()) { + auto archive_fs_path = std::filesystem::path(archive_path) / archive_id; + if (false == std::filesystem::exists(archive_fs_path)) { + throw std::invalid_argument("Requested archive does not exist"); + } + m_input_paths.emplace_back(clp_s::Path{ + .source{clp_s::InputSource::Filesystem}, + .path{archive_fs_path.string()} + }); + } else if (false == get_input_archives_for_raw_path(archive_path, m_input_paths)) { + throw std::invalid_argument("Invalid archive path"); + } + + if (m_input_paths.empty()) { + throw std::invalid_argument("No archive paths specified"); + } + + if (cS3Auth == auth) { + m_network_auth.method = AuthMethod::S3PresignedUrlV4; + } else if (cNoAuth != auth) { + throw std::invalid_argument(fmt::format("Invalid authentication type \"{}\"", auth) + ); } if (m_output_dir.empty()) { @@ -422,11 +480,12 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { po::options_description search_options; std::string output_handler_name; + std::string archive_path; // clang-format off search_options.add_options()( - "archives-dir", - po::value(&m_archives_dir), - "The directory containing the archives" + "archive-path", + po::value(&archive_path), + "Path to a directory containing archives, or the path to a single archive" )( "query,q", po::value(&m_query), @@ -440,12 +499,14 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { ); // clang-format on po::positional_options_description positional_options; - positional_options.add("archives-dir", 1); + positional_options.add("archive-path", 1); positional_options.add("query", 1); positional_options.add("output-handler", 1); positional_options.add("output-handler-args", -1); po::options_description match_options("Match Controls"); + std::string auth{cNoAuth}; + std::string archive_id; // clang-format off match_options.add_options()( "tge", @@ -461,8 +522,8 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { "Ignore case distinctions between values in the query and the compressed data" )( "archive-id", - po::value(&m_archive_id)->value_name("ID"), - "Limit search to the archive with the given ID" + po::value(&archive_id)->value_name("ID"), + "Limit search to the archive with the given ID in a subdirectory of archive-path" )( "projection", po::value>(&m_projection_columns) @@ -471,6 +532,14 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { "Project only the given set of columns for matching results. This option must be" " specified after all positional options. Values that are objects or structured" " arrays are currently unsupported." + )( + "auth", + po::value(&auth) + ->value_name("AUTH_TYPE") + ->default_value(auth), + "Type of authentication required for network requests (s3 | none). Authentication" + " with s3 requires the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment" + " variables." ); // clang-format on search_options.add(match_options); @@ -622,8 +691,32 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { return ParsingResult::InfoCommand; } - if (m_archives_dir.empty()) { - throw std::invalid_argument("No archives directory specified"); + if (archive_path.empty()) { + throw std::invalid_argument("No archive path specified"); + } + + if (false == archive_id.empty()) { + auto archive_fs_path = std::filesystem::path(archive_path) / archive_id; + if (false == std::filesystem::exists(archive_fs_path)) { + throw std::invalid_argument("Requested archive does not exist"); + } + m_input_paths.emplace_back(clp_s::Path{ + .source{clp_s::InputSource::Filesystem}, + .path{archive_fs_path.string()} + }); + } else if (false == get_input_archives_for_raw_path(archive_path, m_input_paths)) { + throw std::invalid_argument("Invalid archive path"); + } + + if (m_input_paths.empty()) { + throw std::invalid_argument("No archive paths specified"); + } + + if (cS3Auth == auth) { + m_network_auth.method = AuthMethod::S3PresignedUrlV4; + } else if (cNoAuth != auth) { + throw std::invalid_argument(fmt::format("Invalid authentication type \"{}\"", auth) + ); } if (m_query.empty()) { diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp index 47c244646..17ee77369 100644 --- a/components/core/src/clp_s/CommandLineArguments.hpp +++ b/components/core/src/clp_s/CommandLineArguments.hpp @@ -12,6 +12,7 @@ #include "../clp/GlobalMetadataDBConfig.hpp" #include "../reducer/types.hpp" #include "Defs.hpp" +#include "InputConfig.hpp" namespace clp_s { class CommandLineArguments { @@ -51,7 +52,9 @@ class CommandLineArguments { Command get_command() const { return m_command; } - std::vector const& get_file_paths() const { return m_file_paths; } + std::vector const& get_input_paths() const { return m_input_paths; } + + NetworkAuthOption const& get_network_auth() const { return m_network_auth; } std::string const& get_archives_dir() const { return m_archives_dir; } @@ -87,8 +90,6 @@ class CommandLineArguments { bool get_ignore_case() const { return m_ignore_case; } - std::string const& get_archive_id() const { return m_archive_id; } - std::optional const& get_metadata_db_config() const { return m_metadata_db_config; } @@ -177,7 +178,8 @@ class CommandLineArguments { Command m_command; // Compression and decompression variables - std::vector m_file_paths; + std::vector m_input_paths; + NetworkAuthOption m_network_auth{}; std::string m_archives_dir; std::string m_output_dir; std::string m_timestamp_key; @@ -213,9 +215,6 @@ class CommandLineArguments { bool m_ignore_case{false}; std::vector m_projection_columns; - // Decompression and search variables - std::string m_archive_id; - // Search aggregation variables std::string m_reducer_host; int m_reducer_port{-1}; diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index 2c6639290..e46b7daf3 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -11,6 +12,7 @@ #include #include +#include "../clp/CurlGlobalInstance.hpp" #include "../clp/GlobalMySQLMetadataDB.hpp" #include "../clp/streaming_archive/ArchiveMetadata.hpp" #include "../reducer/network_utils.hpp" @@ -18,7 +20,6 @@ #include "Defs.hpp" #include "JsonConstructor.hpp" #include "JsonParser.hpp" -#include "ReaderUtils.hpp" #include "search/AddTimestampConditions.hpp" #include "search/ConvertToExists.hpp" #include "search/EmptyExpr.hpp" @@ -87,7 +88,8 @@ bool compress(CommandLineArguments const& command_line_arguments) { } clp_s::JsonParserOption option{}; - option.file_paths = command_line_arguments.get_file_paths(); + option.input_paths = command_line_arguments.get_input_paths(); + option.network_auth = command_line_arguments.get_network_auth(); option.input_file_type = command_line_arguments.get_file_type(); option.archives_dir = archives_dir.string(); option.target_encoded_size = command_line_arguments.get_target_encoded_size(); @@ -143,7 +145,7 @@ bool search_archive( ) { auto const& query = command_line_arguments.get_query(); - auto timestamp_dict = archive_reader->read_timestamp_dictionary(); + auto timestamp_dict = archive_reader->get_timestamp_dictionary(); AddTimestampConditions add_timestamp_conditions( timestamp_dict->get_authoritative_timestamp_tokenized_column(), command_line_arguments.get_search_begin_ts(), @@ -282,6 +284,7 @@ int main(int argc, char const* argv[]) { clp_s::TimestampPattern::init(); mongocxx::instance const mongocxx_instance{}; + clp::CurlGlobalInstance const curl_instance{}; CommandLineArguments command_line_arguments("clp-s"); auto parsing_result = command_line_arguments.parse_arguments(argc, argv); @@ -300,37 +303,21 @@ int main(int argc, char const* argv[]) { return 1; } } else if (CommandLineArguments::Command::Extract == command_line_arguments.get_command()) { - auto const& archives_dir = command_line_arguments.get_archives_dir(); - if (false == std::filesystem::is_directory(archives_dir)) { - SPDLOG_ERROR("'{}' is not a directory.", archives_dir); - return 1; - } - clp_s::JsonConstructorOption option{}; option.output_dir = command_line_arguments.get_output_dir(); option.ordered = command_line_arguments.get_ordered_decompression(); - option.archives_dir = archives_dir; option.target_ordered_chunk_size = command_line_arguments.get_target_ordered_chunk_size(); + option.network_auth = command_line_arguments.get_network_auth(); if (false == command_line_arguments.get_mongodb_uri().empty()) { option.metadata_db = {command_line_arguments.get_mongodb_uri(), command_line_arguments.get_mongodb_collection()}; } + try { - auto const& archive_id = command_line_arguments.get_archive_id(); - if (false == archive_id.empty()) { - option.archive_id = archive_id; + for (auto const& archive_path : command_line_arguments.get_input_paths()) { + option.archive_path = archive_path; decompress_archive(option); - } else { - for (auto const& entry : std::filesystem::directory_iterator(archives_dir)) { - if (false == entry.is_directory()) { - // Skip non-directories - continue; - } - - option.archive_id = entry.path().filename(); - decompress_archive(option); - } } } catch (clp_s::TraceableException& e) { SPDLOG_ERROR("{}", e.what()); @@ -349,12 +336,6 @@ int main(int argc, char const* argv[]) { return 1; } - auto const& archives_dir = command_line_arguments.get_archives_dir(); - if (false == std::filesystem::is_directory(archives_dir)) { - SPDLOG_ERROR("'{}' is not a directory.", archives_dir); - return 1; - } - int reducer_socket_fd{-1}; if (command_line_arguments.get_output_handler_type() == CommandLineArguments::OutputHandlerType::Reducer) @@ -370,37 +351,20 @@ int main(int argc, char const* argv[]) { } } - auto const& archive_id = command_line_arguments.get_archive_id(); auto archive_reader = std::make_shared(); - if (false == archive_id.empty()) { - archive_reader->open(archives_dir, archive_id); + for (auto const& archive_path : command_line_arguments.get_input_paths()) { + archive_reader->open(archive_path, command_line_arguments.get_network_auth()); if (false - == search_archive(command_line_arguments, archive_reader, expr, reducer_socket_fd)) + == search_archive( + command_line_arguments, + archive_reader, + expr->copy(), + reducer_socket_fd + )) { return 1; } archive_reader->close(); - } else { - for (auto const& entry : std::filesystem::directory_iterator(archives_dir)) { - if (false == entry.is_directory()) { - // Skip non-directories - continue; - } - - auto const archive_id = entry.path().filename().string(); - archive_reader->open(archives_dir, archive_id); - if (false - == search_archive( - command_line_arguments, - archive_reader, - expr->copy(), - reducer_socket_fd - )) - { - return 1; - } - archive_reader->close(); - } } } From a64bd6b0c8c243f69ffd961e2b83527e963cd4b6 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Mon, 16 Dec 2024 22:15:15 +0000 Subject: [PATCH 13/26] Catch exception when failing to open archive during search --- components/core/src/clp_s/clp-s.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index e46b7daf3..0ec09b5b8 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -353,7 +353,12 @@ int main(int argc, char const* argv[]) { auto archive_reader = std::make_shared(); for (auto const& archive_path : command_line_arguments.get_input_paths()) { - archive_reader->open(archive_path, command_line_arguments.get_network_auth()); + try { + archive_reader->open(archive_path, command_line_arguments.get_network_auth()); + } catch (std::exception const& e) { + SPDLOG_ERROR("Failed to open archive - {}", e.what()); + return 1; + } if (false == search_archive( command_line_arguments, From 8fa5aa20060cc4c23e97a7f51b6b6687cfb853a5 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Tue, 17 Dec 2024 01:25:11 +0000 Subject: [PATCH 14/26] Attempt to fix build issue on macos --- components/core/src/clp_s/search/kql/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp_s/search/kql/CMakeLists.txt b/components/core/src/clp_s/search/kql/CMakeLists.txt index c2bf0fb5c..2e605115d 100644 --- a/components/core/src/clp_s/search/kql/CMakeLists.txt +++ b/components/core/src/clp_s/search/kql/CMakeLists.txt @@ -25,4 +25,4 @@ add_library( ) target_compile_features(kql PRIVATE cxx_std_20) target_include_directories(kql PRIVATE ${ANTLR_KqlParser_OUTPUT_DIR}) -target_link_libraries(kql PRIVATE antlr4_static) +target_link_libraries(kql PRIVATE antlr4_static spdlog::spdlog) From fe84cd4d1f61bf96918ff8f03c40336bed11be4f Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Tue, 17 Dec 2024 21:14:10 +0000 Subject: [PATCH 15/26] Attempt to fix macos build again --- components/core/src/clp_s/search/kql/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/core/src/clp_s/search/kql/CMakeLists.txt b/components/core/src/clp_s/search/kql/CMakeLists.txt index 2e605115d..7fba0946d 100644 --- a/components/core/src/clp_s/search/kql/CMakeLists.txt +++ b/components/core/src/clp_s/search/kql/CMakeLists.txt @@ -25,4 +25,5 @@ add_library( ) target_compile_features(kql PRIVATE cxx_std_20) target_include_directories(kql PRIVATE ${ANTLR_KqlParser_OUTPUT_DIR}) -target_link_libraries(kql PRIVATE antlr4_static spdlog::spdlog) +target_link_libraries(kql PRIVATE antlr4_static Boost::filesystem spdlog::spdlog) + From 561634cfade387906716414b42e4171646cdbd2b Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Wed, 18 Dec 2024 15:24:46 +0000 Subject: [PATCH 16/26] Revert all changes to kql CMakeLists to simplify review --- components/core/src/clp_s/search/kql/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp_s/search/kql/CMakeLists.txt b/components/core/src/clp_s/search/kql/CMakeLists.txt index 7fba0946d..9dba44a4b 100644 --- a/components/core/src/clp_s/search/kql/CMakeLists.txt +++ b/components/core/src/clp_s/search/kql/CMakeLists.txt @@ -25,5 +25,5 @@ add_library( ) target_compile_features(kql PRIVATE cxx_std_20) target_include_directories(kql PRIVATE ${ANTLR_KqlParser_OUTPUT_DIR}) -target_link_libraries(kql PRIVATE antlr4_static Boost::filesystem spdlog::spdlog) +target_link_libraries(kql PRIVATE antlr4_static Boost::filesystem) From 307be9ea0bbbc29a171aef282bc91c60c2782af5 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Thu, 19 Dec 2024 22:17:31 +0000 Subject: [PATCH 17/26] Properly detect http errors when ingesting over the network --- components/core/src/clp_s/JsonParser.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 6fb86f7cf..649720d09 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -4,9 +4,11 @@ #include #include +#include #include #include +#include "../clp/NetworkReader.hpp" #include "../clp/ReaderInterface.hpp" #include "archive_constants.hpp" #include "JsonFileIterator.hpp" @@ -542,6 +544,23 @@ bool JsonParser::parse() { path.path ); } + + if (auto network_reader = std::dynamic_pointer_cast(reader); + nullptr != network_reader) + { + if (auto const rc = network_reader->get_curl_ret_code(); + rc.has_value() && CURLcode::CURLE_OK != rc.value()) + { + auto curl_error_message = network_reader->get_curl_error_msg(); + std::string error_msg_str; + if (curl_error_message.has_value()) { + error_msg_str = curl_error_message.value(); + } + SPDLOG_ERROR("Encountered curl error during ingestion - {}", error_msg_str); + } + m_archive_writer->close(); + return false; + } } return true; } From 7557c849cd11dd67eafdfc48cce36bb80511e3d1 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Thu, 19 Dec 2024 22:52:29 +0000 Subject: [PATCH 18/26] Fix bug introduced while splitting up changes --- components/core/src/clp_s/clp-s.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index 0ec09b5b8..fd6ccd587 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -145,7 +145,7 @@ bool search_archive( ) { auto const& query = command_line_arguments.get_query(); - auto timestamp_dict = archive_reader->get_timestamp_dictionary(); + auto timestamp_dict = archive_reader->read_timestamp_dictionary(); AddTimestampConditions add_timestamp_conditions( timestamp_dict->get_authoritative_timestamp_tokenized_column(), command_line_arguments.get_search_begin_ts(), From fc8fad640da61937bfafb15d9323f03e4040ca40 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Fri, 20 Dec 2024 00:17:10 +0000 Subject: [PATCH 19/26] Fix obvious bug introduced in recent commit --- components/core/src/clp_s/JsonParser.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 649720d09..665e5f874 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -557,9 +557,9 @@ bool JsonParser::parse() { error_msg_str = curl_error_message.value(); } SPDLOG_ERROR("Encountered curl error during ingestion - {}", error_msg_str); + m_archive_writer->close(); + return false; } - m_archive_writer->close(); - return false; } } return true; From 84bc5c38170e0e75b763154f2b44bbdeaf3f8f75 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Fri, 20 Dec 2024 18:55:05 +0000 Subject: [PATCH 20/26] Improve error message for curl error during ingestion --- components/core/src/clp_s/JsonParser.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 665e5f874..18315e9fd 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -551,12 +551,13 @@ bool JsonParser::parse() { if (auto const rc = network_reader->get_curl_ret_code(); rc.has_value() && CURLcode::CURLE_OK != rc.value()) { - auto curl_error_message = network_reader->get_curl_error_msg(); - std::string error_msg_str; - if (curl_error_message.has_value()) { - error_msg_str = curl_error_message.value(); - } - SPDLOG_ERROR("Encountered curl error during ingestion - {}", error_msg_str); + auto const curl_error_message = network_reader->get_curl_error_msg(); + SPDLOG_ERROR( + "Encountered curl error while ingesting {} - Code: {} - Message: {}", + path.path, + rc.value(), + curl_error_message.value_or("Unkown error") + ); m_archive_writer->close(); return false; } From 2a87da22da0204a090055b2913497559a6fb3f67 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Fri, 20 Dec 2024 19:01:52 +0000 Subject: [PATCH 21/26] Log an error when environment variables are unavailable for presigned url authentication --- components/core/src/clp_s/ReaderUtils.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp_s/ReaderUtils.cpp b/components/core/src/clp_s/ReaderUtils.cpp index 01359b806..88bb31286 100644 --- a/components/core/src/clp_s/ReaderUtils.cpp +++ b/components/core/src/clp_s/ReaderUtils.cpp @@ -161,10 +161,18 @@ std::shared_ptr try_create_file_reader(std::string_view co } bool try_sign_url(std::string& url) { - clp::aws::AwsAuthenticationSigner signer{ - std::getenv(cAwsAccessKeyIdEnvVar), - std::getenv(cAwsSecretAccessKeyEnvVar) - }; + auto const aws_access_key = std::getenv(cAwsAccessKeyIdEnvVar); + auto const aws_secret_access_key = std::getenv(cAwsSecretAccessKeyEnvVar); + if (nullptr == aws_access_key || nullptr == aws_secret_access_key) { + SPDLOG_ERROR( + "{} and {} environment variables not available for presigned url authentication.", + cAwsAccessKeyIdEnvVar, + cAwsSecretAccessKeyEnvVar + ); + return false; + } + + clp::aws::AwsAuthenticationSigner signer{aws_access_key, aws_secret_access_key}; try { clp::aws::S3Url s3_url{url}; From 1f7660b7787b8d5bb36f9e39f3deadf669692148 Mon Sep 17 00:00:00 2001 From: Devin Gibson Date: Mon, 30 Dec 2024 18:54:20 -0500 Subject: [PATCH 22/26] Apply suggestions from code review Co-authored-by: wraymo <37269683+wraymo@users.noreply.github.com> --- components/core/src/clp_s/InputConfig.hpp | 18 +++++++++--------- components/core/src/clp_s/Utils.cpp | 2 +- components/core/src/clp_s/Utils.hpp | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/components/core/src/clp_s/InputConfig.hpp b/components/core/src/clp_s/InputConfig.hpp index c63acb3ba..3672bb0ae 100644 --- a/components/core/src/clp_s/InputConfig.hpp +++ b/components/core/src/clp_s/InputConfig.hpp @@ -13,7 +13,7 @@ constexpr char cAwsAccessKeyIdEnvVar[] = "AWS_ACCESS_KEY_ID"; constexpr char cAwsSecretAccessKeyEnvVar[] = "AWS_SECRET_ACCESS_KEY"; /** - * Enum class definining the source of some resource. + * Enum class defining the source of a resource. */ enum class InputSource : uint8_t { Filesystem, @@ -21,7 +21,7 @@ enum class InputSource : uint8_t { }; /** - * Enum class defining the auth that needs to be performed to access some resource. + * Enum class defining the authentication method required for accessing a resource. */ enum class AuthMethod : uint8_t { None, @@ -36,7 +36,7 @@ struct NetworkAuthOption { }; /** - * Struct describing a path to some resource as well as its source. + * Struct representing a resource path with its source type. */ struct Path { InputSource source{InputSource::Filesystem}; @@ -58,7 +58,7 @@ struct Path { [[nodiscard]] auto get_path_object_for_raw_path(std::string_view const path) -> Path; /** - * Recursively records all file paths from the given raw path, including the path itself. + * Recursively collects all file paths from the given raw path, including the path itself. * @param path * @param files Returned paths * @return true on success, false otherwise @@ -66,16 +66,16 @@ struct Path { auto get_input_files_for_raw_path(std::string_view const path, std::vector& files) -> bool; /** - * Recursively records all file paths that are children of the the given Path, including the Path + * Recursively collects all file paths that are children of the the given Path, including the Path * itself. - * @param option + * @param path * @param files Returned paths * @return true on success, false otherwise */ [[nodiscard]] auto get_input_files_for_path(Path const& path, std::vector& files) -> bool; /** - * Records all archives that are children of the given raw path, including the path itself. + * Collects all archives that are children of the given raw path, including the path itself. * @param path * @param archives Returned archives * @return true on success, false otherwise @@ -84,7 +84,7 @@ auto get_input_files_for_raw_path(std::string_view const path, std::vector get_input_archives_for_raw_path(std::string_view const path, std::vector& archives) -> bool; /** - * Records all archives from the given Path, including the Path itself. + * Collects all archives from the given Path, including the Path itself. * @param path * @param archives Returned archives * @return true on success, false otherwise @@ -93,7 +93,7 @@ get_input_archives_for_raw_path(std::string_view const path, std::vector& get_input_archives_for_path(Path const& path, std::vector& archives) -> bool; /** - * Determines the archive id of a given archive based on a path to that archive. + * Determines the archive id of an archive based on the archive path. * @param path * @param archive_id Returned archive id * @return true on success, false otherwise diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp index 41b8d24e1..4e2e1ad7e 100644 --- a/components/core/src/clp_s/Utils.cpp +++ b/components/core/src/clp_s/Utils.cpp @@ -65,7 +65,7 @@ namespace { * @param path * @return true if this directory is a multi-file archive, false otherwise */ -bool directory_is_multi_file_archive(std::string_view const path) { +bool is_multi_file_archive(std::string_view const path) { for (auto const& entry : std::filesystem::directory_iterator{path}) { if (entry.is_directory()) { return false; diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index f57107ecf..0a5886819 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -21,7 +21,7 @@ class FileUtils { find_all_files_in_directory(std::string const& path, std::vector& file_paths); /** - * Find all archives in a directory, including the directory itself + * Finds all archives in a directory, including the directory itself * @param path * @param archive_paths * @return true if successful, false otherwise From dbc16fb35b8fefe8451ed0ce6791c40f399515b3 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Tue, 31 Dec 2024 00:01:42 +0000 Subject: [PATCH 23/26] Complete rename --- components/core/src/clp_s/Utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp index 4e2e1ad7e..d01b1a2b9 100644 --- a/components/core/src/clp_s/Utils.cpp +++ b/components/core/src/clp_s/Utils.cpp @@ -112,7 +112,7 @@ bool FileUtils::find_all_archives_in_directory( return false; } - if (directory_is_multi_file_archive(path)) { + if (is_multi_file_archive(path)) { archive_paths.emplace_back(path); return true; } From 86fa165e14c241a01cf8af8df5a76b7715c38828 Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Tue, 31 Dec 2024 00:22:42 +0000 Subject: [PATCH 24/26] Address code review comments --- components/core/src/clp_s/CommandLineArguments.cpp | 6 +++--- components/core/src/clp_s/JsonFileIterator.hpp | 4 ++-- components/core/src/clp_s/ReaderUtils.hpp | 6 ++++++ components/core/src/clp_s/Utils.hpp | 2 +- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 6515ebdd0..819ac7ef6 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -218,7 +218,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { )( "auth", po::value(&auth) - ->value_name("AUTH_TYPE") + ->value_name("AUTH_METHOD") ->default_value(auth), "Type of authentication required for network requests (s3 | none). Authentication" " with s3 requires the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment" @@ -365,7 +365,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { )( "auth", po::value(&auth) - ->value_name("AUTH_TYPE") + ->value_name("AUTH_METHOD") ->default_value(auth), "Type of authentication required for network requests (s3 | none). Authentication" " with s3 requires the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment" @@ -535,7 +535,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { )( "auth", po::value(&auth) - ->value_name("AUTH_TYPE") + ->value_name("AUTH_METHOD") ->default_value(auth), "Type of authentication required for network requests (s3 | none). Authentication" " with s3 requires the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment" diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp index fc054dd9d..76bea3cf8 100644 --- a/components/core/src/clp_s/JsonFileIterator.hpp +++ b/components/core/src/clp_s/JsonFileIterator.hpp @@ -10,7 +10,7 @@ namespace clp_s { class JsonFileIterator { public: /** - * An iterator over a file containing json objects. JSON is parsed + * An iterator over an input stream containing json objects. JSON is parsed * using simdjson::parse_many. This allows simdjson to efficiently find * delimeters between JSON objects, and if enabled parse JSON ahead of time * in another thread while the JSON is being iterated over. @@ -18,7 +18,7 @@ class JsonFileIterator { * The buffer grows automatically if there are JSON objects larger than the buffer size. * The buffer is padded to be SIMDJSON_PADDING bytes larger than the specified size. - * @param file_name the file containing JSON + * @param reader the input stream containing JSON * @param max_document_size the maximum allowed size of a single document * @param buf_size the initial buffer size */ diff --git a/components/core/src/clp_s/ReaderUtils.hpp b/components/core/src/clp_s/ReaderUtils.hpp index 68d1c6e2c..4661b5fae 100644 --- a/components/core/src/clp_s/ReaderUtils.hpp +++ b/components/core/src/clp_s/ReaderUtils.hpp @@ -70,6 +70,12 @@ class ReaderUtils { std::string const& archive_path ); + /** + * Tries to open a clp::ReaderInterface using the given Path and NetworkAuthOption. + * @param path + * @param network_auth + * @return the opened clp::ReaderInterface or nullptr on error + */ static std::shared_ptr try_create_reader(Path const& path, NetworkAuthOption const& network_auth); diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index 0a5886819..0181a6749 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -12,7 +12,7 @@ namespace clp_s { class FileUtils { public: /** - * Find all files in a directory + * Finds all files in a directory * @param path * @param file_paths * @return true if successful, false otherwise From 0b2c41a418e6dbe2f3e53e07e4e3fea923eb129c Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Tue, 31 Dec 2024 01:01:12 +0000 Subject: [PATCH 25/26] Deduplicate some validation code in CommandLineArguments.cpp --- .../core/src/clp_s/CommandLineArguments.cpp | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 8f3dd500b..5b6673f04 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -16,6 +17,10 @@ namespace po = boost::program_options; namespace clp_s { namespace { +// Authorization method constants +constexpr std::string_view cNoAuth{"none"}; +constexpr std::string_view cS3Auth{"s3"}; + /** * Read a list of newline-delimited paths from a file and put them into a vector passed by reference * TODO: deduplicate this code with the version in clp @@ -57,6 +62,20 @@ bool read_paths_from_file( } return true; } + +/** + * Validates and populates network authorization options. + * @param auth_method + * @param network_auth + * @throws std::invalid_argument if the authorization option is invalid + */ +void validate_network_auth(std::string_view auth_method, NetworkAuthOption& auth) { + if (cS3Auth == auth_method) { + auth.method = AuthMethod::S3PresignedUrlV4; + } else if (cNoAuth != auth_method) { + throw std::invalid_argument(fmt::format("Invalid authentication type \"{}\"", auth_method)); + } +} } // namespace CommandLineArguments::ParsingResult @@ -133,8 +152,6 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { throw std::invalid_argument(std::string("Unknown action '") + command_input + "'"); } - constexpr std::string_view cNoAuth{"none"}; - constexpr std::string_view cS3Auth{"s3"}; if (Command::Compress == m_command) { po::options_description compression_positional_options; std::vector input_paths; @@ -306,12 +323,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { throw std::invalid_argument("Unknown FILE_TYPE: " + file_type); } - if (cS3Auth == auth) { - m_network_auth.method = AuthMethod::S3PresignedUrlV4; - } else if (cNoAuth != auth) { - throw std::invalid_argument(fmt::format("Invalid authentication type \"{}\"", auth) - ); - } + validate_network_auth(auth, m_network_auth); // Parse and validate global metadata DB config if (false == metadata_db_config_file_path.empty()) { @@ -452,12 +464,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { throw std::invalid_argument("No archive paths specified"); } - if (cS3Auth == auth) { - m_network_auth.method = AuthMethod::S3PresignedUrlV4; - } else if (cNoAuth != auth) { - throw std::invalid_argument(fmt::format("Invalid authentication type \"{}\"", auth) - ); - } + validate_network_auth(auth, m_network_auth); if (m_output_dir.empty()) { throw std::invalid_argument("No output directory specified"); @@ -720,12 +727,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { throw std::invalid_argument("No archive paths specified"); } - if (cS3Auth == auth) { - m_network_auth.method = AuthMethod::S3PresignedUrlV4; - } else if (cNoAuth != auth) { - throw std::invalid_argument(fmt::format("Invalid authentication type \"{}\"", auth) - ); - } + validate_network_auth(auth, m_network_auth); if (m_query.empty()) { throw std::invalid_argument("No query specified"); From 93a1696bf39682dd43a27f592c65a33a4ef1f90f Mon Sep 17 00:00:00 2001 From: gibber9809 Date: Fri, 3 Jan 2025 15:31:15 +0000 Subject: [PATCH 26/26] Address rabbit comments and fix macos build --- components/core/src/clp_s/ArchiveReader.cpp | 3 +- components/core/src/clp_s/ArchiveWriter.cpp | 5 +- .../core/src/clp_s/CommandLineArguments.cpp | 77 +++++++++---------- components/core/src/clp_s/InputConfig.cpp | 6 +- .../core/src/clp_s/JsonFileIterator.hpp | 1 - components/core/src/clp_s/JsonParser.cpp | 4 +- components/core/src/clp_s/Utils.cpp | 1 - .../core/tests/test-clp_s-end_to_end.cpp | 7 +- 8 files changed, 51 insertions(+), 53 deletions(-) diff --git a/components/core/src/clp_s/ArchiveReader.cpp b/components/core/src/clp_s/ArchiveReader.cpp index 487beca34..738e8e645 100644 --- a/components/core/src/clp_s/ArchiveReader.cpp +++ b/components/core/src/clp_s/ArchiveReader.cpp @@ -208,8 +208,9 @@ BaseColumnReader* ArchiveReader::append_reader_column(SchemaReader& reader, int3 column_reader = new DateStringColumnReader(column_id, m_timestamp_dict); break; // No need to push columns without associated object readers into the SchemaReader. - case NodeType::Object: + case NodeType::Metadata: case NodeType::NullValue: + case NodeType::Object: case NodeType::StructuredArray: case NodeType::Unknown: break; diff --git a/components/core/src/clp_s/ArchiveWriter.cpp b/components/core/src/clp_s/ArchiveWriter.cpp index d627479de..2a60013a9 100644 --- a/components/core/src/clp_s/ArchiveWriter.cpp +++ b/components/core/src/clp_s/ArchiveWriter.cpp @@ -270,9 +270,10 @@ void ArchiveWriter::initialize_schema_writer(SchemaWriter* writer, Schema const& case NodeType::DateString: writer->append_column(new DateStringColumnWriter(id)); break; - case NodeType::StructuredArray: - case NodeType::Object: + case NodeType::Metadata: case NodeType::NullValue: + case NodeType::Object: + case NodeType::StructuredArray: case NodeType::Unknown: break; } diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 5b6673f04..e6cd07163 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -76,6 +76,41 @@ void validate_network_auth(std::string_view auth_method, NetworkAuthOption& auth throw std::invalid_argument(fmt::format("Invalid authentication type \"{}\"", auth_method)); } } + +/** + * Validates and populates archive paths. + * @param archive_path + * @param archive_id + * @param archive_paths + * @throws std::invalid_argument on any error + */ +void validate_archive_paths( + std::string_view archive_path, + std::string_view archive_id, + std::vector& archive_paths +) { + if (archive_path.empty()) { + throw std::invalid_argument("No archive path specified"); + } + + if (false == archive_id.empty()) { + auto archive_fs_path = std::filesystem::path(archive_path) / archive_id; + std::error_code ec; + if (false == std::filesystem::exists(archive_fs_path, ec) || ec) { + throw std::invalid_argument("Requested archive does not exist"); + } + archive_paths.emplace_back(clp_s::Path{ + .source = clp_s::InputSource::Filesystem, + .path = archive_fs_path.string() + }); + } else if (false == get_input_archives_for_raw_path(archive_path, archive_paths)) { + throw std::invalid_argument("Invalid archive path"); + } + + if (archive_paths.empty()) { + throw std::invalid_argument("No archive paths specified"); + } +} } // namespace CommandLineArguments::ParsingResult @@ -443,26 +478,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { return ParsingResult::InfoCommand; } - if (archive_path.empty()) { - throw std::invalid_argument("No archive path specified"); - } - - if (false == archive_id.empty()) { - auto archive_fs_path = std::filesystem::path(archive_path) / archive_id; - if (false == std::filesystem::exists(archive_fs_path)) { - throw std::invalid_argument("Requested archive does not exist"); - } - m_input_paths.emplace_back(clp_s::Path{ - .source{clp_s::InputSource::Filesystem}, - .path{archive_fs_path.string()} - }); - } else if (false == get_input_archives_for_raw_path(archive_path, m_input_paths)) { - throw std::invalid_argument("Invalid archive path"); - } - - if (m_input_paths.empty()) { - throw std::invalid_argument("No archive paths specified"); - } + validate_archive_paths(archive_path, archive_id, m_input_paths); validate_network_auth(auth, m_network_auth); @@ -706,26 +722,7 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { return ParsingResult::InfoCommand; } - if (archive_path.empty()) { - throw std::invalid_argument("No archive path specified"); - } - - if (false == archive_id.empty()) { - auto archive_fs_path = std::filesystem::path(archive_path) / archive_id; - if (false == std::filesystem::exists(archive_fs_path)) { - throw std::invalid_argument("Requested archive does not exist"); - } - m_input_paths.emplace_back(clp_s::Path{ - .source{clp_s::InputSource::Filesystem}, - .path{archive_fs_path.string()} - }); - } else if (false == get_input_archives_for_raw_path(archive_path, m_input_paths)) { - throw std::invalid_argument("Invalid archive path"); - } - - if (m_input_paths.empty()) { - throw std::invalid_argument("No archive paths specified"); - } + validate_archive_paths(archive_path, archive_id, m_input_paths); validate_network_auth(auth, m_network_auth); diff --git a/components/core/src/clp_s/InputConfig.cpp b/components/core/src/clp_s/InputConfig.cpp index 260caefd4..83c19a5c3 100644 --- a/components/core/src/clp_s/InputConfig.cpp +++ b/components/core/src/clp_s/InputConfig.cpp @@ -17,7 +17,7 @@ auto get_source_for_path(std::string_view const path) -> InputSource { } auto get_path_object_for_raw_path(std::string_view const path) -> Path { - return Path{.source{get_source_for_path(path)}, .path{std::string{path}}}; + return Path{.source = get_source_for_path(path), .path = std::string{path}}; } auto get_input_files_for_raw_path(std::string_view const path, std::vector& files) -> bool { @@ -41,7 +41,7 @@ auto get_input_files_for_path(Path const& path, std::vector& files) -> boo } for (auto& file : file_paths) { - files.emplace_back(Path{.source{InputSource::Filesystem}, .path{std::move(file)}}); + files.emplace_back(Path{.source = InputSource::Filesystem, .path = std::move(file)}); } return true; } @@ -68,7 +68,7 @@ auto get_input_archives_for_path(Path const& path, std::vector& archives) } for (auto& archive : archive_paths) { - archives.emplace_back(Path{.source{InputSource::Filesystem}, .path{std::move(archive)}}); + archives.emplace_back(Path{.source = InputSource::Filesystem, .path = std::move(archive)}); } return true; } diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp index 76bea3cf8..5464d56df 100644 --- a/components/core/src/clp_s/JsonFileIterator.hpp +++ b/components/core/src/clp_s/JsonFileIterator.hpp @@ -4,7 +4,6 @@ #include #include "../clp/ReaderInterface.hpp" -#include "FileReader.hpp" namespace clp_s { class JsonFileIterator { diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index e8f56c3fd..21e3b0cfd 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -618,8 +618,8 @@ bool JsonParser::parse() { SPDLOG_ERROR( "Encountered curl error while ingesting {} - Code: {} - Message: {}", path.path, - rc.value(), - curl_error_message.value_or("Unkown error") + static_cast(rc.value()), + curl_error_message.value_or("Unknown error") ); m_archive_writer->close(); return false; diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp index d01b1a2b9..19b564c03 100644 --- a/components/core/src/clp_s/Utils.cpp +++ b/components/core/src/clp_s/Utils.cpp @@ -9,7 +9,6 @@ #include #include "archive_constants.hpp" -#include "Utils.hpp" using std::string; using std::string_view; diff --git a/components/core/tests/test-clp_s-end_to_end.cpp b/components/core/tests/test-clp_s-end_to_end.cpp index a9e66aa1b..259b46b93 100644 --- a/components/core/tests/test-clp_s-end_to_end.cpp +++ b/components/core/tests/test-clp_s-end_to_end.cpp @@ -71,9 +71,10 @@ void compress(bool structurize_arrays) { REQUIRE((std::filesystem::is_directory(cTestEndToEndArchiveDirectory))); clp_s::JsonParserOption parser_option{}; - parser_option.input_paths.emplace_back( - clp_s::Path{.source{clp_s::InputSource::Filesystem}, .path{get_test_input_local_path()}} - ); + parser_option.input_paths.emplace_back(clp_s::Path{ + .source = clp_s::InputSource::Filesystem, + .path = get_test_input_local_path() + }); parser_option.archives_dir = cTestEndToEndArchiveDirectory; parser_option.target_encoded_size = cDefaultTargetEncodedSize; parser_option.max_document_size = cDefaultMaxDocumentSize;