From dceb5648e8a2df0ecc65cbe81a07f538f5538359 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 25 Nov 2024 09:53:02 -0500 Subject: [PATCH 01/65] Add lzma download and port lzma scripts --- components/core/.clang-format | 2 +- components/core/CMakeLists.txt | 24 +- .../clp/streaming_compression/Constants.hpp | 1 + .../streaming_compression/lzma/Compressor.cpp | 303 +++++++++++++++ .../streaming_compression/lzma/Compressor.hpp | 133 +++++++ .../streaming_compression/lzma/Constants.hpp | 15 + .../lzma/Decompressor.cpp | 362 ++++++++++++++++++ .../lzma/Decompressor.hpp | 162 ++++++++ .../core/tests/test-StreamingCompression.cpp | 1 + .../core/tools/scripts/lib_install/liblzma.sh | 66 ++++ .../install-packages-from-source.sh | 1 + .../ubuntu-focal/install-prebuilt-packages.sh | 1 + .../install-packages-from-source.sh | 1 + .../ubuntu-jammy/install-prebuilt-packages.sh | 1 + 14 files changed, 1071 insertions(+), 2 deletions(-) create mode 100644 components/core/src/clp/streaming_compression/lzma/Compressor.cpp create mode 100644 components/core/src/clp/streaming_compression/lzma/Compressor.hpp create mode 100644 components/core/src/clp/streaming_compression/lzma/Constants.hpp create mode 100644 components/core/src/clp/streaming_compression/lzma/Decompressor.cpp create mode 100644 components/core/src/clp/streaming_compression/lzma/Decompressor.hpp create mode 100755 components/core/tools/scripts/lib_install/liblzma.sh diff --git a/components/core/.clang-format b/components/core/.clang-format index ff65adbae..4d0d3a87c 100644 --- a/components/core/.clang-format +++ b/components/core/.clang-format @@ -4,7 +4,7 @@ IncludeCategories: # NOTE: A header is grouped by first matching regex # Library headers. Update when adding new libraries. # NOTE: clang-format retains leading white-space on a line in violation of the YAML spec. 
- - Regex: "<(absl|antlr4|archive|boost|bsoncxx|catch2|curl|date|fmt|json|log_surgeon|mongocxx\ + - Regex: "<(absl|antlr4|archive|boost|bsoncxx|catch2|curl|date|fmt|json|log_surgeon|lzma|mongocxx\ |msgpack|mysql|openssl|outcome|regex_utils|simdjson|spdlog|sqlite3|string_utils|yaml-cpp|zstd)" Priority: 3 # C system headers diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index e5c9b06c8..92bb6af19 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -11,13 +11,16 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # Set general compressor set(GENERAL_COMPRESSOR "zstd" CACHE STRING "The general-purpose compressor used as the 2nd-stage compressor") -set_property(CACHE GENERAL_COMPRESSOR PROPERTY STRINGS passthrough zstd) +set_property(CACHE GENERAL_COMPRESSOR PROPERTY STRINGS passthrough zstd lzma) if ("${GENERAL_COMPRESSOR}" STREQUAL "passthrough") add_definitions(-DUSE_PASSTHROUGH_COMPRESSION=1) message(STATUS "Using passthrough compression") elseif ("${GENERAL_COMPRESSOR}" STREQUAL "zstd") add_definitions(-DUSE_ZSTD_COMPRESSION=1) message(STATUS "Using Zstandard compression") +elseif ("${GENERAL_COMPRESSOR}" STREQUAL "lzma") + add_definitions(-DUSE_LZMA_COMPRESSION=1) + message(STATUS "Using Lempel–Ziv–Markov chain Algorithm compression") else() message(SEND_ERROR "GENERAL_COMPRESSOR=${GENERAL_COMPRESSOR} is unimplemented.") endif() @@ -224,6 +227,19 @@ else() message(FATAL_ERROR "Could not find ${CLP_LIBS_STRING} libraries for ZStd") endif() +# Find and setup LZMA Library +# Notice that we don't have support to switch between static and shared libraries. +# TODO: add a script in ./cmake/Modules to resolve .a vs. 
.so +find_package(LibLZMA REQUIRED) +if(LIBLZMA_FOUND) + message(STATUS "Found LIBLZMA_FOUND ${LIBLZMA_VERSION_STRING}") + message(STATUS "Lzma library location: ${LIBLZMA_LIBRARIES}") +else() + message(FATAL_ERROR "Could not find ${CLP_LIBS_STRING} libraries for LIBLZMA_FOUND") +endif() +include_directories(${LIBLZMA_INCLUDE_DIRS}) +message("LZMA Include Dir: ${LIBLZMA_INCLUDE_DIRS}") + # sqlite dependencies set(sqlite_DYNAMIC_LIBS "dl;m;pthread") include(cmake/Modules/FindLibraryDependencies.cmake) @@ -462,6 +478,11 @@ set(SOURCE_FILES_unitTest src/clp/streaming_compression/Compressor.hpp src/clp/streaming_compression/Constants.hpp src/clp/streaming_compression/Decompressor.hpp + src/clp/streaming_compression/lzma/Compressor.cpp + src/clp/streaming_compression/lzma/Compressor.hpp + src/clp/streaming_compression/lzma/Decompressor.cpp + src/clp/streaming_compression/lzma/Decompressor.hpp + src/clp/streaming_compression/lzma/Constants.hpp src/clp/streaming_compression/passthrough/Compressor.cpp src/clp/streaming_compression/passthrough/Compressor.hpp src/clp/streaming_compression/passthrough/Decompressor.cpp @@ -549,6 +570,7 @@ target_link_libraries(unitTest clp::regex_utils clp::string_utils yaml-cpp::yaml-cpp + ${LIBLZMA_LIBRARIES} ZStd::ZStd ) target_compile_features(unitTest diff --git a/components/core/src/clp/streaming_compression/Constants.hpp b/components/core/src/clp/streaming_compression/Constants.hpp index 4649c2e98..080f3a20b 100644 --- a/components/core/src/clp/streaming_compression/Constants.hpp +++ b/components/core/src/clp/streaming_compression/Constants.hpp @@ -7,6 +7,7 @@ namespace clp::streaming_compression { enum class CompressorType : uint8_t { ZSTD = 0x10, + LZMA = 0x20, Passthrough = 0xFF, }; } // namespace clp::streaming_compression diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp new file mode 100644 index 000000000..f10ec915b --- /dev/null +++ 
b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -0,0 +1,303 @@ +#include "Compressor.hpp" + +// spdlog +#include + +// Project headers +#include "../../Defs.h" + +// File-scope constants +static constexpr size_t cCompressedStreamBlockBufferSize = 4096; // 4KiB + +namespace streaming_compression::lzma { +Compressor::LzmaOption Compressor::m_option; + +Compressor::Compressor() + : ::streaming_compression::Compressor(CompressorType::LZMA), + m_compression_stream_contains_data(false), + m_compressed_stream_file_writer(nullptr), + m_compression_stream(nullptr) { + m_compressed_stream_block_buffer = std::make_unique(cCompressedStreamBlockBufferSize); + m_compression_stream = new lzma_stream; + memset(m_compression_stream, 0, sizeof(lzma_stream)); +} + +Compressor::~Compressor() { + if (nullptr != m_compression_stream) { + delete m_compression_stream; + } +} + +void Compressor::init_lzma_encoder(lzma_stream* strm) { + lzma_options_lzma options; + if (lzma_lzma_preset(&options, m_option.get_compression_level())) { + SPDLOG_ERROR("Failed to initialize LZMA options."); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + } + options.dict_size = m_option.get_dict_size(); + lzma_filter filters[2]{ + {LZMA_FILTER_LZMA2, &options}, + {LZMA_VLI_UNKNOWN, nullptr}, + }; + + // Initialize the encoder using a preset. Set the integrity to check + // to CRC64, which is the default in the xz command line tool. If + // the .xz file needs to be decompressed with XZ Embedded, use + // LZMA_CHECK_CRC32 instead. + lzma_ret ret = lzma_stream_encoder(strm, filters, LZMA_CHECK_CRC64); + + // Return successfully if the initialization went fine. + if (ret == LZMA_OK) { + return; + } + + // Something went wrong. The possible errors are documented in + // lzma/container.h (src/liblzma/api/lzma/container.h in the source + // package or e.g. /usr/include/lzma/container.h depending on the + // install prefix). 
+ char const* msg; + switch (ret) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_OPTIONS_ERROR: + msg = "Specified preset is not supported"; + break; + + case LZMA_UNSUPPORTED_CHECK: + msg = "Specified integrity check is not supported"; + break; + + default: + // This is most likely LZMA_PROG_ERROR indicating a bug in + // this program or in liblzma. It is inconvenient to have a + // separate error message for errors that should be impossible + // to occur, but knowing the error code is important for + // debugging. That's why it is good to print the error code + // at least when there is no good error message to show. + msg = "Unknown error, possibly a bug"; + break; + } + + SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, int(ret)); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); +} + +void Compressor::open(FileWriter& file_writer, int compression_level) { + if (nullptr != m_compressed_stream_file_writer) { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } + + if (false == (0 <= compression_level && compression_level <= 9)) { + throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); + } + if (compression_level != m_option.get_compression_level()) { + m_option.set_compression_level(compression_level); + } + + init_lzma_encoder(m_compression_stream); + // Setup compressed stream parameters + m_compression_stream->next_in = nullptr; + m_compression_stream->avail_in = 0; + m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + + m_compressed_stream_file_writer = &file_writer; + + m_uncompressed_stream_pos = 0; +} + +void Compressor::close() { + if (nullptr == m_compressed_stream_file_writer) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + flush_and_close_compression_stream(); + m_compressed_stream_file_writer = nullptr; +} + +void 
Compressor::write(char const* data, size_t data_length) { + if (nullptr == m_compressed_stream_file_writer) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + if (0 == data_length) { + // Nothing needs to be done because we do not need to compress anything + return; + } + if (nullptr == data) { + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + } + lzma_action action = LZMA_RUN; + m_compression_stream->next_in = reinterpret_cast(const_cast(data)); + m_compression_stream->avail_in = data_length; + + // Compress all data + bool hit_input_eof = false; + while (!hit_input_eof) { + lzma_ret return_value = lzma_code(m_compression_stream, action); + switch (return_value) { + case LZMA_OK: + case LZMA_BUF_ERROR: + break; + case LZMA_STREAM_END: + hit_input_eof = true; + break; + default: + SPDLOG_ERROR("lzma() returned an unexpected value - {}.", int(return_value)); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + + if (0 == m_compression_stream->avail_in) { + // No more data to compress + break; + } + + // Write output buffer to file if it's full + if (0 == m_compression_stream->avail_out) { + m_compressed_stream_file_writer->write( + reinterpret_cast(m_compressed_stream_block_buffer.get()), + cCompressedStreamBlockBufferSize + ); + m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + } + } + + // Write any compressed data + if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { + m_compressed_stream_file_writer->write( + reinterpret_cast(m_compressed_stream_block_buffer.get()), + cCompressedStreamBlockBufferSize - m_compression_stream->avail_out + ); + m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + } + + m_compression_stream->next_in = nullptr; + + m_compression_stream_contains_data = true; + 
m_uncompressed_stream_pos += data_length; +} + +void Compressor::flush() { + if (false == m_compression_stream_contains_data) { + return; + } + // Z_NO_FLUSH - deflate decides how much data to accumulate before producing output + // Z_SYNC_FLUSH - All pending output flushed to output buf and output aligned to byte + // boundary (completes current block and follows it with empty block that is 3 bits plus + // filler to next byte, followed by 4 bytes Z_PARTIAL_FLUSH - Same as Z_SYNC_FLUSH but + // output not aligned to byte boundary (completes current block and follows it with empty + // fixed codes block that is 10 bits long) Z_BLOCK - Same as Z_SYNC_FLUSH but output not + // aligned on a byte boundary and up to 7 bits of current block held to be written + // Z_FULL_FLUSH - Same as Z_SYNC_FLUSH but compression state reset so that decompression can + // restart from this point if the previous compressed data has been damaged Z_FINISH - + // Pending output flushed and deflate returns Z_STREAM_END if there was enough output space, + // or Z_OK or Z_BUF_ERROR if it needs to be called again with more space + // + + bool flush_complete = false; + while (true) { + lzma_ret return_value = lzma_code(m_compression_stream, LZMA_SYNC_FLUSH); + switch (return_value) { + case LZMA_STREAM_END: + flush_complete = true; + break; + case LZMA_OK: + case LZMA_BUF_ERROR: + break; + default: + SPDLOG_ERROR("lzma() returned an unexpected value - {}.", int(return_value)); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + if (flush_complete) { + break; + } + + // Write output buffer to file if it's full + if (0 == m_compression_stream->avail_out) { + m_compressed_stream_file_writer->write( + reinterpret_cast(m_compressed_stream_block_buffer.get()), + cCompressedStreamBlockBufferSize + ); + m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + } + } + + // Write any compressed 
data + if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { + m_compressed_stream_file_writer->write( + reinterpret_cast(m_compressed_stream_block_buffer.get()), + cCompressedStreamBlockBufferSize - m_compression_stream->avail_out + ); + m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + } + + m_compression_stream_contains_data = false; +} + +ErrorCode Compressor::try_get_pos(size_t& pos) const { + if (nullptr == m_compressed_stream_file_writer) { + return ErrorCode_NotInit; + } + + pos = m_uncompressed_stream_pos; + return ErrorCode_Success; +} + +void Compressor::flush_and_close_compression_stream() { + if (nullptr == m_compressed_stream_file_writer) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + bool flush_complete = false; + while (true) { + lzma_ret return_value = lzma_code(m_compression_stream, LZMA_FINISH); + switch (return_value) { + case LZMA_OK: + case LZMA_BUF_ERROR: + break; + case LZMA_STREAM_END: + flush_complete = true; + break; + default: + // SPDLOG_ERROR("deflate() returned an unexpected value - + // {}.", return_value); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + if (flush_complete) { + break; + } + + // Write output buffer to file if it's full + if (0 == m_compression_stream->avail_out) { + m_compressed_stream_file_writer->write( + reinterpret_cast(m_compressed_stream_block_buffer.get()), + cCompressedStreamBlockBufferSize + ); + m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + } + } + + // Write any compressed data + if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { + m_compressed_stream_file_writer->write( + reinterpret_cast(m_compressed_stream_block_buffer.get()), + cCompressedStreamBlockBufferSize - m_compression_stream->avail_out + ); + 
m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + } + + m_compression_stream_contains_data = false; + + lzma_end(m_compression_stream); + m_compression_stream->avail_out = 0; + m_compression_stream->next_out = nullptr; +} +} // namespace streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp new file mode 100644 index 000000000..d31c7687e --- /dev/null +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -0,0 +1,133 @@ +#ifndef STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP +#define STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP + +// C++ standard libraries +#include +#include + +// ZLIB library +#include +#include + +// Project headers +#include "../../FileWriter.hpp" +#include "../../TraceableException.hpp" +#include "../Compressor.hpp" +#include "Constants.hpp" + +namespace streaming_compression::lzma { +class Compressor : public ::streaming_compression::Compressor { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + + // Methods + char const* what() const noexcept override { + return "streaming_compression::gzip::Compressor operation failed"; + } + }; + + class LzmaOption { + public: + LzmaOption() + : m_compression_level{cDefaultCompressionLevel}, + m_dict_size{cDefaultDictionarySize} {} + + auto set_compression_level(int compression_level) -> void { + if (0 > compression_level) { + m_compression_level = 0; + } else if (9 < compression_level) { + m_compression_level = 9; + } else { + m_compression_level = compression_level; + } + } + + auto set_dict_size(uint32_t dict_size) -> void { m_dict_size = dict_size; } + + [[nodiscard]] auto 
get_compression_level() const -> int { return m_compression_level; } + + [[nodiscard]] auto get_dict_size() const -> uint32_t { return m_dict_size; } + + private: + int m_compression_level; + uint32_t m_dict_size; + }; + + // Constructor + Compressor(); + + // Destructor + ~Compressor(); + + // Explicitly disable copy and move constructor/assignment + Compressor(Compressor const&) = delete; + Compressor& operator=(Compressor const&) = delete; + + // Methods implementing the WriterInterface + /** + * Writes the given data to the compressor + * @param data + * @param data_length + */ + void write(char const* data, size_t data_length) override; + /** + * Writes any internally buffered data to file and ends the current frame + */ + void flush() override; + + /** + * Tries to get the current position of the write head + * @param pos Position of the write head + * @return ErrorCode_NotInit if the compressor is not open + * @return ErrorCode_Success on success + */ + ErrorCode try_get_pos(size_t& pos) const override; + + // Methods implementing the Compressor interface + /** + * Initialize streaming compressor + * @param file_writer + * @param compression_level + */ + void open(FileWriter& file_writer, int compression_level) override; + + /** + * Closes the compressor + */ + void close() override; + + // Methods + static auto set_compression_level(int compression_level) -> void { + m_option.set_compression_level(compression_level); + } + + static auto set_dict_size(uint32_t dict_size) -> void { m_option.set_dict_size(dict_size); } + +private: + /** + * Flushes the stream and closes it + */ + void flush_and_close_compression_stream(); + + static void init_lzma_encoder(lzma_stream* strm); + static LzmaOption m_option; + + // Variables + FileWriter* m_compressed_stream_file_writer; + + // Compressed stream variables + lzma_stream* m_compression_stream; + bool m_compression_stream_contains_data; + + std::unique_ptr m_compressed_stream_block_buffer; + + size_t 
m_uncompressed_stream_pos; +}; +} // namespace streaming_compression::lzma + +#endif // STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP diff --git a/components/core/src/clp/streaming_compression/lzma/Constants.hpp b/components/core/src/clp/streaming_compression/lzma/Constants.hpp new file mode 100644 index 000000000..959c09f47 --- /dev/null +++ b/components/core/src/clp/streaming_compression/lzma/Constants.hpp @@ -0,0 +1,15 @@ +#ifndef STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP +#define STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP + +#include + +// C++ libraries +#include +#include + +namespace streaming_compression::lzma { +constexpr int cDefaultCompressionLevel{3}; +constexpr uint32_t cDefaultDictionarySize{LZMA_DICT_SIZE_DEFAULT}; +} // namespace streaming_compression::lzma + +#endif // STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP diff --git a/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp b/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp new file mode 100644 index 000000000..a2ed4d466 --- /dev/null +++ b/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp @@ -0,0 +1,362 @@ +#include "Decompressor.hpp" + +// C++ Standard Libraries +#include + +// Boost libraries +#include + +// spdlog +#include + +// Project headers +#include "../../Defs.h" + +namespace streaming_compression::lzma { +Decompressor::Decompressor() + : ::streaming_compression::Decompressor(CompressorType::LZMA), + m_input_type(InputType::NotInitialized), + m_decompression_stream(nullptr), + m_file_reader(nullptr), + m_file_reader_initial_pos(0), + m_file_read_buffer_length(0), + m_file_read_buffer_capacity(0), + m_decompressed_stream_pos(0), + m_unused_decompressed_stream_block_size(0) { + // Create block to hold unused decompressed data + m_unused_decompressed_stream_block_buffer + = std::make_unique(m_unused_decompressed_stream_block_size); + m_decompression_stream = new lzma_stream; + memset(m_decompression_stream, 0, sizeof(lzma_stream)); +} + 
+Decompressor::~Decompressor() { + delete m_decompression_stream; +} + +void Decompressor::exact_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { + auto errorcode = try_read(buf, num_bytes_to_read, num_bytes_read); + if (num_bytes_read != num_bytes_to_read) { + SPDLOG_ERROR("FAILED TO READ EXACTLY {} bytes", num_bytes_to_read); + throw; + } + if (errorcode != ErrorCode_Success) { + SPDLOG_ERROR("FAILED TO READ EXACTLY {} bytes", num_bytes_to_read); + throw; + } +} + +ErrorCode Decompressor::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { + if (InputType::NotInitialized == m_input_type) { + return ErrorCode_NotInit; + } + if (nullptr == buf) { + return ErrorCode_BadParam; + } + if (0 == num_bytes_to_read) { + return ErrorCode_Success; + } + + num_bytes_read = 0; + + m_decompression_stream->next_out = reinterpret_cast(buf); + m_decompression_stream->avail_out = num_bytes_to_read; + while (true) { + // Check if there's data that can be decompressed + if (0 == m_decompression_stream->avail_in) { + if (InputType::File != m_input_type) { + // if we hit here, there must be something wrong + // we have consumed all data buffer but for some reason it still requires more. 
+ return ErrorCode_EndOfFile; + } else { + auto error_code = m_file_reader->try_read( + m_file_read_buffer.get(), + m_file_read_buffer_capacity, + m_file_read_buffer_length + ); + m_decompression_stream->avail_in = m_file_read_buffer_length; + m_decompression_stream->next_in + = reinterpret_cast(m_file_read_buffer.get()); + if (ErrorCode_Success != error_code) { + if (ErrorCode_EndOfFile == error_code) { + num_bytes_read = num_bytes_to_read - m_decompression_stream->avail_out; + m_decompressed_stream_pos += num_bytes_read; + return ErrorCode_EndOfFile; + } + } + } + } + + lzma_ret return_value = lzma_code(m_decompression_stream, LZMA_RUN); + switch (return_value) { + case LZMA_OK: + case LZMA_BUF_ERROR: + if (0 == m_decompression_stream->avail_out) { + m_decompression_stream->next_out = nullptr; + num_bytes_read = num_bytes_to_read; + m_decompressed_stream_pos += num_bytes_read; + return ErrorCode_Success; + } + // by breaking here, enter the next iteration of decompressing + break; + case LZMA_STREAM_END: + if (0 == m_decompression_stream->avail_out) { + m_decompression_stream->next_out = nullptr; + num_bytes_read = num_bytes_to_read; + m_decompressed_stream_pos += num_bytes_read; + return ErrorCode_Success; + } + SPDLOG_ERROR("streaming_compression::lzma::Decompressor wants to read more but " + "reached end of file"); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + case LZMA_MEM_ERROR: + SPDLOG_ERROR("streaming_compression::lzma::Decompressor inflate() ran out of memory" + ); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + default: + SPDLOG_ERROR("inflate() returned an unexpected value - {}.", int(return_value)); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + } +} + +ErrorCode Decompressor::try_seek_from_begin(size_t pos) { + if (InputType::NotInitialized == m_input_type) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + // Check if we've already decompressed passed 
the desired position + if (m_decompressed_stream_pos > pos) { + // ZStd has no way for us to seek back to the desired position, so just reset the stream + // to the beginning + reset_stream(); + } + + // We need to fast-forward the decompression stream to decompressed_stream_pos + ErrorCode error; + while (m_decompressed_stream_pos < pos) { + size_t num_bytes_to_decompress = std::min( + m_unused_decompressed_stream_block_size, + pos - m_decompressed_stream_pos + ); + error = try_read_exact_length( + m_unused_decompressed_stream_block_buffer.get(), + num_bytes_to_decompress + ); + if (ErrorCode_Success != error) { + return error; + } + } + + return ErrorCode_Success; +} + +ErrorCode Decompressor::try_get_pos(size_t& pos) { + if (InputType::NotInitialized == m_input_type) { + return ErrorCode_NotInit; + } + + pos = m_decompressed_stream_pos; + return ErrorCode_Success; +} + +void Decompressor::close() { + if (InputType::NotInitialized == m_input_type) { + return; + } + lzma_end(m_decompression_stream); + m_decompression_stream->avail_out = 0; + m_decompression_stream->next_out = nullptr; + if (InputType::MemoryMappedCompressedFile == m_input_type) { + if (m_memory_mapped_compressed_file.is_open()) { + // An existing file is memory mapped by the decompressor + m_memory_mapped_compressed_file.close(); + } + } else if (InputType::File == m_input_type) { + m_file_read_buffer.reset(); + m_file_read_buffer_capacity = 0; + m_file_read_buffer_length = 0; + m_file_reader = nullptr; + } + m_input_type = InputType::NotInitialized; +} + +void Decompressor::init_decoder(lzma_stream* strm) { + // Initialize a .xz decoder. The decoder supports a memory usage limit + // and a set of flags. + // + // The memory usage of the decompressor depends on the settings used + // to compress a .xz file. 
It can vary from less than a megabyte to + // a few gigabytes, but in practice (at least for now) it rarely + // exceeds 65 MiB because that's how much memory is required to + // decompress files created with "xz -9". Settings requiring more + // memory take extra effort to use and don't (at least for now) + // provide significantly better compression in most cases. + // + // Memory usage limit is useful if it is important that the + // decompressor won't consume gigabytes of memory. The need + // for limiting depends on the application. In this example, + // no memory usage limiting is used. This is done by setting + // the limit to UINT64_MAX. + // + // The .xz format allows concatenating compressed files as is: + // + // echo foo | xz > foobar.xz + // echo bar | xz >> foobar.xz + // + // When decompressing normal standalone .xz files, LZMA_CONCATENATED + // should always be used to support decompression of concatenated + // .xz files. If LZMA_CONCATENATED isn't used, the decoder will stop + // after the first .xz stream. This can be useful when .xz data has + // been embedded inside another file format. + // + // Flags other than LZMA_CONCATENATED are supported too, and can + // be combined with bitwise-or. See lzma/container.h + // (src/liblzma/api/lzma/container.h in the source package or e.g. + // /usr/include/lzma/container.h depending on the install prefix) + // for details. + lzma_ret ret = lzma_stream_decoder(strm, UINT64_MAX, LZMA_CONCATENATED); + + // Return successfully if the initialization went fine. + if (ret == LZMA_OK) { + return; + } + + // Something went wrong. The possible errors are documented in + // lzma/container.h (src/liblzma/api/lzma/container.h in the source + // package or e.g. /usr/include/lzma/container.h depending on the + // install prefix). + // + // Note that LZMA_MEMLIMIT_ERROR is never possible here. 
If you + // specify a very tiny limit, the error will be delayed until + // the first headers have been parsed by a call to lzma_code(). + char const* msg; + switch (ret) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_OPTIONS_ERROR: + msg = "Unsupported decompressor flags"; + break; + + default: + // This is most likely LZMA_PROG_ERROR indicating a bug in + // this program or in liblzma. It is inconvenient to have a + // separate error message for errors that should be impossible + // to occur, but knowing the error code is important for + // debugging. That's why it is good to print the error code + // at least when there is no good error message to show. + msg = "Unknown error, possibly a bug"; + break; + } + + SPDLOG_ERROR("Error initializing the decoder: {} (error code {})", msg, int(ret)); +} + +void Decompressor::open(char const* compressed_data_buf, size_t compressed_data_buf_size) { + if (InputType::NotInitialized != m_input_type) { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } + m_input_type = InputType::CompressedDataBuf; + + // Configure input stream + reset_stream(); + m_decompression_stream->next_in + = reinterpret_cast(const_cast(compressed_data_buf)); + m_decompression_stream->avail_in = compressed_data_buf_size; + m_decompression_stream->next_out = nullptr; + m_decompression_stream->avail_out = 0; +} + +ErrorCode Decompressor::open(std::string const& compressed_file_path) { + if (InputType::NotInitialized != m_input_type) { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } + m_input_type = InputType::MemoryMappedCompressedFile; + + // Create memory mapping for compressed_file_path, use boost read only memory mapped file + boost::system::error_code boost_error_code; + size_t compressed_file_size + = boost::filesystem::file_size(compressed_file_path, boost_error_code); + if (boost_error_code) { + SPDLOG_ERROR( + "streaming_compression::zstd::Decompressor: Unable to 
obtain file size for " + "'{}' - {}.", + compressed_file_path.c_str(), + boost_error_code.message().c_str() + ); + return ErrorCode_Failure; + } + + boost::iostreams::mapped_file_params memory_map_params; + memory_map_params.path = compressed_file_path; + memory_map_params.flags = boost::iostreams::mapped_file::readonly; + memory_map_params.length = compressed_file_size; + memory_map_params.hint = m_memory_mapped_compressed_file.data( + ); // Try to map it to the same memory location as previous memory mapped file + m_memory_mapped_compressed_file.open(memory_map_params); + if (!m_memory_mapped_compressed_file.is_open()) { + SPDLOG_ERROR( + "streaming_compression::lzma::Decompressor: Unable to memory map the " + "compressed file with path: {}", + compressed_file_path.c_str() + ); + return ErrorCode_Failure; + } + + // Configure input stream + reset_stream(); + m_decompression_stream->next_in + = reinterpret_cast(const_cast(m_memory_mapped_compressed_file.data())); + m_decompression_stream->avail_in = compressed_file_size; + m_decompression_stream->next_out = nullptr; + m_decompression_stream->avail_out = 0; + + return ErrorCode_Success; +} + +void Decompressor::open(FileReader& file_reader, size_t file_read_buffer_capacity) { + if (InputType::NotInitialized != m_input_type) { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } + m_input_type = InputType::File; + + m_file_reader = &file_reader; + m_file_reader_initial_pos = m_file_reader->get_pos(); + + m_file_read_buffer_capacity = file_read_buffer_capacity; + m_file_read_buffer = std::make_unique(m_file_read_buffer_capacity); + m_file_read_buffer_length = 0; + + // Configure input stream + reset_stream(); + m_decompression_stream->next_in = reinterpret_cast(m_file_read_buffer.get()); + m_decompression_stream->avail_in = m_file_read_buffer_length; + m_decompression_stream->next_out = nullptr; + m_decompression_stream->avail_out = 0; +} + +ErrorCode Decompressor::get_decompressed_stream_region( 
+ size_t decompressed_stream_pos, + char* extraction_buf, + size_t extraction_len +) { + auto error_code = try_seek_from_begin(decompressed_stream_pos); + if (ErrorCode_Success != error_code) { + return error_code; + } + + error_code = try_read_exact_length(extraction_buf, extraction_len); + return error_code; +} + +void Decompressor::reset_stream() { + if (InputType::File == m_input_type) { + m_file_reader->seek_from_begin(m_file_reader_initial_pos); + m_file_read_buffer_length = 0; + } + m_decompressed_stream_pos = 0; + init_decoder(m_decompression_stream); +} +} // namespace streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp b/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp new file mode 100644 index 000000000..996663e44 --- /dev/null +++ b/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp @@ -0,0 +1,162 @@ +#ifndef STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP +#define STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP + +// C++ standard libraries +#include +#include + +// ZLIB library +#include +#include +// Boost libraries +#include + +// Project headers +#include "../../FileReader.hpp" +#include "../../TraceableException.hpp" +#include "../Decompressor.hpp" + +namespace streaming_compression::lzma { +class Decompressor : public ::streaming_compression::Decompressor { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + + // Methods + char const* what() const noexcept override { + return "streaming_compression::lzma::Decompressor operation failed"; + } + }; + + // Constructor + Decompressor(); + + // Destructor + ~Decompressor(); + + // Explicitly disable copy and move constructor/assignment + Decompressor(Decompressor const&) = delete; + Decompressor& operator=(Decompressor 
const&) = delete; + + // Methods implementing the ReaderInterface + /** + * Tries to read up to a given number of bytes from the decompressor + * @param buf + * @param num_bytes_to_read The number of bytes to try and read + * @param num_bytes_read The actual number of bytes read + * @return Same as FileReader::try_read if the decompressor is attached to a file + * @return ErrorCode_NotInit if the decompressor is not open + * @return ErrorCode_BadParam if buf is invalid + * @return ErrorCode_EndOfFile on EOF + * @return ErrorCode_Failure on decompression failure + * @return ErrorCode_Success on success + */ + ErrorCode try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; + + /** + */ + void exact_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read); + + /** + * Tries to seek from the beginning to the given position + * @param pos + * @return ErrorCode_NotInit if the decompressor is not open + * @return Same as ReaderInterface::try_read_exact_length + * @return ErrorCode_Success on success + */ + ErrorCode try_seek_from_begin(size_t pos) override; + /** + * Tries to get the current position of the read head + * @param pos Position of the read head in the file + * @return ErrorCode_NotInit if the decompressor is not open + * @return ErrorCode_Success on success + */ + ErrorCode try_get_pos(size_t& pos) override; + + // Methods implementing the Decompressor interface + void close() override; + /** + * Decompresses and copies the range of uncompressed data described by + * decompressed_stream_pos and extraction_len into extraction_buf + * @param decompressed_stream_pos + * @param extraction_buf + * @param extraction_len + * @return Same as streaming_compression::zstd::Decompressor::try_seek_from_begin + * @return Same as ReaderInterface::try_read_exact_length + */ + ErrorCode get_decompressed_stream_region( + size_t decompressed_stream_pos, + char* extraction_buf, + size_t extraction_len + ) override; + + // Methods + /*** + * 
Initialize streaming decompressor to decompress from the specified compressed data buffer + * @param compressed_data_buf + * @param compressed_data_buf_size + */ + void open(char const* compressed_data_buf, size_t compressed_data_buf_size) override; + + /*** + * Initialize streaming decompressor to decompress from a compressed file specified by the + * given path + * @param compressed_file_path + * @param decompressed_stream_block_size + * @return ErrorCode_Failure if the provided path cannot be memory mapped + * @return ErrorCode_Success on success + */ + ErrorCode open(std::string const& compressed_file_path); + + /** + * Initializes the decompressor to decompress from an open file + * @param file_reader + * @param file_read_buffer_capacity The maximum amount of data to read from a file at a time + */ + void open(FileReader& file_reader, size_t file_read_buffer_capacity) override; + +private: + // Enum class + enum class InputType { + NotInitialized, // Note: do nothing but generate an error to prevent this required + // parameter is not initialized properly + CompressedDataBuf, + MemoryMappedCompressedFile, + File + }; + + // Methods + /** + * Reset streaming decompression state so it will start decompressing from the beginning of + * the stream afterwards + */ + void reset_stream(); + + void init_decoder(lzma_stream* strm); + + // Variables + InputType m_input_type; + + // Compressed stream variables + lzma_stream* m_decompression_stream{nullptr}; + + boost::iostreams::mapped_file_source m_memory_mapped_compressed_file; + FileReader* m_file_reader; + size_t m_file_reader_initial_pos; + std::unique_ptr m_file_read_buffer; + size_t m_file_read_buffer_length; + size_t m_file_read_buffer_capacity; + + size_t m_decompressed_stream_pos; + size_t m_unused_decompressed_stream_block_size; + std::unique_ptr m_unused_decompressed_stream_block_buffer; + + char const* m_compressed_stream_block; + size_t m_compressed_stream_block_size; +}; +} // namespace 
streaming_compression::lzma +#endif // STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 0fbae9e3a..d632510fc 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -15,6 +15,7 @@ #include "../src/clp/ReadOnlyMemoryMappedFile.hpp" #include "../src/clp/streaming_compression/Compressor.hpp" #include "../src/clp/streaming_compression/Decompressor.hpp" +#include "../src/clp/streaming_compression/lzma/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Decompressor.hpp" #include "../src/clp/streaming_compression/zstd/Compressor.hpp" diff --git a/components/core/tools/scripts/lib_install/liblzma.sh b/components/core/tools/scripts/lib_install/liblzma.sh new file mode 100755 index 000000000..1145b2646 --- /dev/null +++ b/components/core/tools/scripts/lib_install/liblzma.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Dependencies: +# - curl +# - make +# - gcc +# NOTE: Dependencies should be installed outside the script to allow the script to be largely distro-agnostic + +# Exit on any error +set -e + +# Error on undefined variable +set -u + +cUsage="Usage: ${BASH_SOURCE[0]} [ <.deb output directory>]" +if [ "$#" -lt 1 ] ; then + echo $cUsage + exit +fi +version=$1 + +package_name=liblzma +temp_dir=/tmp/${package_name}-installation +deb_output_dir=${temp_dir} +if [[ "$#" -gt 1 ]] ; then + deb_output_dir="$(readlink -f "$2")" + if [ ! -d ${deb_output_dir} ] ; then + echo "${deb_output_dir} does not exist or is not a directory" + exit + fi +fi + +# Note: we won't check if the package already exists + +echo "Checking for elevated privileges..." +privileged_command_prefix="" +if [ ${EUID:-$(id -u)} -ne 0 ] ; then + sudo echo "Script can elevate privileges." 
+ privileged_command_prefix="${privileged_command_prefix} sudo" +fi + +# Get number of cpu cores +num_cpus=$(grep -c ^processor /proc/cpuinfo) + +# Download +mkdir -p $temp_dir +cd $temp_dir +extracted_dir=${temp_dir}/xz-${version} +if [ ! -e ${extracted_dir} ] ; then + tar_filename=xz-${version}.tar.gz + if [ ! -e ${tar_filename} ] ; then + curl -fsSL https://github.com/tukaani-project/xz/releases/download/v${version}/${tar_filename} -o ${tar_filename} + fi + tar -xf ${tar_filename} +fi + +# Build +cd ${extracted_dir} +mkdir build +cd build +cmake -DCMAKE_POSITION_INDEPENDENT_CODE=TRUE ../ +make -j${num_cpus} +make install liblzma + +# Clean up +rm -rf $temp_dir diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh index 1e21314cc..10a2b0482 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh @@ -14,6 +14,7 @@ lib_install_scripts_dir=$script_dir/.. 
"$lib_install_scripts_dir"/fmtlib.sh 8.0.1 "$lib_install_scripts_dir"/libarchive.sh 3.5.1 +"$lib_install_scripts_dir"/liblzma.sh 5.4.6 "$lib_install_scripts_dir"/lz4.sh 1.8.2 "$lib_install_scripts_dir"/mongocxx.sh 3.10.2 "$lib_install_scripts_dir"/msgpack.sh 7.0.0 diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh index 706674764..f1e2ee4ff 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh @@ -20,6 +20,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ libcurl4 \ libcurl4-openssl-dev \ libmariadb-dev \ + liblzma-dev \ libssl-dev \ make \ openjdk-11-jdk \ diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh index 7799c9ba5..97aaf7093 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh @@ -11,6 +11,7 @@ lib_install_scripts_dir=$script_dir/.. 
"$lib_install_scripts_dir"/fmtlib.sh 8.0.1 "$lib_install_scripts_dir"/libarchive.sh 3.5.1 +"$lib_install_scripts_dir"/liblzma.sh 5.4.6 "$lib_install_scripts_dir"/lz4.sh 1.8.2 "$lib_install_scripts_dir"/mongocxx.sh 3.10.2 "$lib_install_scripts_dir"/msgpack.sh 7.0.0 diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh index 92d965b9b..4911a6a98 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh @@ -20,6 +20,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ libcurl4 \ libcurl4-openssl-dev \ libmariadb-dev \ + liblzma-dev \ libssl-dev \ openjdk-11-jdk \ pkg-config \ From d5af274f119ad8bb2d5b3bbc9e1e97bca282be7a Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 25 Nov 2024 12:49:13 -0500 Subject: [PATCH 02/65] Make unit test pass --- .../streaming_compression/lzma/Compressor.cpp | 72 +++++++--------- .../streaming_compression/lzma/Compressor.hpp | 84 +++++++++++-------- .../streaming_compression/lzma/Constants.hpp | 12 +-- .../lzma/Decompressor.cpp | 6 +- .../lzma/Decompressor.hpp | 15 ++-- .../core/tests/test-StreamingCompression.cpp | 6 ++ 6 files changed, 103 insertions(+), 92 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index f10ec915b..7bb13e5d3 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -1,34 +1,22 @@ #include "Compressor.hpp" -// spdlog #include +// Compression libraries +#include +#include + // Project headers #include "../../Defs.h" -// File-scope constants -static constexpr size_t cCompressedStreamBlockBufferSize = 4096; // 4KiB - -namespace 
streaming_compression::lzma { +namespace clp::streaming_compression::lzma { Compressor::LzmaOption Compressor::m_option; -Compressor::Compressor() - : ::streaming_compression::Compressor(CompressorType::LZMA), - m_compression_stream_contains_data(false), - m_compressed_stream_file_writer(nullptr), - m_compression_stream(nullptr) { - m_compressed_stream_block_buffer = std::make_unique(cCompressedStreamBlockBufferSize); - m_compression_stream = new lzma_stream; - memset(m_compression_stream, 0, sizeof(lzma_stream)); -} - -Compressor::~Compressor() { - if (nullptr != m_compression_stream) { - delete m_compression_stream; - } +Compressor::Compressor() { + memset(m_compression_stream.get(), 0, sizeof(LzmaStream)); } -void Compressor::init_lzma_encoder(lzma_stream* strm) { +void Compressor::init_lzma_encoder(LzmaStream* strm) { lzma_options_lzma options; if (lzma_lzma_preset(&options, m_option.get_compression_level())) { SPDLOG_ERROR("Failed to initialize LZMA options."); @@ -44,10 +32,10 @@ void Compressor::init_lzma_encoder(lzma_stream* strm) { // to CRC64, which is the default in the xz command line tool. If // the .xz file needs to be decompressed with XZ Embedded, use // LZMA_CHECK_CRC32 instead. - lzma_ret ret = lzma_stream_encoder(strm, filters, LZMA_CHECK_CRC64); + auto const ret = lzma_stream_encoder(strm, filters, LZMA_CHECK_CRC64); // Return successfully if the initialization went fine. 
- if (ret == LZMA_OK) { + if (LZMA_OK == ret) { return; } @@ -96,12 +84,12 @@ void Compressor::open(FileWriter& file_writer, int compression_level) { m_option.set_compression_level(compression_level); } - init_lzma_encoder(m_compression_stream); + init_lzma_encoder(m_compression_stream.get()); // Setup compressed stream parameters m_compression_stream->next_in = nullptr; m_compression_stream->avail_in = 0; - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); + m_compression_stream->avail_out = m_compressed_stream_block_buffer.size(); m_compressed_stream_file_writer = &file_writer; @@ -136,7 +124,7 @@ void Compressor::write(char const* data, size_t data_length) { // Compress all data bool hit_input_eof = false; while (!hit_input_eof) { - lzma_ret return_value = lzma_code(m_compression_stream, action); + auto const return_value = lzma_code(m_compression_stream.get(), action); switch (return_value) { case LZMA_OK: case LZMA_BUF_ERROR: @@ -157,10 +145,10 @@ void Compressor::write(char const* data, size_t data_length) { // Write output buffer to file if it's full if (0 == m_compression_stream->avail_out) { m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.get()), + reinterpret_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } } @@ -168,10 +156,10 @@ void Compressor::write(char const* data, size_t data_length) { // Write any compressed data if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.get()), + 
reinterpret_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream->avail_out ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } @@ -200,7 +188,7 @@ void Compressor::flush() { bool flush_complete = false; while (true) { - lzma_ret return_value = lzma_code(m_compression_stream, LZMA_SYNC_FLUSH); + auto const return_value = lzma_code(m_compression_stream.get(), LZMA_SYNC_FLUSH); switch (return_value) { case LZMA_STREAM_END: flush_complete = true; @@ -219,10 +207,10 @@ void Compressor::flush() { // Write output buffer to file if it's full if (0 == m_compression_stream->avail_out) { m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.get()), + reinterpret_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } } @@ -230,10 +218,10 @@ void Compressor::flush() { // Write any compressed data if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.get()), + reinterpret_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream->avail_out ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } @@ -256,7 +244,7 @@ void Compressor::flush_and_close_compression_stream() { bool flush_complete = false; while (true) { - lzma_ret return_value = 
lzma_code(m_compression_stream, LZMA_FINISH); + lzma_ret return_value = lzma_code(m_compression_stream.get(), LZMA_FINISH); switch (return_value) { case LZMA_OK: case LZMA_BUF_ERROR: @@ -276,10 +264,10 @@ void Compressor::flush_and_close_compression_stream() { // Write output buffer to file if it's full if (0 == m_compression_stream->avail_out) { m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.get()), + reinterpret_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } } @@ -287,17 +275,17 @@ void Compressor::flush_and_close_compression_stream() { // Write any compressed data if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.get()), + reinterpret_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream->avail_out ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } m_compression_stream_contains_data = false; - lzma_end(m_compression_stream); + lzma_end(m_compression_stream.get()); m_compression_stream->avail_out = 0; m_compression_stream->next_out = nullptr; } -} // namespace streaming_compression::lzma +} // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index d31c7687e..53f82b139 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ 
b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -1,22 +1,22 @@ -#ifndef STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP -#define STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP +#ifndef CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP +#define CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP -// C++ standard libraries +#include +#include #include -#include -// ZLIB library #include -#include +#include -// Project headers +#include "../../Array.hpp" +#include "../../ErrorCode.hpp" #include "../../FileWriter.hpp" #include "../../TraceableException.hpp" #include "../Compressor.hpp" #include "Constants.hpp" -namespace streaming_compression::lzma { -class Compressor : public ::streaming_compression::Compressor { +namespace clp::streaming_compression::lzma { +class Compressor : public ::clp::streaming_compression::Compressor { public: // Types class OperationFailed : public TraceableException { @@ -26,8 +26,8 @@ class Compressor : public ::streaming_compression::Compressor { : TraceableException(error_code, filename, line_number) {} // Methods - char const* what() const noexcept override { - return "streaming_compression::gzip::Compressor operation failed"; + [[nodiscard]] auto what() const noexcept -> char const* override { + return "streaming_compression::lzma::Compressor operation failed"; } }; @@ -38,10 +38,10 @@ class Compressor : public ::streaming_compression::Compressor { m_dict_size{cDefaultDictionarySize} {} auto set_compression_level(int compression_level) -> void { - if (0 > compression_level) { - m_compression_level = 0; - } else if (9 < compression_level) { - m_compression_level = 9; + if (compression_level < cMinCompressionLevel) { + m_compression_level = cMinCompressionLevel; + } else if (compression_level > cMaxCompressionLevel) { + m_compression_level = cMaxCompressionLevel; } else { m_compression_level = compression_level; } @@ -62,11 +62,15 @@ class Compressor : public ::streaming_compression::Compressor { Compressor(); // Destructor - 
~Compressor(); + ~Compressor() override = default; - // Explicitly disable copy and move constructor/assignment + // Delete copy constructor and assignment operator Compressor(Compressor const&) = delete; - Compressor& operator=(Compressor const&) = delete; + auto operator=(Compressor const&) -> Compressor& = delete; + + // Default move constructor and assignment operator + Compressor(Compressor&&) noexcept = default; + auto operator=(Compressor&&) noexcept -> Compressor& = default; // Methods implementing the WriterInterface /** @@ -74,11 +78,12 @@ class Compressor : public ::streaming_compression::Compressor { * @param data * @param data_length */ - void write(char const* data, size_t data_length) override; + auto write(char const* data, size_t data_length) -> void override; + /** * Writes any internally buffered data to file and ends the current frame */ - void flush() override; + auto flush() -> void override; /** * Tries to get the current position of the write head @@ -86,20 +91,28 @@ class Compressor : public ::streaming_compression::Compressor { * @return ErrorCode_NotInit if the compressor is not open * @return ErrorCode_Success on success */ - ErrorCode try_get_pos(size_t& pos) const override; + auto try_get_pos(size_t& pos) const -> ErrorCode override; + + /** + * Closes the compressor + */ + auto close() -> void override; // Methods implementing the Compressor interface /** - * Initialize streaming compressor + * Initializes the compression stream with the default compression level * @param file_writer - * @param compression_level */ - void open(FileWriter& file_writer, int compression_level) override; + auto open(FileWriter& file_writer) -> void override { + this->open(file_writer, cDefaultCompressionLevel); + } /** - * Closes the compressor + * Initializes the compression stream with the given compression level + * @param file_writer + * @param compression_level */ - void close() override; + auto open(FileWriter& file_writer, int compression_level) -> 
void; // Methods static auto set_compression_level(int compression_level) -> void { @@ -109,25 +122,28 @@ class Compressor : public ::streaming_compression::Compressor { static auto set_dict_size(uint32_t dict_size) -> void { m_option.set_dict_size(dict_size); } private: + using LzmaStream = lzma_stream; + /** * Flushes the stream and closes it */ void flush_and_close_compression_stream(); - static void init_lzma_encoder(lzma_stream* strm); + static void init_lzma_encoder(LzmaStream* strm); static LzmaOption m_option; + static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB // Variables - FileWriter* m_compressed_stream_file_writer; + FileWriter* m_compressed_stream_file_writer{nullptr}; // Compressed stream variables - lzma_stream* m_compression_stream; - bool m_compression_stream_contains_data; + std::unique_ptr m_compression_stream{std::make_unique()}; + bool m_compression_stream_contains_data{false}; - std::unique_ptr m_compressed_stream_block_buffer; + Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; - size_t m_uncompressed_stream_pos; + size_t m_uncompressed_stream_pos{0}; }; -} // namespace streaming_compression::lzma +} // namespace clp::streaming_compression::lzma -#endif // STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP +#endif // CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP diff --git a/components/core/src/clp/streaming_compression/lzma/Constants.hpp b/components/core/src/clp/streaming_compression/lzma/Constants.hpp index 959c09f47..4e261187a 100644 --- a/components/core/src/clp/streaming_compression/lzma/Constants.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Constants.hpp @@ -1,15 +1,15 @@ #ifndef STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP #define STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP -#include - -// C++ libraries -#include #include -namespace streaming_compression::lzma { +#include + +namespace clp::streaming_compression::lzma { constexpr int cDefaultCompressionLevel{3}; +constexpr int 
cMinCompressionLevel{0}; +constexpr int cMaxCompressionLevel{9}; constexpr uint32_t cDefaultDictionarySize{LZMA_DICT_SIZE_DEFAULT}; -} // namespace streaming_compression::lzma +} // namespace clp::streaming_compression::lzma #endif // STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP diff --git a/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp b/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp index a2ed4d466..b6a10b418 100644 --- a/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp @@ -12,9 +12,9 @@ // Project headers #include "../../Defs.h" -namespace streaming_compression::lzma { +namespace clp::streaming_compression::lzma { Decompressor::Decompressor() - : ::streaming_compression::Decompressor(CompressorType::LZMA), + : ::clp::streaming_compression::Decompressor(CompressorType::LZMA), m_input_type(InputType::NotInitialized), m_decompression_stream(nullptr), m_file_reader(nullptr), @@ -359,4 +359,4 @@ void Decompressor::reset_stream() { m_decompressed_stream_pos = 0; init_decoder(m_decompression_stream); } -} // namespace streaming_compression::lzma +} // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp b/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp index 996663e44..5e90f5942 100644 --- a/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp @@ -1,13 +1,14 @@ -#ifndef STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP -#define STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP +#ifndef CLP_STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP +#define CLP_STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP // C++ standard libraries #include #include // ZLIB library -#include #include + +#include // Boost libraries #include @@ -16,8 +17,8 @@ #include "../../TraceableException.hpp" #include 
"../Decompressor.hpp" -namespace streaming_compression::lzma { -class Decompressor : public ::streaming_compression::Decompressor { +namespace clp::streaming_compression::lzma { +class Decompressor : public ::clp::streaming_compression::Decompressor { public: // Types class OperationFailed : public TraceableException { @@ -158,5 +159,5 @@ class Decompressor : public ::streaming_compression::Decompressor { char const* m_compressed_stream_block; size_t m_compressed_stream_block_size; }; -} // namespace streaming_compression::lzma -#endif // STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP +} // namespace clp::streaming_compression::lzma +#endif // CLP_STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index d632510fc..d58d4c1ce 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -16,6 +16,7 @@ #include "../src/clp/streaming_compression/Compressor.hpp" #include "../src/clp/streaming_compression/Decompressor.hpp" #include "../src/clp/streaming_compression/lzma/Compressor.hpp" +#include "../src/clp/streaming_compression/lzma/Decompressor.hpp" #include "../src/clp/streaming_compression/passthrough/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Decompressor.hpp" #include "../src/clp/streaming_compression/zstd/Compressor.hpp" @@ -56,6 +57,11 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { decompressor = std::make_unique(); } + SECTION("LZMA compression") { + compressor = std::make_unique(); + decompressor = std::make_unique(); + } + // Initialize buffers Array uncompressed_buffer{cBufferSize}; for (size_t i{0}; i < cBufferSize; ++i) { From b94ca2695d4ebaf7c79a8ac3e31b94eae1e52e16 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 26 Nov 2024 03:21:38 -0500 Subject: [PATCH 03/65] Refactor lzma compressor to group common functionalities into helplers --- 
.../streaming_compression/lzma/Compressor.cpp | 210 ++++++------------ .../streaming_compression/lzma/Compressor.hpp | 56 ++--- 2 files changed, 84 insertions(+), 182 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 7bb13e5d3..74a59ebca 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -1,38 +1,40 @@ #include "Compressor.hpp" -#include +#include +#include +#include +#include -// Compression libraries #include -#include +#include -// Project headers -#include "../../Defs.h" +#include "../../ErrorCode.hpp" +#include "../../FileWriter.hpp" +#include "../../TraceableException.hpp" +#include "../../type_utils.hpp" +#include "Constants.hpp" namespace clp::streaming_compression::lzma { -Compressor::LzmaOption Compressor::m_option; - -Compressor::Compressor() { - memset(m_compression_stream.get(), 0, sizeof(LzmaStream)); -} +using clp::size_checked_pointer_cast; -void Compressor::init_lzma_encoder(LzmaStream* strm) { - lzma_options_lzma options; - if (lzma_lzma_preset(&options, m_option.get_compression_level())) { +auto Compressor::init_lzma_encoder(LzmaStream* strm, int compression_level, size_t dict_size) + -> void { + LzmaOptionsLzma options; + if (0 != lzma_lzma_preset(&options, compression_level)) { SPDLOG_ERROR("Failed to initialize LZMA options."); throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - options.dict_size = m_option.get_dict_size(); - lzma_filter filters[2]{ - {LZMA_FILTER_LZMA2, &options}, - {LZMA_VLI_UNKNOWN, nullptr}, - }; + options.dict_size = dict_size; + std::array filters{{ + {.id = LZMA_FILTER_LZMA2, .options = &options}, + {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, + }}; // Initialize the encoder using a preset. Set the integrity to check // to CRC64, which is the default in the xz command line tool. 
If // the .xz file needs to be decompressed with XZ Embedded, use // LZMA_CHECK_CRC32 instead. - auto const ret = lzma_stream_encoder(strm, filters, LZMA_CHECK_CRC64); + auto const ret{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; // Return successfully if the initialization went fine. if (LZMA_OK == ret) { @@ -43,7 +45,7 @@ void Compressor::init_lzma_encoder(LzmaStream* strm) { // lzma/container.h (src/liblzma/api/lzma/container.h in the source // package or e.g. /usr/include/lzma/container.h depending on the // install prefix). - char const* msg; + char const* msg{nullptr}; switch (ret) { case LZMA_MEM_ERROR: msg = "Memory allocation failed"; @@ -68,23 +70,21 @@ void Compressor::init_lzma_encoder(LzmaStream* strm) { break; } - SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, int(ret)); + SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, static_cast(ret)); throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } -void Compressor::open(FileWriter& file_writer, int compression_level) { +auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { if (nullptr != m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); } - if (false == (0 <= compression_level && compression_level <= 9)) { + if (compression_level < cMinCompressionLevel || compression_level > cMaxCompressionLevel) { throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); } - if (compression_level != m_option.get_compression_level()) { - m_option.set_compression_level(compression_level); - } - init_lzma_encoder(m_compression_stream.get()); + memset(m_compression_stream.get(), 0, sizeof(LzmaStream)); + init_lzma_encoder(m_compression_stream.get(), compression_level, m_dict_size); // Setup compressed stream parameters m_compression_stream->next_in = nullptr; m_compression_stream->avail_in = 0; @@ -96,7 +96,7 @@ void Compressor::open(FileWriter& file_writer, 
int compression_level) { m_uncompressed_stream_pos = 0; } -void Compressor::close() { +auto Compressor::close() -> void { if (nullptr == m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } @@ -105,7 +105,7 @@ void Compressor::close() { m_compressed_stream_file_writer = nullptr; } -void Compressor::write(char const* data, size_t data_length) { +auto Compressor::write(char const* data, size_t data_length) -> void { if (nullptr == m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } @@ -114,54 +114,15 @@ void Compressor::write(char const* data, size_t data_length) { // Nothing needs to be done because we do not need to compress anything return; } + if (nullptr == data) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - lzma_action action = LZMA_RUN; - m_compression_stream->next_in = reinterpret_cast(const_cast(data)); - m_compression_stream->avail_in = data_length; - - // Compress all data - bool hit_input_eof = false; - while (!hit_input_eof) { - auto const return_value = lzma_code(m_compression_stream.get(), action); - switch (return_value) { - case LZMA_OK: - case LZMA_BUF_ERROR: - break; - case LZMA_STREAM_END: - hit_input_eof = true; - break; - default: - SPDLOG_ERROR("lzma() returned an unexpected value - {}.", int(return_value)); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - if (0 == m_compression_stream->avail_in) { - // No more data to compress - break; - } + m_compression_stream->next_in = size_checked_pointer_cast(data); + m_compression_stream->avail_in = data_length; - // Write output buffer to file if it's full - if (0 == m_compression_stream->avail_out) { - m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = 
cCompressedStreamBlockBufferSize; - } - } - - // Write any compressed data - if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { - m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - m_compression_stream->avail_out - ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; - } + run_lzma(LZMA_RUN); m_compression_stream->next_in = nullptr; @@ -169,7 +130,7 @@ void Compressor::write(char const* data, size_t data_length) { m_uncompressed_stream_pos += data_length; } -void Compressor::flush() { +auto Compressor::flush() -> void { if (false == m_compression_stream_contains_data) { return; } @@ -184,51 +145,11 @@ void Compressor::flush() { // restart from this point if the previous compressed data has been damaged Z_FINISH - // Pending output flushed and deflate returns Z_STREAM_END if there was enough output space, // or Z_OK or Z_BUF_ERROR if it needs to be called again with more space - // - - bool flush_complete = false; - while (true) { - auto const return_value = lzma_code(m_compression_stream.get(), LZMA_SYNC_FLUSH); - switch (return_value) { - case LZMA_STREAM_END: - flush_complete = true; - break; - case LZMA_OK: - case LZMA_BUF_ERROR: - break; - default: - SPDLOG_ERROR("lzma() returned an unexpected value - {}.", int(return_value)); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - if (flush_complete) { - break; - } - - // Write output buffer to file if it's full - if (0 == m_compression_stream->avail_out) { - m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; - } - } - - // Write any compressed data - if 
(m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { - m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - m_compression_stream->avail_out - ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; - } - + run_lzma(LZMA_SYNC_FLUSH); m_compression_stream_contains_data = false; } -ErrorCode Compressor::try_get_pos(size_t& pos) const { +auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { if (nullptr == m_compressed_stream_file_writer) { return ErrorCode_NotInit; } @@ -237,55 +158,64 @@ ErrorCode Compressor::try_get_pos(size_t& pos) const { return ErrorCode_Success; } -void Compressor::flush_and_close_compression_stream() { +auto Compressor::flush_and_close_compression_stream() -> void { if (nullptr == m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - bool flush_complete = false; + run_lzma(LZMA_FINISH); + + m_compression_stream_contains_data = false; + + lzma_end(m_compression_stream.get()); + m_compression_stream->avail_out = 0; + m_compression_stream->next_out = nullptr; +} + +auto Compressor::run_lzma(LzmaAction action) -> void { + // Compress all data + bool hit_input_eof{false}; while (true) { - lzma_ret return_value = lzma_code(m_compression_stream.get(), LZMA_FINISH); - switch (return_value) { + auto const rc = lzma_code(m_compression_stream.get(), action); + switch (rc) { case LZMA_OK: case LZMA_BUF_ERROR: break; case LZMA_STREAM_END: - flush_complete = true; + hit_input_eof = true; break; default: - // SPDLOG_ERROR("deflate() returned an unexpected value - - // {}.", return_value); + SPDLOG_ERROR("lzma() returned an unexpected value - {}.", static_cast(rc)); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - if (flush_complete) { + + if (LZMA_RUN == action && 0 == 
m_compression_stream->avail_in) { + // No more data to compress + break; + } + + if (hit_input_eof) { break; } // Write output buffer to file if it's full if (0 == m_compression_stream->avail_out) { - m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + write_data(); } } // Write any compressed data if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { - m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - m_compression_stream->avail_out - ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + write_data(); } +} - m_compression_stream_contains_data = false; - - lzma_end(m_compression_stream.get()); - m_compression_stream->avail_out = 0; - m_compression_stream->next_out = nullptr; +auto Compressor::write_data() -> void { + m_compressed_stream_file_writer->write( + size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), + cCompressedStreamBlockBufferSize - m_compression_stream->avail_out + ); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } } // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 53f82b139..f6c6b4963 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -1,12 +1,12 @@ #ifndef CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP #define CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP 
-#include +#include + #include #include #include -#include #include "../../Array.hpp" #include "../../ErrorCode.hpp" @@ -31,35 +31,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { } }; - class LzmaOption { - public: - LzmaOption() - : m_compression_level{cDefaultCompressionLevel}, - m_dict_size{cDefaultDictionarySize} {} - - auto set_compression_level(int compression_level) -> void { - if (compression_level < cMinCompressionLevel) { - m_compression_level = cMinCompressionLevel; - } else if (compression_level > cMaxCompressionLevel) { - m_compression_level = cMaxCompressionLevel; - } else { - m_compression_level = compression_level; - } - } - - auto set_dict_size(uint32_t dict_size) -> void { m_dict_size = dict_size; } - - [[nodiscard]] auto get_compression_level() const -> int { return m_compression_level; } - - [[nodiscard]] auto get_dict_size() const -> uint32_t { return m_dict_size; } - - private: - int m_compression_level; - uint32_t m_dict_size; - }; - // Constructor - Compressor(); + Compressor() = default; // Destructor ~Compressor() override = default; @@ -114,24 +87,22 @@ class Compressor : public ::clp::streaming_compression::Compressor { */ auto open(FileWriter& file_writer, int compression_level) -> void; - // Methods - static auto set_compression_level(int compression_level) -> void { - m_option.set_compression_level(compression_level); - } - - static auto set_dict_size(uint32_t dict_size) -> void { m_option.set_dict_size(dict_size); } - private: + using LzmaAction = lzma_action; + using LzmaFilter = lzma_filter; + using LzmaOptionsLzma = lzma_options_lzma; using LzmaStream = lzma_stream; + static auto + init_lzma_encoder(LzmaStream* strm, int compression_level, size_t dict_size) -> void; + static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB + /** * Flushes the stream and closes it */ - void flush_and_close_compression_stream(); - - static void init_lzma_encoder(LzmaStream* strm); - static LzmaOption 
m_option; - static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB + auto flush_and_close_compression_stream() -> void; + auto write_data() -> void; + auto run_lzma(lzma_action action) -> void; // Variables FileWriter* m_compressed_stream_file_writer{nullptr}; @@ -139,6 +110,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { // Compressed stream variables std::unique_ptr m_compression_stream{std::make_unique()}; bool m_compression_stream_contains_data{false}; + size_t m_dict_size{cDefaultDictionarySize}; Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; From 707c41219a5e2ad91ccbf01b91df973e9856ef6d Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 27 Nov 2024 01:14:54 -0500 Subject: [PATCH 04/65] Improve comments --- .../streaming_compression/lzma/Compressor.cpp | 35 ++++++++----------- .../streaming_compression/lzma/Compressor.hpp | 25 +++++++++++-- 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 74a59ebca..6f6b5b4cf 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -21,7 +21,7 @@ auto Compressor::init_lzma_encoder(LzmaStream* strm, int compression_level, size -> void { LzmaOptionsLzma options; if (0 != lzma_lzma_preset(&options, compression_level)) { - SPDLOG_ERROR("Failed to initialize LZMA options."); + SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } options.dict_size = dict_size; @@ -122,7 +122,9 @@ auto Compressor::write(char const* data, size_t data_length) -> void { m_compression_stream->next_in = size_checked_pointer_cast(data); m_compression_stream->avail_in = data_length; - run_lzma(LZMA_RUN); + // Normal compression encoding workflow. 
Continue until the input buffer is + // exhausted. + compress(LZMA_RUN); m_compression_stream->next_in = nullptr; @@ -134,18 +136,9 @@ auto Compressor::flush() -> void { if (false == m_compression_stream_contains_data) { return; } - // Z_NO_FLUSH - deflate decides how much data to accumulate before producing output - // Z_SYNC_FLUSH - All pending output flushed to output buf and output aligned to byte - // boundary (completes current block and follows it with empty block that is 3 bits plus - // filler to next byte, followed by 4 bytes Z_PARTIAL_FLUSH - Same as Z_SYNC_FLUSH but - // output not aligned to byte boundary (completes current block and follows it with empty - // fixed codes block that is 10 bits long) Z_BLOCK - Same as Z_SYNC_FLUSH but output not - // aligned on a byte boundary and up to 7 bits of current block held to be written - // Z_FULL_FLUSH - Same as Z_SYNC_FLUSH but compression state reset so that decompression can - // restart from this point if the previous compressed data has been damaged Z_FINISH - - // Pending output flushed and deflate returns Z_STREAM_END if there was enough output space, - // or Z_OK or Z_BUF_ERROR if it needs to be called again with more space - run_lzma(LZMA_SYNC_FLUSH); + + // Forces all the buffered data to be available at output + compress(LZMA_SYNC_FLUSH); m_compression_stream_contains_data = false; } @@ -163,7 +156,8 @@ auto Compressor::flush_and_close_compression_stream() -> void { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - run_lzma(LZMA_FINISH); + // Same as flush but all the input data must have been given to the encoder + compress(LZMA_FINISH); m_compression_stream_contains_data = false; @@ -172,8 +166,7 @@ auto Compressor::flush_and_close_compression_stream() -> void { m_compression_stream->next_out = nullptr; } -auto Compressor::run_lzma(LzmaAction action) -> void { - // Compress all data +auto Compressor::compress(LzmaAction action) -> void { bool hit_input_eof{false}; while (true) 
{ auto const rc = lzma_code(m_compression_stream.get(), action); @@ -200,17 +193,17 @@ auto Compressor::run_lzma(LzmaAction action) -> void { // Write output buffer to file if it's full if (0 == m_compression_stream->avail_out) { - write_data(); + pipe_data(); } } - // Write any compressed data + // Write remaining compressed data if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { - write_data(); + pipe_data(); } } -auto Compressor::write_data() -> void { +auto Compressor::pipe_data() -> void { m_compressed_stream_file_writer->write( size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream->avail_out diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index f6c6b4963..03f32a186 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -93,6 +93,13 @@ class Compressor : public ::clp::streaming_compression::Compressor { using LzmaOptionsLzma = lzma_options_lzma; using LzmaStream = lzma_stream; + /** + * Initialize the Lzma compression stream + * @param strm A pre-allocated `lzma_stream` object + * @param compression_level + * @param dict_size Dictionary size that indicates how many bytes of the + * recently processed uncompressed data is kept in memory + */ static auto init_lzma_encoder(LzmaStream* strm, int compression_level, size_t dict_size) -> void; static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB @@ -101,8 +108,22 @@ class Compressor : public ::clp::streaming_compression::Compressor { * Flushes the stream and closes it */ auto flush_and_close_compression_stream() -> void; - auto write_data() -> void; - auto run_lzma(lzma_action action) -> void; + + /** + * Repeatedly invoke lzma_code() compression workflow until LZMA_STREAM_END + * is reached. 
+ * The workflow action needs to be kept the same throughout this process. + * See also: https://github.com/frida/xz/blob/main/src/liblzma/api/lzma/base.h#L246 + * + * @param action + */ + auto compress(lzma_action action) -> void; + + /** + * Pipes the current compressed data in the lzma buffer to the output file + * and reset the compression buffer to receive new data. + */ + auto pipe_data() -> void; // Variables FileWriter* m_compressed_stream_file_writer{nullptr}; From 6d1ab8fa907632a9af6001f9075404fc09708633 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 27 Nov 2024 11:08:09 -0500 Subject: [PATCH 05/65] Fix reference link --- .../core/src/clp/streaming_compression/lzma/Compressor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 03f32a186..80052e50c 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -113,7 +113,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { * Repeatedly invoke lzma_code() compression workflow until LZMA_STREAM_END * is reached. * The workflow action needs to be kept the same throughout this process. 
- * See also: https://github.com/frida/xz/blob/main/src/liblzma/api/lzma/base.h#L246 + * See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 * * @param action */ From 89b57074a7851d66310bb32b3031da566a3902f4 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 27 Nov 2024 11:43:56 -0500 Subject: [PATCH 06/65] Add install for CentOS --- .../lib_install/centos-stream-9/install-prebuilt-packages.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh index e90f54733..eede5e004 100755 --- a/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh @@ -16,4 +16,5 @@ dnf install -y \ libzstd-devel \ make \ mariadb-connector-c-devel \ - openssl-devel + openssl-devel \ + xz-devel From c646cea6325763dbebc23c790d87445a7c0c8ecd Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 27 Nov 2024 14:04:31 -0500 Subject: [PATCH 07/65] Apply coderabbit suggestions --- components/core/CMakeLists.txt | 6 +-- .../streaming_compression/lzma/Compressor.cpp | 44 +++++++++---------- .../streaming_compression/lzma/Compressor.hpp | 6 +-- .../core/tools/scripts/lib_install/liblzma.sh | 24 +++++----- 4 files changed, 39 insertions(+), 41 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 92bb6af19..56156c131 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -232,13 +232,13 @@ endif() # TODO: add a script in ./cmake/Modules to resolve .a vs. 
.so find_package(LibLZMA REQUIRED) if(LIBLZMA_FOUND) - message(STATUS "Found LIBLZMA_FOUND ${LIBLZMA_VERSION_STRING}") + message(STATUS "Found Lzma ${LIBLZMA_VERSION_STRING}") message(STATUS "Lzma library location: ${LIBLZMA_LIBRARIES}") else() - message(FATAL_ERROR "Could not find ${CLP_LIBS_STRING} libraries for LIBLZMA_FOUND") + message(FATAL_ERROR "Could not find ${CLP_LIBS_STRING} libraries for Lzma") endif() include_directories(${LIBLZMA_INCLUDE_DIRS}) -message("LZMA Include Dir: ${LIBLZMA_INCLUDE_DIRS}") +message("Lzma Include Dir: ${LIBLZMA_INCLUDE_DIRS}") # sqlite dependencies set(sqlite_DYNAMIC_LIBS "dl;m;pthread") diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 6f6b5b4cf..c7b46cd6c 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -83,13 +83,13 @@ auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); } - memset(m_compression_stream.get(), 0, sizeof(LzmaStream)); - init_lzma_encoder(m_compression_stream.get(), compression_level, m_dict_size); + m_compression_stream = LZMA_STREAM_INIT; + init_lzma_encoder(&m_compression_stream, compression_level, m_dict_size); // Setup compressed stream parameters - m_compression_stream->next_in = nullptr; - m_compression_stream->avail_in = 0; - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = m_compressed_stream_block_buffer.size(); + m_compression_stream.next_in = nullptr; + m_compression_stream.avail_in = 0; + m_compression_stream.next_out = m_compressed_stream_block_buffer.data(); + m_compression_stream.avail_out = m_compressed_stream_block_buffer.size(); m_compressed_stream_file_writer = &file_writer; @@ -119,14 +119,14 @@ auto Compressor::write(char const* 
data, size_t data_length) -> void { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - m_compression_stream->next_in = size_checked_pointer_cast(data); - m_compression_stream->avail_in = data_length; + m_compression_stream.next_in = size_checked_pointer_cast(data); + m_compression_stream.avail_in = data_length; // Normal compression encoding workflow. Continue until the input buffer is // exhausted. compress(LZMA_RUN); - m_compression_stream->next_in = nullptr; + m_compression_stream.next_in = nullptr; m_compression_stream_contains_data = true; m_uncompressed_stream_pos += data_length; @@ -161,44 +161,44 @@ auto Compressor::flush_and_close_compression_stream() -> void { m_compression_stream_contains_data = false; - lzma_end(m_compression_stream.get()); - m_compression_stream->avail_out = 0; - m_compression_stream->next_out = nullptr; + lzma_end(&m_compression_stream); + m_compression_stream.avail_out = 0; + m_compression_stream.next_out = nullptr; } auto Compressor::compress(LzmaAction action) -> void { - bool hit_input_eof{false}; + bool hit_stream_end{false}; while (true) { - auto const rc = lzma_code(m_compression_stream.get(), action); + auto const rc = lzma_code(&m_compression_stream, action); switch (rc) { case LZMA_OK: case LZMA_BUF_ERROR: break; case LZMA_STREAM_END: - hit_input_eof = true; + hit_stream_end = true; break; default: SPDLOG_ERROR("lzma() returned an unexpected value - {}.", static_cast(rc)); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - if (LZMA_RUN == action && 0 == m_compression_stream->avail_in) { + if (LZMA_RUN == action && 0 == m_compression_stream.avail_in) { // No more data to compress break; } - if (hit_input_eof) { + if (hit_stream_end) { break; } // Write output buffer to file if it's full - if (0 == m_compression_stream->avail_out) { + if (0 == m_compression_stream.avail_out) { pipe_data(); } } // Write remaining compressed data - if (m_compression_stream->avail_out < 
cCompressedStreamBlockBufferSize) { + if (m_compression_stream.avail_out < cCompressedStreamBlockBufferSize) { pipe_data(); } } @@ -206,9 +206,9 @@ auto Compressor::compress(LzmaAction action) -> void { auto Compressor::pipe_data() -> void { m_compressed_stream_file_writer->write( size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - m_compression_stream->avail_out + cCompressedStreamBlockBufferSize - m_compression_stream.avail_out ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + m_compression_stream.next_out = m_compressed_stream_block_buffer.data(); + m_compression_stream.avail_out = cCompressedStreamBlockBufferSize; } } // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 80052e50c..d10810e88 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -1,8 +1,6 @@ #ifndef CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP #define CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP -#include - #include #include @@ -129,11 +127,11 @@ class Compressor : public ::clp::streaming_compression::Compressor { FileWriter* m_compressed_stream_file_writer{nullptr}; // Compressed stream variables - std::unique_ptr m_compression_stream{std::make_unique()}; + LzmaStream m_compression_stream; bool m_compression_stream_contains_data{false}; size_t m_dict_size{cDefaultDictionarySize}; - Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; + Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; size_t m_uncompressed_stream_pos{0}; }; diff --git a/components/core/tools/scripts/lib_install/liblzma.sh b/components/core/tools/scripts/lib_install/liblzma.sh index 
1145b2646..28766eced 100755 --- a/components/core/tools/scripts/lib_install/liblzma.sh +++ b/components/core/tools/scripts/lib_install/liblzma.sh @@ -1,16 +1,23 @@ #!/bin/bash +# Exit on any error +set -e + +# Error on undefined variable +set -u + # Dependencies: # - curl # - make # - gcc # NOTE: Dependencies should be installed outside the script to allow the script to be largely distro-agnostic -# Exit on any error -set -e - -# Error on undefined variable -set -u +for cmd in curl make gcc; do + if ! $cmd --version >/dev/null 2>&1; then + echo "Error: Required dependency '$cmd' not found" + exit 1 + fi +done cUsage="Usage: ${BASH_SOURCE[0]} [ <.deb output directory>]" if [ "$#" -lt 1 ] ; then @@ -32,13 +39,6 @@ fi # Note: we won't check if the package already exists -echo "Checking for elevated privileges..." -privileged_command_prefix="" -if [ ${EUID:-$(id -u)} -ne 0 ] ; then - sudo echo "Script can elevate privileges." - privileged_command_prefix="${privileged_command_prefix} sudo" -fi - # Get number of cpu cores num_cpus=$(grep -c ^processor /proc/cpuinfo) From c91e5fb90752c0d89190b88ce45cafeab4e163a6 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 27 Nov 2024 14:21:10 -0500 Subject: [PATCH 08/65] Remove decompressor related files --- components/core/CMakeLists.txt | 2 - .../lzma/Decompressor.cpp | 362 ------------------ .../lzma/Decompressor.hpp | 163 -------- .../core/tests/test-StreamingCompression.cpp | 7 +- 4 files changed, 5 insertions(+), 529 deletions(-) delete mode 100644 components/core/src/clp/streaming_compression/lzma/Decompressor.cpp delete mode 100644 components/core/src/clp/streaming_compression/lzma/Decompressor.hpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 56156c131..312c6e2ef 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -480,8 +480,6 @@ set(SOURCE_FILES_unitTest src/clp/streaming_compression/Decompressor.hpp src/clp/streaming_compression/lzma/Compressor.cpp 
src/clp/streaming_compression/lzma/Compressor.hpp - src/clp/streaming_compression/lzma/Decompressor.cpp - src/clp/streaming_compression/lzma/Decompressor.hpp src/clp/streaming_compression/lzma/Constants.hpp src/clp/streaming_compression/passthrough/Compressor.cpp src/clp/streaming_compression/passthrough/Compressor.hpp diff --git a/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp b/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp deleted file mode 100644 index b6a10b418..000000000 --- a/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp +++ /dev/null @@ -1,362 +0,0 @@ -#include "Decompressor.hpp" - -// C++ Standard Libraries -#include - -// Boost libraries -#include - -// spdlog -#include - -// Project headers -#include "../../Defs.h" - -namespace clp::streaming_compression::lzma { -Decompressor::Decompressor() - : ::clp::streaming_compression::Decompressor(CompressorType::LZMA), - m_input_type(InputType::NotInitialized), - m_decompression_stream(nullptr), - m_file_reader(nullptr), - m_file_reader_initial_pos(0), - m_file_read_buffer_length(0), - m_file_read_buffer_capacity(0), - m_decompressed_stream_pos(0), - m_unused_decompressed_stream_block_size(0) { - // Create block to hold unused decompressed data - m_unused_decompressed_stream_block_buffer - = std::make_unique(m_unused_decompressed_stream_block_size); - m_decompression_stream = new lzma_stream; - memset(m_decompression_stream, 0, sizeof(lzma_stream)); -} - -Decompressor::~Decompressor() { - delete m_decompression_stream; -} - -void Decompressor::exact_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { - auto errorcode = try_read(buf, num_bytes_to_read, num_bytes_read); - if (num_bytes_read != num_bytes_to_read) { - SPDLOG_ERROR("FAILED TO READ EXACTLY {} bytes", num_bytes_to_read); - throw; - } - if (errorcode != ErrorCode_Success) { - SPDLOG_ERROR("FAILED TO READ EXACTLY {} bytes", num_bytes_to_read); - throw; - } -} - -ErrorCode 
Decompressor::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { - if (InputType::NotInitialized == m_input_type) { - return ErrorCode_NotInit; - } - if (nullptr == buf) { - return ErrorCode_BadParam; - } - if (0 == num_bytes_to_read) { - return ErrorCode_Success; - } - - num_bytes_read = 0; - - m_decompression_stream->next_out = reinterpret_cast(buf); - m_decompression_stream->avail_out = num_bytes_to_read; - while (true) { - // Check if there's data that can be decompressed - if (0 == m_decompression_stream->avail_in) { - if (InputType::File != m_input_type) { - // if we hit here, there must be something wrong - // we have consumed all data buffer but for some reason it still requires more. - return ErrorCode_EndOfFile; - } else { - auto error_code = m_file_reader->try_read( - m_file_read_buffer.get(), - m_file_read_buffer_capacity, - m_file_read_buffer_length - ); - m_decompression_stream->avail_in = m_file_read_buffer_length; - m_decompression_stream->next_in - = reinterpret_cast(m_file_read_buffer.get()); - if (ErrorCode_Success != error_code) { - if (ErrorCode_EndOfFile == error_code) { - num_bytes_read = num_bytes_to_read - m_decompression_stream->avail_out; - m_decompressed_stream_pos += num_bytes_read; - return ErrorCode_EndOfFile; - } - } - } - } - - lzma_ret return_value = lzma_code(m_decompression_stream, LZMA_RUN); - switch (return_value) { - case LZMA_OK: - case LZMA_BUF_ERROR: - if (0 == m_decompression_stream->avail_out) { - m_decompression_stream->next_out = nullptr; - num_bytes_read = num_bytes_to_read; - m_decompressed_stream_pos += num_bytes_read; - return ErrorCode_Success; - } - // by breaking here, enter the next iteration of decompressing - break; - case LZMA_STREAM_END: - if (0 == m_decompression_stream->avail_out) { - m_decompression_stream->next_out = nullptr; - num_bytes_read = num_bytes_to_read; - m_decompressed_stream_pos += num_bytes_read; - return ErrorCode_Success; - } - 
SPDLOG_ERROR("streaming_compression::lzma::Decompressor wants to read more but " - "reached end of file"); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - case LZMA_MEM_ERROR: - SPDLOG_ERROR("streaming_compression::lzma::Decompressor inflate() ran out of memory" - ); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - default: - SPDLOG_ERROR("inflate() returned an unexpected value - {}.", int(return_value)); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - } -} - -ErrorCode Decompressor::try_seek_from_begin(size_t pos) { - if (InputType::NotInitialized == m_input_type) { - throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); - } - - // Check if we've already decompressed passed the desired position - if (m_decompressed_stream_pos > pos) { - // ZStd has no way for us to seek back to the desired position, so just reset the stream - // to the beginning - reset_stream(); - } - - // We need to fast-forward the decompression stream to decompressed_stream_pos - ErrorCode error; - while (m_decompressed_stream_pos < pos) { - size_t num_bytes_to_decompress = std::min( - m_unused_decompressed_stream_block_size, - pos - m_decompressed_stream_pos - ); - error = try_read_exact_length( - m_unused_decompressed_stream_block_buffer.get(), - num_bytes_to_decompress - ); - if (ErrorCode_Success != error) { - return error; - } - } - - return ErrorCode_Success; -} - -ErrorCode Decompressor::try_get_pos(size_t& pos) { - if (InputType::NotInitialized == m_input_type) { - return ErrorCode_NotInit; - } - - pos = m_decompressed_stream_pos; - return ErrorCode_Success; -} - -void Decompressor::close() { - if (InputType::NotInitialized == m_input_type) { - return; - } - lzma_end(m_decompression_stream); - m_decompression_stream->avail_out = 0; - m_decompression_stream->next_out = nullptr; - if (InputType::MemoryMappedCompressedFile == m_input_type) { - if (m_memory_mapped_compressed_file.is_open()) { - // An existing 
file is memory mapped by the decompressor - m_memory_mapped_compressed_file.close(); - } - } else if (InputType::File == m_input_type) { - m_file_read_buffer.reset(); - m_file_read_buffer_capacity = 0; - m_file_read_buffer_length = 0; - m_file_reader = nullptr; - } - m_input_type = InputType::NotInitialized; -} - -void Decompressor::init_decoder(lzma_stream* strm) { - // Initialize a .xz decoder. The decoder supports a memory usage limit - // and a set of flags. - // - // The memory usage of the decompressor depends on the settings used - // to compress a .xz file. It can vary from less than a megabyte to - // a few gigabytes, but in practice (at least for now) it rarely - // exceeds 65 MiB because that's how much memory is required to - // decompress files created with "xz -9". Settings requiring more - // memory take extra effort to use and don't (at least for now) - // provide significantly better compression in most cases. - // - // Memory usage limit is useful if it is important that the - // decompressor won't consume gigabytes of memory. The need - // for limiting depends on the application. In this example, - // no memory usage limiting is used. This is done by setting - // the limit to UINT64_MAX. - // - // The .xz format allows concatenating compressed files as is: - // - // echo foo | xz > foobar.xz - // echo bar | xz >> foobar.xz - // - // When decompressing normal standalone .xz files, LZMA_CONCATENATED - // should always be used to support decompression of concatenated - // .xz files. If LZMA_CONCATENATED isn't used, the decoder will stop - // after the first .xz stream. This can be useful when .xz data has - // been embedded inside another file format. - // - // Flags other than LZMA_CONCATENATED are supported too, and can - // be combined with bitwise-or. See lzma/container.h - // (src/liblzma/api/lzma/container.h in the source package or e.g. - // /usr/include/lzma/container.h depending on the install prefix) - // for details. 
- lzma_ret ret = lzma_stream_decoder(strm, UINT64_MAX, LZMA_CONCATENATED); - - // Return successfully if the initialization went fine. - if (ret == LZMA_OK) { - return; - } - - // Something went wrong. The possible errors are documented in - // lzma/container.h (src/liblzma/api/lzma/container.h in the source - // package or e.g. /usr/include/lzma/container.h depending on the - // install prefix). - // - // Note that LZMA_MEMLIMIT_ERROR is never possible here. If you - // specify a very tiny limit, the error will be delayed until - // the first headers have been parsed by a call to lzma_code(). - char const* msg; - switch (ret) { - case LZMA_MEM_ERROR: - msg = "Memory allocation failed"; - break; - - case LZMA_OPTIONS_ERROR: - msg = "Unsupported decompressor flags"; - break; - - default: - // This is most likely LZMA_PROG_ERROR indicating a bug in - // this program or in liblzma. It is inconvenient to have a - // separate error message for errors that should be impossible - // to occur, but knowing the error code is important for - // debugging. That's why it is good to print the error code - // at least when there is no good error message to show. 
- msg = "Unknown error, possibly a bug"; - break; - } - - SPDLOG_ERROR("Error initializing the decoder: {} (error code {})", msg, int(ret)); -} - -void Decompressor::open(char const* compressed_data_buf, size_t compressed_data_buf_size) { - if (InputType::NotInitialized != m_input_type) { - throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); - } - m_input_type = InputType::CompressedDataBuf; - - // Configure input stream - reset_stream(); - m_decompression_stream->next_in - = reinterpret_cast(const_cast(compressed_data_buf)); - m_decompression_stream->avail_in = compressed_data_buf_size; - m_decompression_stream->next_out = nullptr; - m_decompression_stream->avail_out = 0; -} - -ErrorCode Decompressor::open(std::string const& compressed_file_path) { - if (InputType::NotInitialized != m_input_type) { - throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); - } - m_input_type = InputType::MemoryMappedCompressedFile; - - // Create memory mapping for compressed_file_path, use boost read only memory mapped file - boost::system::error_code boost_error_code; - size_t compressed_file_size - = boost::filesystem::file_size(compressed_file_path, boost_error_code); - if (boost_error_code) { - SPDLOG_ERROR( - "streaming_compression::zstd::Decompressor: Unable to obtain file size for " - "'{}' - {}.", - compressed_file_path.c_str(), - boost_error_code.message().c_str() - ); - return ErrorCode_Failure; - } - - boost::iostreams::mapped_file_params memory_map_params; - memory_map_params.path = compressed_file_path; - memory_map_params.flags = boost::iostreams::mapped_file::readonly; - memory_map_params.length = compressed_file_size; - memory_map_params.hint = m_memory_mapped_compressed_file.data( - ); // Try to map it to the same memory location as previous memory mapped file - m_memory_mapped_compressed_file.open(memory_map_params); - if (!m_memory_mapped_compressed_file.is_open()) { - SPDLOG_ERROR( - "streaming_compression::lzma::Decompressor: Unable to 
memory map the " - "compressed file with path: {}", - compressed_file_path.c_str() - ); - return ErrorCode_Failure; - } - - // Configure input stream - reset_stream(); - m_decompression_stream->next_in - = reinterpret_cast(const_cast(m_memory_mapped_compressed_file.data())); - m_decompression_stream->avail_in = compressed_file_size; - m_decompression_stream->next_out = nullptr; - m_decompression_stream->avail_out = 0; - - return ErrorCode_Success; -} - -void Decompressor::open(FileReader& file_reader, size_t file_read_buffer_capacity) { - if (InputType::NotInitialized != m_input_type) { - throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); - } - m_input_type = InputType::File; - - m_file_reader = &file_reader; - m_file_reader_initial_pos = m_file_reader->get_pos(); - - m_file_read_buffer_capacity = file_read_buffer_capacity; - m_file_read_buffer = std::make_unique(m_file_read_buffer_capacity); - m_file_read_buffer_length = 0; - - // Configure input stream - reset_stream(); - m_decompression_stream->next_in = reinterpret_cast(m_file_read_buffer.get()); - m_decompression_stream->avail_in = m_file_read_buffer_length; - m_decompression_stream->next_out = nullptr; - m_decompression_stream->avail_out = 0; -} - -ErrorCode Decompressor::get_decompressed_stream_region( - size_t decompressed_stream_pos, - char* extraction_buf, - size_t extraction_len -) { - auto error_code = try_seek_from_begin(decompressed_stream_pos); - if (ErrorCode_Success != error_code) { - return error_code; - } - - error_code = try_read_exact_length(extraction_buf, extraction_len); - return error_code; -} - -void Decompressor::reset_stream() { - if (InputType::File == m_input_type) { - m_file_reader->seek_from_begin(m_file_reader_initial_pos); - m_file_read_buffer_length = 0; - } - m_decompressed_stream_pos = 0; - init_decoder(m_decompression_stream); -} -} // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp 
b/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp deleted file mode 100644 index 5e90f5942..000000000 --- a/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp +++ /dev/null @@ -1,163 +0,0 @@ -#ifndef CLP_STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP -#define CLP_STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP - -// C++ standard libraries -#include -#include - -// ZLIB library -#include - -#include -// Boost libraries -#include - -// Project headers -#include "../../FileReader.hpp" -#include "../../TraceableException.hpp" -#include "../Decompressor.hpp" - -namespace clp::streaming_compression::lzma { -class Decompressor : public ::clp::streaming_compression::Decompressor { -public: - // Types - class OperationFailed : public TraceableException { - public: - // Constructors - OperationFailed(ErrorCode error_code, char const* const filename, int line_number) - : TraceableException(error_code, filename, line_number) {} - - // Methods - char const* what() const noexcept override { - return "streaming_compression::lzma::Decompressor operation failed"; - } - }; - - // Constructor - Decompressor(); - - // Destructor - ~Decompressor(); - - // Explicitly disable copy and move constructor/assignment - Decompressor(Decompressor const&) = delete; - Decompressor& operator=(Decompressor const&) = delete; - - // Methods implementing the ReaderInterface - /** - * Tries to read up to a given number of bytes from the decompressor - * @param buf - * @param num_bytes_to_read The number of bytes to try and read - * @param num_bytes_read The actual number of bytes read - * @return Same as FileReader::try_read if the decompressor is attached to a file - * @return ErrorCode_NotInit if the decompressor is not open - * @return ErrorCode_BadParam if buf is invalid - * @return ErrorCode_EndOfFile on EOF - * @return ErrorCode_Failure on decompression failure - * @return ErrorCode_Success on success - */ - ErrorCode try_read(char* buf, size_t num_bytes_to_read, 
size_t& num_bytes_read) override; - - /** - */ - void exact_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read); - - /** - * Tries to seek from the beginning to the given position - * @param pos - * @return ErrorCode_NotInit if the decompressor is not open - * @return Same as ReaderInterface::try_read_exact_length - * @return ErrorCode_Success on success - */ - ErrorCode try_seek_from_begin(size_t pos) override; - /** - * Tries to get the current position of the read head - * @param pos Position of the read head in the file - * @return ErrorCode_NotInit if the decompressor is not open - * @return ErrorCode_Success on success - */ - ErrorCode try_get_pos(size_t& pos) override; - - // Methods implementing the Decompressor interface - void close() override; - /** - * Decompresses and copies the range of uncompressed data described by - * decompressed_stream_pos and extraction_len into extraction_buf - * @param decompressed_stream_pos - * @param extraction_buf - * @param extraction_len - * @return Same as streaming_compression::zstd::Decompressor::try_seek_from_begin - * @return Same as ReaderInterface::try_read_exact_length - */ - ErrorCode get_decompressed_stream_region( - size_t decompressed_stream_pos, - char* extraction_buf, - size_t extraction_len - ) override; - - // Methods - /*** - * Initialize streaming decompressor to decompress from the specified compressed data buffer - * @param compressed_data_buf - * @param compressed_data_buf_size - */ - void open(char const* compressed_data_buf, size_t compressed_data_buf_size) override; - - /*** - * Initialize streaming decompressor to decompress from a compressed file specified by the - * given path - * @param compressed_file_path - * @param decompressed_stream_block_size - * @return ErrorCode_Failure if the provided path cannot be memory mapped - * @return ErrorCode_Success on success - */ - ErrorCode open(std::string const& compressed_file_path); - - /** - * Initializes the decompressor to decompress 
from an open file - * @param file_reader - * @param file_read_buffer_capacity The maximum amount of data to read from a file at a time - */ - void open(FileReader& file_reader, size_t file_read_buffer_capacity) override; - -private: - // Enum class - enum class InputType { - NotInitialized, // Note: do nothing but generate an error to prevent this required - // parameter is not initialized properly - CompressedDataBuf, - MemoryMappedCompressedFile, - File - }; - - // Methods - /** - * Reset streaming decompression state so it will start decompressing from the beginning of - * the stream afterwards - */ - void reset_stream(); - - void init_decoder(lzma_stream* strm); - - // Variables - InputType m_input_type; - - // Compressed stream variables - lzma_stream* m_decompression_stream{nullptr}; - - boost::iostreams::mapped_file_source m_memory_mapped_compressed_file; - FileReader* m_file_reader; - size_t m_file_reader_initial_pos; - std::unique_ptr m_file_read_buffer; - size_t m_file_read_buffer_length; - size_t m_file_read_buffer_capacity; - - size_t m_decompressed_stream_pos; - size_t m_unused_decompressed_stream_block_size; - std::unique_ptr m_unused_decompressed_stream_block_buffer; - - char const* m_compressed_stream_block; - size_t m_compressed_stream_block_size; -}; -} // namespace clp::streaming_compression::lzma -#endif // CLP_STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index d58d4c1ce..6dac8ba52 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -16,7 +16,6 @@ #include "../src/clp/streaming_compression/Compressor.hpp" #include "../src/clp/streaming_compression/Decompressor.hpp" #include "../src/clp/streaming_compression/lzma/Compressor.hpp" -#include "../src/clp/streaming_compression/lzma/Decompressor.hpp" #include 
"../src/clp/streaming_compression/passthrough/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Decompressor.hpp" #include "../src/clp/streaming_compression/zstd/Compressor.hpp" @@ -59,7 +58,6 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { SECTION("LZMA compression") { compressor = std::make_unique(); - decompressor = std::make_unique(); } // Initialize buffers @@ -81,6 +79,11 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { file_writer.close(); // Decompress and compare + if (nullptr == decompressor) { + boost::filesystem::remove(compressed_file_path); + return; + } + clp::ReadOnlyMemoryMappedFile const memory_mapped_compressed_file{compressed_file_path}; auto const compressed_file_view{memory_mapped_compressed_file.get_view()}; decompressor->open(compressed_file_view.data(), compressed_file_view.size()); From 26b06638740d15c5657de301138d46977da25203 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Sat, 30 Nov 2024 02:11:46 -0500 Subject: [PATCH 09/65] Address review concerns --- .../streaming_compression/lzma/Compressor.cpp | 99 +++++++++---------- .../streaming_compression/lzma/Compressor.hpp | 18 +--- 2 files changed, 50 insertions(+), 67 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index c7b46cd6c..6092207d6 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -15,17 +15,15 @@ #include "Constants.hpp" namespace clp::streaming_compression::lzma { -using clp::size_checked_pointer_cast; - -auto Compressor::init_lzma_encoder(LzmaStream* strm, int compression_level, size_t dict_size) +auto Compressor::init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void { - LzmaOptionsLzma options; + lzma_options_lzma options; if (0 != lzma_lzma_preset(&options, compression_level)) { 
SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } options.dict_size = dict_size; - std::array filters{{ + std::array filters{{ {.id = LZMA_FILTER_LZMA2, .options = &options}, {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, }}; @@ -34,10 +32,10 @@ auto Compressor::init_lzma_encoder(LzmaStream* strm, int compression_level, size // to CRC64, which is the default in the xz command line tool. If // the .xz file needs to be decompressed with XZ Embedded, use // LZMA_CHECK_CRC32 instead. - auto const ret{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; + auto const rc{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; // Return successfully if the initialization went fine. - if (LZMA_OK == ret) { + if (LZMA_OK == rc) { return; } @@ -46,7 +44,7 @@ auto Compressor::init_lzma_encoder(LzmaStream* strm, int compression_level, size // package or e.g. /usr/include/lzma/container.h depending on the // install prefix). char const* msg{nullptr}; - switch (ret) { + switch (rc) { case LZMA_MEM_ERROR: msg = "Memory allocation failed"; break; @@ -60,17 +58,12 @@ auto Compressor::init_lzma_encoder(LzmaStream* strm, int compression_level, size break; default: - // This is most likely LZMA_PROG_ERROR indicating a bug in - // this program or in liblzma. It is inconvenient to have a - // separate error message for errors that should be impossible - // to occur, but knowing the error code is important for - // debugging. That's why it is good to print the error code - // at least when there is no good error message to show. 
- msg = "Unknown error, possibly a bug"; + // This is most likely LZMA_PROG_ERROR indicating a bug in liblzma + msg = "Unknown error"; break; } - SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, static_cast(ret)); + SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, static_cast(rc)); throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } @@ -85,9 +78,12 @@ auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { m_compression_stream = LZMA_STREAM_INIT; init_lzma_encoder(&m_compression_stream, compression_level, m_dict_size); - // Setup compressed stream parameters + + // No input upon initialization m_compression_stream.next_in = nullptr; m_compression_stream.avail_in = 0; + + // Attach output buffer to LZMA stream m_compression_stream.next_out = m_compressed_stream_block_buffer.data(); m_compression_stream.avail_out = m_compressed_stream_block_buffer.size(); @@ -101,7 +97,13 @@ auto Compressor::close() -> void { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - flush_and_close_compression_stream(); + run_lzma(LZMA_FINISH); + lzma_end(&m_compression_stream); + + // Detach output buffer from LZMA stream + m_compression_stream.next_out = nullptr; + m_compression_stream.avail_out = 0; + m_compressed_stream_file_writer = nullptr; } @@ -119,27 +121,22 @@ auto Compressor::write(char const* data, size_t data_length) -> void { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - m_compression_stream.next_in = size_checked_pointer_cast(data); + // Attach input data to LZMA stream + m_compression_stream.next_in = clp::size_checked_pointer_cast(data); m_compression_stream.avail_in = data_length; - // Normal compression encoding workflow. Continue until the input buffer is - // exhausted. 
- compress(LZMA_RUN); - - m_compression_stream.next_in = nullptr; + run_lzma(LZMA_RUN); - m_compression_stream_contains_data = true; m_uncompressed_stream_pos += data_length; } auto Compressor::flush() -> void { - if (false == m_compression_stream_contains_data) { + if (m_compression_stream_is_flushed) { return; } // Forces all the buffered data to be available at output - compress(LZMA_SYNC_FLUSH); - m_compression_stream_contains_data = false; + run_lzma(LZMA_SYNC_FLUSH); } auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { @@ -151,43 +148,39 @@ auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { return ErrorCode_Success; } -auto Compressor::flush_and_close_compression_stream() -> void { - if (nullptr == m_compressed_stream_file_writer) { - throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); - } - - // Same as flush but all the input data must have been given to the encoder - compress(LZMA_FINISH); - - m_compression_stream_contains_data = false; - - lzma_end(&m_compression_stream); - m_compression_stream.avail_out = 0; - m_compression_stream.next_out = nullptr; -} - -auto Compressor::compress(LzmaAction action) -> void { - bool hit_stream_end{false}; +auto Compressor::run_lzma(lzma_action action) -> void { + m_compression_stream_is_flushed = false; + bool end_of_stream{false}; while (true) { + if (0 == m_compression_stream.avail_in) { // No more input data + if (LZMA_RUN == action) { + // All input data have been processed, so we can safely detach + // input data from LZMA stream. 
+ m_compression_stream.next_in = nullptr; + break; + } + } else { + if (LZMA_FINISH == action) { + SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + } + auto const rc = lzma_code(&m_compression_stream, action); switch (rc) { case LZMA_OK: case LZMA_BUF_ERROR: break; case LZMA_STREAM_END: - hit_stream_end = true; + end_of_stream = true; break; default: SPDLOG_ERROR("lzma() returned an unexpected value - {}.", static_cast(rc)); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - if (LZMA_RUN == action && 0 == m_compression_stream.avail_in) { - // No more data to compress - break; - } - - if (hit_stream_end) { + if (end_of_stream) { + m_compression_stream_is_flushed = true; break; } @@ -205,7 +198,7 @@ auto Compressor::compress(LzmaAction action) -> void { auto Compressor::pipe_data() -> void { m_compressed_stream_file_writer->write( - size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), + clp::size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream.avail_out ); m_compression_stream.next_out = m_compressed_stream_block_buffer.data(); diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index d10810e88..5b1adb404 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -86,11 +86,6 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto open(FileWriter& file_writer, int compression_level) -> void; private: - using LzmaAction = lzma_action; - using LzmaFilter = lzma_filter; - using LzmaOptionsLzma = lzma_options_lzma; - using LzmaStream = lzma_stream; - /** * Initialize the Lzma compression stream * @param strm A pre-allocated `lzma_stream` object @@ -99,14 +94,9 
@@ class Compressor : public ::clp::streaming_compression::Compressor { * recently processed uncompressed data is kept in memory */ static auto - init_lzma_encoder(LzmaStream* strm, int compression_level, size_t dict_size) -> void; + init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void; static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB - /** - * Flushes the stream and closes it - */ - auto flush_and_close_compression_stream() -> void; - /** * Repeatedly invoke lzma_code() compression workflow until LZMA_STREAM_END * is reached. @@ -115,7 +105,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { * * @param action */ - auto compress(lzma_action action) -> void; + auto run_lzma(lzma_action action) -> void; /** * Pipes the current compressed data in the lzma buffer to the output file @@ -127,8 +117,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { FileWriter* m_compressed_stream_file_writer{nullptr}; // Compressed stream variables - LzmaStream m_compression_stream; - bool m_compression_stream_contains_data{false}; + lzma_stream m_compression_stream; + bool m_compression_stream_is_flushed{true}; size_t m_dict_size{cDefaultDictionarySize}; Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; From 740bc1c1216f999c881dffd49564eeabcf1d4bbd Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 10:20:12 -0500 Subject: [PATCH 10/65] Address review concern --- .../streaming_compression/lzma/Compressor.cpp | 26 +++++++++++++------ .../streaming_compression/lzma/Compressor.hpp | 15 +++-------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 6092207d6..a1d5dfaa2 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ 
b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -14,13 +14,21 @@ #include "../../type_utils.hpp" #include "Constants.hpp" -namespace clp::streaming_compression::lzma { -auto Compressor::init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) - -> void { +namespace { +using clp::streaming_compression::lzma::Compressor; + +/** + * Initialize the Lzma compression stream + * @param strm A pre-allocated `lzma_stream` object + * @param compression_level + * @param dict_size Dictionary size that indicates how many bytes of the + * recently processed uncompressed data is kept in memory + */ +auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void { lzma_options_lzma options; if (0 != lzma_lzma_preset(&options, compression_level)) { SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); - throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + throw Compressor::OperationFailed(clp::ErrorCode_BadParam, __FILENAME__, __LINE__); } options.dict_size = dict_size; std::array filters{{ @@ -64,9 +72,11 @@ auto Compressor::init_lzma_encoder(lzma_stream* strm, int compression_level, siz } SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, static_cast(rc)); - throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + throw Compressor::OperationFailed(clp::ErrorCode_BadParam, __FILENAME__, __LINE__); } +} // namespace +namespace clp::streaming_compression::lzma { auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { if (nullptr != m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); @@ -186,17 +196,17 @@ auto Compressor::run_lzma(lzma_action action) -> void { // Write output buffer to file if it's full if (0 == m_compression_stream.avail_out) { - pipe_data(); + flush_stream_output_block_buffer(); } } // Write remaining compressed data if (m_compression_stream.avail_out < 
cCompressedStreamBlockBufferSize) { - pipe_data(); + flush_stream_output_block_buffer(); } } -auto Compressor::pipe_data() -> void { +auto Compressor::flush_stream_output_block_buffer() -> void { m_compressed_stream_file_writer->write( clp::size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream.avail_out diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 5b1adb404..4afdce36a 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -86,15 +86,6 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto open(FileWriter& file_writer, int compression_level) -> void; private: - /** - * Initialize the Lzma compression stream - * @param strm A pre-allocated `lzma_stream` object - * @param compression_level - * @param dict_size Dictionary size that indicates how many bytes of the - * recently processed uncompressed data is kept in memory - */ - static auto - init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void; static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB /** @@ -108,10 +99,10 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto run_lzma(lzma_action action) -> void; /** - * Pipes the current compressed data in the lzma buffer to the output file - * and reset the compression buffer to receive new data. + * Flushes the current compressed data in the lzma output buffer to the + * output file handler. Reset the compression buffer to receive new data. 
*/ - auto pipe_data() -> void; + auto flush_stream_output_block_buffer() -> void; // Variables FileWriter* m_compressed_stream_file_writer{nullptr}; From e2be8833595b3281cdeaaccfcd1255849ce33b29 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 10:22:18 -0500 Subject: [PATCH 11/65] Simplify else-if --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index a1d5dfaa2..c40ca7652 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -169,8 +169,7 @@ auto Compressor::run_lzma(lzma_action action) -> void { m_compression_stream.next_in = nullptr; break; } - } else { - if (LZMA_FINISH == action) { + } else if (LZMA_FINISH == action) { SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } From 905367d6e4e08174fb30b7da67d00e5455ad14de Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 10:23:19 -0500 Subject: [PATCH 12/65] Fix else-if --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index c40ca7652..610c7cc17 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -170,9 +170,8 @@ auto Compressor::run_lzma(lzma_action action) -> void { break; } } else if (LZMA_FINISH == action) { - SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } + 
SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } auto const rc = lzma_code(&m_compression_stream, action); From 8ae88b2a86f880a02133ac9ee3cb3a1ed5921a9d Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 10:44:30 -0500 Subject: [PATCH 13/65] Add lzma (xz) dep to MacOS --- components/core/tools/scripts/lib_install/macos/install-all.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/components/core/tools/scripts/lib_install/macos/install-all.sh b/components/core/tools/scripts/lib_install/macos/install-all.sh index 97e41903d..cb24dd054 100755 --- a/components/core/tools/scripts/lib_install/macos/install-all.sh +++ b/components/core/tools/scripts/lib_install/macos/install-all.sh @@ -21,6 +21,7 @@ brew install \ mongo-cxx-driver \ msgpack-cxx \ spdlog \ + xz \ zstd # Install pkg-config if it isn't already installed From 0d0c20eaf35271572b068fe03887695d0f62f69d Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 12:24:45 -0500 Subject: [PATCH 14/65] Refactor helper run_lzma() --- .../streaming_compression/lzma/Compressor.cpp | 79 +++++++++---------- .../streaming_compression/lzma/Compressor.hpp | 2 + 2 files changed, 38 insertions(+), 43 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 610c7cc17..11260c6e9 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -107,7 +107,14 @@ auto Compressor::close() -> void { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - run_lzma(LZMA_FINISH); + if (m_compression_stream.avail_in > 0) { + SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + + while (false == 
m_compression_stream_is_flushed) { + run_lzma(LZMA_FINISH); + } lzma_end(&m_compression_stream); // Detach output buffer from LZMA stream @@ -134,19 +141,22 @@ auto Compressor::write(char const* data, size_t data_length) -> void { // Attach input data to LZMA stream m_compression_stream.next_in = clp::size_checked_pointer_cast(data); m_compression_stream.avail_in = data_length; + m_compression_stream_is_flushed = false; - run_lzma(LZMA_RUN); + while (m_compression_stream.avail_in > 0) { + run_lzma(LZMA_RUN); + } + + // All input data have been encoded so detach input data + m_compression_stream.next_in = nullptr; m_uncompressed_stream_pos += data_length; } auto Compressor::flush() -> void { - if (m_compression_stream_is_flushed) { - return; + while (false == m_compression_stream_is_flushed) { + run_lzma(LZMA_SYNC_FLUSH); } - - // Forces all the buffered data to be available at output - run_lzma(LZMA_SYNC_FLUSH); } auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { @@ -159,52 +169,35 @@ auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { } auto Compressor::run_lzma(lzma_action action) -> void { - m_compression_stream_is_flushed = false; - bool end_of_stream{false}; - while (true) { - if (0 == m_compression_stream.avail_in) { // No more input data - if (LZMA_RUN == action) { - // All input data have been processed, so we can safely detach - // input data from LZMA stream. 
- m_compression_stream.next_in = nullptr; - break; - } - } else if (LZMA_FINISH == action) { - SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - - auto const rc = lzma_code(&m_compression_stream, action); - switch (rc) { - case LZMA_OK: - case LZMA_BUF_ERROR: - break; - case LZMA_STREAM_END: - end_of_stream = true; - break; - default: - SPDLOG_ERROR("lzma() returned an unexpected value - {}.", static_cast(rc)); + auto const rc = lzma_code(&m_compression_stream, action); + switch (rc) { + case LZMA_OK: + break; + case LZMA_BUF_ERROR: // No encoding progress can be made + if (m_compression_stream.avail_in > 0) { + SPDLOG_ERROR("LZMA compressor input stream is corrupt."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - - if (end_of_stream) { + } + break; + case LZMA_STREAM_END: m_compression_stream_is_flushed = true; break; - } - - // Write output buffer to file if it's full - if (0 == m_compression_stream.avail_out) { - flush_stream_output_block_buffer(); - } + default: + SPDLOG_ERROR("lzma() returned an unexpected value - {}.", static_cast(rc)); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - // Write remaining compressed data - if (m_compression_stream.avail_out < cCompressedStreamBlockBufferSize) { + // Write output buffer to file if it's full or flushed + if (0 == m_compression_stream.avail_out || m_compression_stream_is_flushed) { flush_stream_output_block_buffer(); } } auto Compressor::flush_stream_output_block_buffer() -> void { + if (cCompressedStreamBlockBufferSize == m_compression_stream.avail_out) { + // Nothing to flush + return; + } m_compressed_stream_file_writer->write( clp::size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream.avail_out diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp 
b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 4afdce36a..1953001f2 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -53,6 +53,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { /** * Writes any internally buffered data to file and ends the current frame + * + * Forces all the encoded data buffered by LZMA to be available at output */ auto flush() -> void override; From 559485d18c64eb32e7a72169ff303b3baae07d52 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 12:36:07 -0500 Subject: [PATCH 15/65] Update function doc --- .../core/src/clp/streaming_compression/lzma/Compressor.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 1953001f2..3eb062223 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -91,9 +91,10 @@ class Compressor : public ::clp::streaming_compression::Compressor { static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB /** - * Repeatedly invoke lzma_code() compression workflow until LZMA_STREAM_END - * is reached. - * The workflow action needs to be kept the same throughout this process. + * Invoke lzma_code() encoding workflow for one time with the given action. + * + * Once flushing starts, the workflow action needs to stay the same until + * flushing is complete (aka LZMA_STREAM_END is reached). 
* See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 * * @param action From 7c69c6919f6fd41a83d0f5e5865bb565014e9723 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 12:48:02 -0500 Subject: [PATCH 16/65] Clarify unit test early termination --- components/core/tests/test-StreamingCompression.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 6dac8ba52..a47012ca3 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -4,8 +4,10 @@ #include #include #include +#include #include +#include #include #include @@ -78,12 +80,16 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { compressor->close(); file_writer.close(); - // Decompress and compare - if (nullptr == decompressor) { + if (boost::dynamic_pointer_cast( + std::move(compressor) + )) + { + // TODO: remove this LZMA testing early termination boost::filesystem::remove(compressed_file_path); return; } + // Decompress and compare clp::ReadOnlyMemoryMappedFile const memory_mapped_compressed_file{compressed_file_path}; auto const compressed_file_view{memory_mapped_compressed_file.get_view()}; decompressor->open(compressed_file_view.data(), compressed_file_view.size()); From a6d68b8f66fa2e9978cad8486d2c6b9220b78c10 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 12:52:29 -0500 Subject: [PATCH 17/65] Update components/core/tests/test-StreamingCompression.cpp Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- components/core/tests/test-StreamingCompression.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index a47012ca3..a7f2ee78c 100644 --- 
a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -58,7 +58,9 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { decompressor = std::make_unique(); } - SECTION("LZMA compression") { + SECTION("LZMA compression (compression-only test)") { + // Note: Decompressor initialization is intentionally omitted as this is a + // compression-only test. See early termination logic below. compressor = std::make_unique(); } From 1519c21c7d88d4860c00e243abae6ca8443d5fa1 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 3 Dec 2024 02:34:23 -0500 Subject: [PATCH 18/65] Split LZMA_RUN from flush actions --- .../streaming_compression/lzma/Compressor.cpp | 86 ++++++++++++++----- .../streaming_compression/lzma/Compressor.hpp | 18 ++-- 2 files changed, 76 insertions(+), 28 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 11260c6e9..e6e95e7c8 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -17,6 +17,11 @@ namespace { using clp::streaming_compression::lzma::Compressor; +auto is_flush_action(lzma_action action) { + return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action + || LZMA_FINISH == action; +} + /** * Initialize the Lzma compression stream * @param strm A pre-allocated `lzma_stream` object @@ -42,7 +47,6 @@ auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_siz // LZMA_CHECK_CRC32 instead. auto const rc{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; - // Return successfully if the initialization went fine. 
if (LZMA_OK == rc) { return; } @@ -112,9 +116,7 @@ auto Compressor::close() -> void { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - while (false == m_compression_stream_is_flushed) { - run_lzma(LZMA_FINISH); - } + flush_lzma(LZMA_FINISH); lzma_end(&m_compression_stream); // Detach output buffer from LZMA stream @@ -141,10 +143,9 @@ auto Compressor::write(char const* data, size_t data_length) -> void { // Attach input data to LZMA stream m_compression_stream.next_in = clp::size_checked_pointer_cast(data); m_compression_stream.avail_in = data_length; - m_compression_stream_is_flushed = false; while (m_compression_stream.avail_in > 0) { - run_lzma(LZMA_RUN); + encode_lzma_once(); } // All input data have been encoded so detach input data @@ -154,9 +155,7 @@ auto Compressor::write(char const* data, size_t data_length) -> void { } auto Compressor::flush() -> void { - while (false == m_compression_stream_is_flushed) { - run_lzma(LZMA_SYNC_FLUSH); - } + flush_lzma(LZMA_SYNC_FLUSH); } auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { @@ -168,29 +167,70 @@ auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { return ErrorCode_Success; } -auto Compressor::run_lzma(lzma_action action) -> void { - auto const rc = lzma_code(&m_compression_stream, action); +auto Compressor::encode_lzma_once() -> void { + if (0 == m_compression_stream.avail_in) { + return; + } + + if (0 == m_compression_stream.avail_out) { + flush_stream_output_block_buffer(); + } + + auto const rc = lzma_code(&m_compression_stream, LZMA_RUN); switch (rc) { case LZMA_OK: break; case LZMA_BUF_ERROR: // No encoding progress can be made - if (m_compression_stream.avail_in > 0) { - SPDLOG_ERROR("LZMA compressor input stream is corrupt."); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - break; - case LZMA_STREAM_END: - m_compression_stream_is_flushed = true; - break; + SPDLOG_ERROR("LZMA compressor input stream is corrupt."); + throw 
OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: - SPDLOG_ERROR("lzma() returned an unexpected value - {}.", static_cast(rc)); + SPDLOG_ERROR("lzma_code() returned an unexpected value - {}.", static_cast(rc)); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } +} - // Write output buffer to file if it's full or flushed - if (0 == m_compression_stream.avail_out || m_compression_stream_is_flushed) { - flush_stream_output_block_buffer(); +auto Compressor::flush_lzma(lzma_action flush_action) -> void { + if (false == is_flush_action(flush_action)) { + SPDLOG_ERROR( + "lzma_code() supplied with invalid flush action - {}.", + static_cast(flush_action) + ); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + } + + bool flushed{false}; + while (false == flushed) { + auto const rc = lzma_code(&m_compression_stream, flush_action); + switch (rc) { + case LZMA_OK: + break; + case LZMA_STREAM_END: + // NOTE: this might not be true when multithreaded encoder is + // used with LZMA_FULL_BARRIER. For now, we skip this check. + flushed = true; + break; + case LZMA_BUF_ERROR: // No encoding progress can be made + // NOTE: this can happen if we are using LZMA_FULL_FLUSH or + // LZMA_FULL_BARRIER. These two actions keeps encoding input + // data alongside flushing already encoded but buffered data. 
+ SPDLOG_ERROR("LZMA compressor input stream is corrupt."); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + default: + SPDLOG_ERROR( + "lzma_code() returned an unexpected value - {}.", + static_cast(rc) + ); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + + // Write output buffer to file if it's full + if (0 == m_compression_stream.avail_out) { + flush_stream_output_block_buffer(); + } } + + // Write the last chunk of output + flush_stream_output_block_buffer(); } auto Compressor::flush_stream_output_block_buffer() -> void { diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 3eb062223..045345829 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -91,15 +91,24 @@ class Compressor : public ::clp::streaming_compression::Compressor { static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB /** - * Invoke lzma_code() encoding workflow for one time with the given action. + * Invoke lzma_code() encoding workflow once with LZMA_RUN + * + * The encoded data may be buffered and thus not immediately available at + * the output block. + */ + auto encode_lzma_once() -> void; + + /** + * Invoke lzma_code() repeatedly with the given flushing action until all + * encoded data is made available at the output block * * Once flushing starts, the workflow action needs to stay the same until - * flushing is complete (aka LZMA_STREAM_END is reached). + * flushing is signaled completed by LZMA (aka LZMA_STREAM_END is reached). 
* See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 * - * @param action + * @param flush_action */ - auto run_lzma(lzma_action action) -> void; + auto flush_lzma(lzma_action flush_action) -> void; /** * Flushes the current compressed data in the lzma output buffer to the @@ -112,7 +121,6 @@ class Compressor : public ::clp::streaming_compression::Compressor { // Compressed stream variables lzma_stream m_compression_stream; - bool m_compression_stream_is_flushed{true}; size_t m_dict_size{cDefaultDictionarySize}; Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; From 655bb46dcf853e41bf790444d550506c66ff6163 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 3 Dec 2024 03:26:10 -0500 Subject: [PATCH 19/65] Refactor unit test --- .../core/tests/test-StreamingCompression.cpp | 122 +++++++++--------- 1 file changed, 64 insertions(+), 58 deletions(-) diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index a47012ca3..a52a42ef7 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -4,10 +4,10 @@ #include #include #include +#include #include #include -#include #include #include @@ -28,69 +28,39 @@ using clp::ErrorCode_Success; using clp::FileWriter; using clp::streaming_compression::Compressor; using clp::streaming_compression::Decompressor; - -TEST_CASE("StreamingCompression", "[StreamingCompression]") { - // Initialize constants - constexpr size_t cBufferSize{128L * 1024 * 1024}; // 128MB - constexpr auto cCompressionChunkSizes = std::to_array( - {cBufferSize / 100, - cBufferSize / 50, - cBufferSize / 25, - cBufferSize / 10, - cBufferSize / 5, - cBufferSize / 2, - cBufferSize} - ); - constexpr size_t cAlphabetLength{26}; - std::string const compressed_file_path{"test_streaming_compressed_file.bin"}; - - // Initialize compression devices - std::unique_ptr compressor; - 
std::unique_ptr decompressor; - - SECTION("ZStd single phase compression") { - compressor = std::make_unique(); - decompressor = std::make_unique(); - } - - SECTION("Passthrough compression") { - compressor = std::make_unique(); - decompressor = std::make_unique(); - } - - SECTION("LZMA compression") { - compressor = std::make_unique(); - } - - // Initialize buffers - Array uncompressed_buffer{cBufferSize}; - for (size_t i{0}; i < cBufferSize; ++i) { - uncompressed_buffer.at(i) = static_cast(('a' + (i % cAlphabetLength))); - } - - Array decompressed_buffer{cBufferSize}; - - // Compress +using std::string; +using std::string_view; + +namespace { +constexpr string_view cCompressedFilePath{"test_streaming_compressed_file.bin"}; +constexpr size_t cBufferSize{128L * 1024 * 1024}; // 128MB +constexpr auto cCompressionChunkSizes = std::to_array( + {cBufferSize / 100, + cBufferSize / 50, + cBufferSize / 25, + cBufferSize / 10, + cBufferSize / 5, + cBufferSize / 2, + cBufferSize} +); + +auto compress(std::unique_ptr compressor, char const* const src) -> void { FileWriter file_writer; - file_writer.open(compressed_file_path, FileWriter::OpenMode::CREATE_FOR_WRITING); + file_writer.open(string(cCompressedFilePath), FileWriter::OpenMode::CREATE_FOR_WRITING); compressor->open(file_writer); for (auto const chunk_size : cCompressionChunkSizes) { - compressor->write(uncompressed_buffer.data(), chunk_size); + compressor->write(src, chunk_size); } compressor->close(); file_writer.close(); +} - if (boost::dynamic_pointer_cast( - std::move(compressor) - )) - { - // TODO: remove this LZMA testing early termination - boost::filesystem::remove(compressed_file_path); - return; - } - - // Decompress and compare - clp::ReadOnlyMemoryMappedFile const memory_mapped_compressed_file{compressed_file_path}; +auto decompress_and_compare( + std::unique_ptr decompressor, + Array const& uncompressed_buffer, + Array& decompressed_buffer +) -> void { + clp::ReadOnlyMemoryMappedFile const 
memory_mapped_compressed_file{string(cCompressedFilePath)}; auto const compressed_file_view{memory_mapped_compressed_file.get_view()}; decompressor->open(compressed_file_view.data(), compressed_file_view.size()); @@ -123,7 +93,43 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { ) == num_uncompressed_bytes) ); +} +} // namespace + +TEST_CASE("StreamingCompression", "[StreamingCompression]") { + // Initialize constants + constexpr size_t cAlphabetLength{26}; + + // Initialize compression devices + std::unique_ptr compressor; + std::unique_ptr decompressor; + + // Initialize buffers + Array decompressed_buffer{cBufferSize}; + Array uncompressed_buffer{cBufferSize}; + for (size_t i{0}; i < cBufferSize; ++i) { + uncompressed_buffer.at(i) = static_cast(('a' + (i % cAlphabetLength))); + } + + SECTION("ZStd single phase compression") { + compressor = std::make_unique(); + compress(std::move(compressor), uncompressed_buffer.data()); + decompressor = std::make_unique(); + decompress_and_compare(std::move(decompressor), uncompressed_buffer, decompressed_buffer); + } + + SECTION("Passthrough compression") { + compressor = std::make_unique(); + compress(std::move(compressor), uncompressed_buffer.data()); + decompressor = std::make_unique(); + decompress_and_compare(std::move(decompressor), uncompressed_buffer, decompressed_buffer); + } + + SECTION("LZMA compression") { + compressor = std::make_unique(); + compress(std::move(compressor), uncompressed_buffer.data()); + } // Cleanup - boost::filesystem::remove(compressed_file_path); + boost::filesystem::remove(string(cCompressedFilePath)); } From 4fb6c0147a054fdf7970c10ccf64d2435eb13bce Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 3 Dec 2024 03:29:06 -0500 Subject: [PATCH 20/65] Update components/core/src/clp/streaming_compression/lzma/Compressor.cpp Co-authored-by: haiqi96 <14502009+haiqi96@users.noreply.github.com> --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 11260c6e9..11bfdc5b5 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -21,8 +21,8 @@ using clp::streaming_compression::lzma::Compressor; * Initialize the Lzma compression stream * @param strm A pre-allocated `lzma_stream` object * @param compression_level - * @param dict_size Dictionary size that indicates how many bytes of the - * recently processed uncompressed data is kept in memory + * @param dict_size Dictionary size that specifies how many bytes of the + * recently processed uncompressed data to keep in the memory */ auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void { lzma_options_lzma options; From 2b85f01d7d6934a19df1203baa2beba94fc395f6 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 3 Dec 2024 03:35:42 -0500 Subject: [PATCH 21/65] Fix import --- components/core/tests/test-StreamingCompression.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 9d28a5ec3..2b2dfe85f 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -26,7 +26,6 @@ using clp::Array; using clp::ErrorCode_Success; using clp::FileWriter; using clp::streaming_compression::Compressor; -using clp::streaming_compression::Decompressor; using std::string; using std::string_view; From eda7d6c97a4da5884a5439adfc582e0dab1aabe8 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 4 Dec 2024 10:19:35 -0500 Subject: [PATCH 22/65] Apply suggestions from code review Co-authored-by: haiqi96 <14502009+haiqi96@users.noreply.github.com> --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 
4 ++-- .../core/src/clp/streaming_compression/lzma/Compressor.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 071ad77b8..f5c0fedd4 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -17,13 +17,13 @@ namespace { using clp::streaming_compression::lzma::Compressor; -auto is_flush_action(lzma_action action) { +auto is_flush_action(lzma_action action) -> bool { return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action || LZMA_FINISH == action; } /** - * Initialize the Lzma compression stream + * Initialize the LZMA compression stream * @param strm A pre-allocated `lzma_stream` object * @param compression_level * @param dict_size Dictionary size that specifies how many bytes of the diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 045345829..593c26835 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -100,7 +100,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { /** * Invoke lzma_code() repeatedly with the given flushing action until all - * encoded data is made available at the output block + * encoded data is made available at the output block buffer * * Once flushing starts, the workflow action needs to stay the same until * flushing is signaled completed by LZMA (aka LZMA_STREAM_END is reached). 
@@ -111,7 +111,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto flush_lzma(lzma_action flush_action) -> void; /** - * Flushes the current compressed data in the lzma output buffer to the + * Flushes the current compressed data in the LZMA output buffer to the * output file handler. Reset the compression buffer to receive new data. */ auto flush_stream_output_block_buffer() -> void; From 4164a9d43731dcae330d87338e19882bbc437e62 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 4 Dec 2024 11:01:41 -0500 Subject: [PATCH 23/65] Address review concern --- .../streaming_compression/lzma/Compressor.cpp | 59 +++++++++++-------- .../streaming_compression/lzma/Compressor.hpp | 18 +++--- .../core/tests/test-StreamingCompression.cpp | 2 + 3 files changed, 46 insertions(+), 33 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index f5c0fedd4..50a813ea4 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -144,12 +144,11 @@ auto Compressor::write(char const* data, size_t data_length) -> void { m_compression_stream.next_in = clp::size_checked_pointer_cast(data); m_compression_stream.avail_in = data_length; - while (m_compression_stream.avail_in > 0) { - encode_lzma_once(); - } + encode_lzma(); // All input data have been encoded so detach input data m_compression_stream.next_in = nullptr; + m_compression_stream.avail_in = 0; m_uncompressed_stream_pos += data_length; } @@ -167,26 +166,31 @@ auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { return ErrorCode_Success; } -auto Compressor::encode_lzma_once() -> void { - if (0 == m_compression_stream.avail_in) { - return; - } +auto Compressor::encode_lzma() -> void { + while (m_compression_stream.avail_in > 0) { + // Write output buffer to file if it's full + if (0 == 
m_compression_stream.avail_out) { + flush_stream_output_block_buffer(); + } - if (0 == m_compression_stream.avail_out) { - flush_stream_output_block_buffer(); + auto const rc = lzma_code(&m_compression_stream, LZMA_RUN); + switch (rc) { + case LZMA_OK: + break; + case LZMA_BUF_ERROR: // No encoding progress can be made + SPDLOG_ERROR("LZMA compressor input stream is corrupt."); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + default: + SPDLOG_ERROR( + "lzma_code() returned an unexpected value - {}.", + static_cast(rc) + ); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } } - auto const rc = lzma_code(&m_compression_stream, LZMA_RUN); - switch (rc) { - case LZMA_OK: - break; - case LZMA_BUF_ERROR: // No encoding progress can be made - SPDLOG_ERROR("LZMA compressor input stream is corrupt."); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - default: - SPDLOG_ERROR("lzma_code() returned an unexpected value - {}.", static_cast(rc)); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } + // Write the last chunk of output + flush_stream_output_block_buffer(); } auto Compressor::flush_lzma(lzma_action flush_action) -> void { @@ -198,8 +202,18 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } + /** + * Once flushing starts, the workflow action needs to stay the same until + * flushing is signaled completed by LZMA (aka LZMA_STREAM_END is reached). 
+ * See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 + */ bool flushed{false}; while (false == flushed) { + // Write output buffer to file if it's full + if (0 == m_compression_stream.avail_out) { + flush_stream_output_block_buffer(); + } + auto const rc = lzma_code(&m_compression_stream, flush_action); switch (rc) { case LZMA_OK: @@ -222,11 +236,6 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { ); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - - // Write output buffer to file if it's full - if (0 == m_compression_stream.avail_out) { - flush_stream_output_block_buffer(); - } } // Write the last chunk of output diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 593c26835..c8c12b9cb 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -91,22 +91,24 @@ class Compressor : public ::clp::streaming_compression::Compressor { static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB /** - * Invoke lzma_code() encoding workflow once with LZMA_RUN + * Invoke lzma_code() repeatedly with LZMA_RUN until the input is exhausted * - * The encoded data may be buffered and thus not immediately available at - * the output block. + * At the end of the workflow, the last bytes of encoded data may still be + * buffered and thus not immediately available at the output block buffer. + * + * Assumes input stream and output block buffer are both in valid states. 
+ * @throw `OperationFailed` if LZMA returns an unexpected error value */ - auto encode_lzma_once() -> void; + auto encode_lzma() -> void; /** * Invoke lzma_code() repeatedly with the given flushing action until all * encoded data is made available at the output block buffer * - * Once flushing starts, the workflow action needs to stay the same until - * flushing is signaled completed by LZMA (aka LZMA_STREAM_END is reached). - * See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 - * + * Assumes input stream and output block buffer are both in valid states. * @param flush_action + * @throw `OperationFailed` if the provided action is not an LZMA flush + * action, or if LZMA returns an unexpected error value */ auto flush_lzma(lzma_action flush_action) -> void; diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 2b2dfe85f..a52a42ef7 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -16,6 +16,7 @@ #include "../src/clp/FileWriter.hpp" #include "../src/clp/ReadOnlyMemoryMappedFile.hpp" #include "../src/clp/streaming_compression/Compressor.hpp" +#include "../src/clp/streaming_compression/Decompressor.hpp" #include "../src/clp/streaming_compression/lzma/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Decompressor.hpp" @@ -26,6 +27,7 @@ using clp::Array; using clp::ErrorCode_Success; using clp::FileWriter; using clp::streaming_compression::Compressor; +using clp::streaming_compression::Decompressor; using std::string; using std::string_view; From 8ab0653c8e555e3e1d62d9631c7077410d3f475b Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 4 Dec 2024 20:24:52 -0500 Subject: [PATCH 24/65] Add a comment --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 50a813ea4..65445061a 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -117,6 +117,8 @@ auto Compressor::close() -> void { } flush_lzma(LZMA_FINISH); + + // Deallocates LZMA stream's internal data structures lzma_end(&m_compression_stream); // Detach output buffer from LZMA stream From c436f214669b5895f64cf429be383cf48f3e0f6a Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 6 Dec 2024 00:44:25 -0500 Subject: [PATCH 25/65] Apply suggestions from code review Co-authored-by: haiqi96 <14502009+haiqi96@users.noreply.github.com> --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 5 ++++- .../core/src/clp/streaming_compression/lzma/Compressor.hpp | 6 +++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 65445061a..8d518249c 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -156,6 +156,9 @@ auto Compressor::write(char const* data, size_t data_length) -> void { } auto Compressor::flush() -> void { + if (nullptr == m_compressed_stream_file_writer) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } flush_lzma(LZMA_SYNC_FLUSH); } @@ -228,7 +231,7 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { case LZMA_BUF_ERROR: // No encoding progress can be made // NOTE: this can happen if we are using LZMA_FULL_FLUSH or // LZMA_FULL_BARRIER. These two actions keeps encoding input - // data alongside flushing already encoded but buffered data. + // data alongside flushing buffered encoded data. 
SPDLOG_ERROR("LZMA compressor input stream is corrupt."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index c8c12b9cb..323464545 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -94,7 +94,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { * Invoke lzma_code() repeatedly with LZMA_RUN until the input is exhausted * * At the end of the workflow, the last bytes of encoded data may still be - * buffered and thus not immediately available at the output block buffer. + * buffered in the LZMA stream and thus not immediately available at the output block buffer. * * Assumes input stream and output block buffer are both in valid states. * @throw `OperationFailed` if LZMA returns an unexpected error value @@ -113,8 +113,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto flush_lzma(lzma_action flush_action) -> void; /** - * Flushes the current compressed data in the LZMA output buffer to the - * output file handler. Reset the compression buffer to receive new data. + * Flushes the current compressed data in the output block buffer to the + * output file handler. Reset the output block buffer to receive new data. 
*/ auto flush_stream_output_block_buffer() -> void; From 7bd34d256797514c0de5e19570f8bdf8d02cc6b1 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 6 Dec 2024 01:06:36 -0500 Subject: [PATCH 26/65] Update comment to 100-char length --- .../streaming_compression/lzma/Compressor.cpp | 30 ++++++++----------- .../streaming_compression/lzma/Compressor.hpp | 12 ++++---- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 8d518249c..6c4a29206 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -41,20 +41,18 @@ auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_siz {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, }}; - // Initialize the encoder using a preset. Set the integrity to check - // to CRC64, which is the default in the xz command line tool. If - // the .xz file needs to be decompressed with XZ Embedded, use - // LZMA_CHECK_CRC32 instead. + // Initialize the encoder using a preset. Set the integrity to check to CRC64, which is the + // default in the xz command line tool. If the .xz file needs to be decompressed with + // XZ-Embedded, use LZMA_CHECK_CRC32 instead. auto const rc{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; if (LZMA_OK == rc) { return; } - // Something went wrong. The possible errors are documented in - // lzma/container.h (src/liblzma/api/lzma/container.h in the source - // package or e.g. /usr/include/lzma/container.h depending on the - // install prefix). + // Something went wrong. The possible errors are documented in lzma/container.h + // (src/liblzma/api/lzma/container.h in the source package or e.g. /usr/include/lzma/container.h + // depending on the install prefix). 
char const* msg{nullptr}; switch (rc) { case LZMA_MEM_ERROR: @@ -193,9 +191,6 @@ auto Compressor::encode_lzma() -> void { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } } - - // Write the last chunk of output - flush_stream_output_block_buffer(); } auto Compressor::flush_lzma(lzma_action flush_action) -> void { @@ -208,8 +203,8 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { } /** - * Once flushing starts, the workflow action needs to stay the same until - * flushing is signaled completed by LZMA (aka LZMA_STREAM_END is reached). + * Once flushing starts, the workflow action needs to stay the same until flushing is signaled + * complete by LZMA (aka LZMA_STREAM_END is reached). * See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 */ bool flushed{false}; @@ -224,14 +219,13 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { case LZMA_OK: break; case LZMA_STREAM_END: - // NOTE: this might not be true when multithreaded encoder is - // used with LZMA_FULL_BARRIER. For now, we skip this check. + // NOTE: this might not be true when multithreaded encoder is used with + // LZMA_FULL_BARRIER. For now, we skip this check. flushed = true; break; case LZMA_BUF_ERROR: // No encoding progress can be made - // NOTE: this can happen if we are using LZMA_FULL_FLUSH or - // LZMA_FULL_BARRIER. These two actions keeps encoding input - // data alongside flushing buffered encoded data. + // NOTE: this can happen if we are using LZMA_FULL_FLUSH or LZMA_FULL_BARRIER. These + // two actions keeps encoding input data alongside flushing buffered encoded data. 
SPDLOG_ERROR("LZMA compressor input stream is corrupt."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 323464545..286819893 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -93,8 +93,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { /** * Invoke lzma_code() repeatedly with LZMA_RUN until the input is exhausted * - * At the end of the workflow, the last bytes of encoded data may still be - * buffered in the LZMA stream and thus not immediately available at the output block buffer. + * At the end of the workflow, the last bytes of encoded data may still be buffered in the LZMA + * stream and thus not immediately available at the output block buffer. * * Assumes input stream and output block buffer are both in valid states. * @throw `OperationFailed` if LZMA returns an unexpected error value @@ -102,8 +102,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto encode_lzma() -> void; /** - * Invoke lzma_code() repeatedly with the given flushing action until all - * encoded data is made available at the output block buffer + * Invoke lzma_code() repeatedly with the given flushing action until all encoded data is made + * available at the output block buffer * * Assumes input stream and output block buffer are both in valid states. * @param flush_action @@ -113,8 +113,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto flush_lzma(lzma_action flush_action) -> void; /** - * Flushes the current compressed data in the output block buffer to the - * output file handler. Reset the output block buffer to receive new data. + * Flushes the current compressed data in the output block buffer to the output file handler. 
+ * Reset the output block buffer to receive new data. */ auto flush_stream_output_block_buffer() -> void; From efd2b2759088c874c2d0a1191b8e4e1d1d16105f Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 10 Dec 2024 23:05:31 -0500 Subject: [PATCH 27/65] Fix according to coding style guidelines --- .../streaming_compression/lzma/Compressor.cpp | 16 ++++++++++------ .../streaming_compression/lzma/Compressor.hpp | 18 +++++++++--------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 6c4a29206..dc2ca222f 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -17,18 +17,22 @@ namespace { using clp::streaming_compression::lzma::Compressor; -auto is_flush_action(lzma_action action) -> bool { - return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action - || LZMA_FINISH == action; -} +auto is_flush_action(lzma_action action) -> bool; /** - * Initialize the LZMA compression stream + * Initializes the LZMA compression stream * @param strm A pre-allocated `lzma_stream` object * @param compression_level * @param dict_size Dictionary size that specifies how many bytes of the * recently processed uncompressed data to keep in the memory */ +auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void; + +auto is_flush_action(lzma_action action) -> bool { + return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action + || LZMA_FINISH == action; +} + auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void { lzma_options_lzma options; if (0 != lzma_lzma_preset(&options, compression_level)) { @@ -41,7 +45,7 @@ auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_siz {.id = 
LZMA_VLI_UNKNOWN, .options = nullptr}, }}; - // Initialize the encoder using a preset. Set the integrity to check to CRC64, which is the + // Initializes the encoder using a preset. Set the integrity to check to CRC64, which is the // default in the xz command line tool. If the .xz file needs to be decompressed with // XZ-Embedded, use LZMA_CHECK_CRC32 instead. auto const rc{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 286819893..b4255cc1c 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -43,6 +43,13 @@ class Compressor : public ::clp::streaming_compression::Compressor { Compressor(Compressor&&) noexcept = default; auto operator=(Compressor&&) noexcept -> Compressor& = default; + /** + * Initializes the compression stream with the given compression level + * @param file_writer + * @param compression_level + */ + auto open(FileWriter& file_writer, int compression_level) -> void; + // Methods implementing the WriterInterface /** * Writes the given data to the compressor @@ -80,18 +87,11 @@ class Compressor : public ::clp::streaming_compression::Compressor { this->open(file_writer, cDefaultCompressionLevel); } - /** - * Initializes the compression stream with the given compression level - * @param file_writer - * @param compression_level - */ - auto open(FileWriter& file_writer, int compression_level) -> void; - private: static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB /** - * Invoke lzma_code() repeatedly with LZMA_RUN until the input is exhausted + * Invokes lzma_code() repeatedly with LZMA_RUN until the input is exhausted * * At the end of the workflow, the last bytes of encoded data may still be buffered in the LZMA * stream and thus not immediately available at the 
output block buffer. @@ -102,7 +102,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto encode_lzma() -> void; /** - * Invoke lzma_code() repeatedly with the given flushing action until all encoded data is made + * Invokes lzma_code() repeatedly with the given flushing action until all encoded data is made * available at the output block buffer * * Assumes input stream and output block buffer are both in valid states. From c530f9287ecf51350220bfca501347f3f79b1d5b Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 11 Dec 2024 21:10:58 -0500 Subject: [PATCH 28/65] Apply suggestions from code review Co-authored-by: davidlion --- components/core/tools/scripts/lib_install/liblzma.sh | 2 +- .../lib_install/ubuntu-focal/install-prebuilt-packages.sh | 2 +- .../lib_install/ubuntu-jammy/install-prebuilt-packages.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/core/tools/scripts/lib_install/liblzma.sh b/components/core/tools/scripts/lib_install/liblzma.sh index 28766eced..a73ff79b9 100755 --- a/components/core/tools/scripts/lib_install/liblzma.sh +++ b/components/core/tools/scripts/lib_install/liblzma.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Exit on any error set -e diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh index f1e2ee4ff..b373cbe4d 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh @@ -19,8 +19,8 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ git \ libcurl4 \ libcurl4-openssl-dev \ - libmariadb-dev \ liblzma-dev \ + libmariadb-dev \ libssl-dev \ make \ openjdk-11-jdk \ diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh 
b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh index 4911a6a98..e2e17283b 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh @@ -19,8 +19,8 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ libboost-program-options-dev \ libcurl4 \ libcurl4-openssl-dev \ - libmariadb-dev \ liblzma-dev \ + libmariadb-dev \ libssl-dev \ openjdk-11-jdk \ pkg-config \ From e751ee6f5fe3d757713520b494a2e23edc1a6453 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 12 Dec 2024 01:27:10 -0500 Subject: [PATCH 29/65] Update CMakeLists.txt --- components/core/CMakeLists.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 312c6e2ef..9d0c51c9f 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -11,16 +11,16 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # Set general compressor set(GENERAL_COMPRESSOR "zstd" CACHE STRING "The general-purpose compressor used as the 2nd-stage compressor") -set_property(CACHE GENERAL_COMPRESSOR PROPERTY STRINGS passthrough zstd lzma) -if ("${GENERAL_COMPRESSOR}" STREQUAL "passthrough") +set_property(CACHE GENERAL_COMPRESSOR PROPERTY STRINGS lzma passthrough zstd) +if ("${GENERAL_COMPRESSOR}" STREQUAL "lzma") + add_definitions(-DUSE_LZMA_COMPRESSION=1) + message(STATUS "Using Lempel–Ziv–Markov chain Algorithm compression") +elseif ("${GENERAL_COMPRESSOR}" STREQUAL "passthrough") add_definitions(-DUSE_PASSTHROUGH_COMPRESSION=1) message(STATUS "Using passthrough compression") elseif ("${GENERAL_COMPRESSOR}" STREQUAL "zstd") add_definitions(-DUSE_ZSTD_COMPRESSION=1) message(STATUS "Using Zstandard compression") -elseif ("${GENERAL_COMPRESSOR}" STREQUAL "lzma") - add_definitions(-DUSE_LZMA_COMPRESSION=1) - message(STATUS "Using 
Lempel–Ziv–Markov chain Algorithm compression") else() message(SEND_ERROR "GENERAL_COMPRESSOR=${GENERAL_COMPRESSOR} is unimplemented.") endif() @@ -228,17 +228,17 @@ else() endif() # Find and setup LZMA Library -# Notice that we don't have support to switch between static and shared libraries. -# TODO: add a script in ./cmake/Modules to resolve .a vs. .so +# TODO: Add support to enforce static linking against LZMA when desired. For a hack, we can set +# `CMAKE_FIND_LIBRARY_SUFFIXES` to ask CMake to prefer the static lib over the shared one. find_package(LibLZMA REQUIRED) if(LIBLZMA_FOUND) message(STATUS "Found Lzma ${LIBLZMA_VERSION_STRING}") message(STATUS "Lzma library location: ${LIBLZMA_LIBRARIES}") + message(STATUS "Lzma Include Dir: ${LIBLZMA_INCLUDE_DIRS}") else() message(FATAL_ERROR "Could not find ${CLP_LIBS_STRING} libraries for Lzma") endif() include_directories(${LIBLZMA_INCLUDE_DIRS}) -message("Lzma Include Dir: ${LIBLZMA_INCLUDE_DIRS}") # sqlite dependencies set(sqlite_DYNAMIC_LIBS "dl;m;pthread") From 1c5efcdbb3567c16d9ed14a02eab50525f8ea426 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 12 Dec 2024 01:30:35 -0500 Subject: [PATCH 30/65] Address review concern --- .../clp/streaming_compression/lzma/Compressor.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index dc2ca222f..1330da53f 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -21,19 +21,19 @@ auto is_flush_action(lzma_action action) -> bool; /** * Initializes the LZMA compression stream - * @param strm A pre-allocated `lzma_stream` object + * @param stream A pre-allocated `lzma_stream` object * @param compression_level * @param dict_size Dictionary size that specifies how many bytes of the * recently processed 
uncompressed data to keep in the memory */ -auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void; +auto init_lzma_encoder(lzma_stream* stream, int compression_level, size_t dict_size) -> void; auto is_flush_action(lzma_action action) -> bool { return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action || LZMA_FINISH == action; } -auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void { +auto init_lzma_encoder(lzma_stream* stream, int compression_level, size_t dict_size) -> void { lzma_options_lzma options; if (0 != lzma_lzma_preset(&options, compression_level)) { SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); @@ -48,7 +48,7 @@ auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_siz // Initializes the encoder using a preset. Set the integrity to check to CRC64, which is the // default in the xz command line tool. If the .xz file needs to be decompressed with // XZ-Embedded, use LZMA_CHECK_CRC32 instead. 
- auto const rc{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; + auto const rc = lzma_stream_encoder(stream, filters.data(), LZMA_CHECK_CRC64); if (LZMA_OK == rc) { return; @@ -71,8 +71,11 @@ auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_siz msg = "Specified integrity check is not supported"; break; + case LZMA_PROG_ERROR: + msg = "Input arguments are not sane"; + break; + default: - // This is most likely LZMA_PROG_ERROR indicating a bug in liblzma msg = "Unknown error"; break; } From 856c7cb544a8122b8c9e7e9063d077a587da9913 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 12 Dec 2024 01:57:32 -0500 Subject: [PATCH 31/65] Update TODO --- components/core/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 9d0c51c9f..3b5f9aff4 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -228,8 +228,10 @@ else() endif() # Find and setup LZMA Library -# TODO: Add support to enforce static linking against LZMA when desired. For a hack, we can set -# `CMAKE_FIND_LIBRARY_SUFFIXES` to ask CMake to prefer the static lib over the shared one. 
+# TODO: Add a script in ./cmake/Modules to properly import LZMA in find_package()'s module mode +if(CLP_USE_STATIC_LIBS) + set(LibLZMA_USE_STATIC_LIBS ON) +endif() find_package(LibLZMA REQUIRED) if(LIBLZMA_FOUND) message(STATUS "Found Lzma ${LIBLZMA_VERSION_STRING}") From 43e22d2ec5a4480b6f02a0be31eec6f8efc5406c Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 12 Dec 2024 01:59:24 -0500 Subject: [PATCH 32/65] Case fix --- components/core/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 3b5f9aff4..160f6766d 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -230,7 +230,7 @@ endif() # Find and setup LZMA Library # TODO: Add a script in ./cmake/Modules to properly import LZMA in find_package()'s module mode if(CLP_USE_STATIC_LIBS) - set(LibLZMA_USE_STATIC_LIBS ON) + set(LIBLZMA_USE_STATIC_LIBS ON) endif() find_package(LibLZMA REQUIRED) if(LIBLZMA_FOUND) From 829a6b2d7c8bde7011c451022db7926593335ebd Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 12 Dec 2024 12:02:42 -0500 Subject: [PATCH 33/65] Remove unnecessary function inline comments --- .../streaming_compression/lzma/Compressor.cpp | 113 ++++++++++-------- .../streaming_compression/lzma/Compressor.hpp | 5 + .../core/tests/test-StreamingCompression.cpp | 15 ++- 3 files changed, 76 insertions(+), 57 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 1330da53f..7edd61ae9 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -8,6 +8,7 @@ #include #include +#include "../../Array.hpp" #include "../../ErrorCode.hpp" #include "../../FileWriter.hpp" #include "../../TraceableException.hpp" @@ -15,25 +16,68 @@ #include "Constants.hpp" namespace { +using clp::Array; using 
clp::streaming_compression::lzma::Compressor; +/** + * Attaches a pre-allocated block buffer to encoder's output stream + * + * Subsequent calls to this function resets the output buffer to its initial state. + * @param stream + * @param out_buffer + */ +auto attach_stream_output_buffer(lzma_stream* stream, Array& out_buffer) -> void; + +auto detach_stream_input_src(lzma_stream* stream) -> void; + +auto detach_stream_output_buffer(lzma_stream* stream) -> void; + auto is_flush_action(lzma_action action) -> bool; /** - * Initializes the LZMA compression stream - * @param stream A pre-allocated `lzma_stream` object + * Initializes an LZMA compression encoder and its streams + * + * @param stream A pre-allocated `lzma_stream` object that is to be initialized * @param compression_level * @param dict_size Dictionary size that specifies how many bytes of the * recently processed uncompressed data to keep in the memory + * @param check Type of integrity check calculated from the uncompressed data. LZMA_CHECK_CRC64 is + * the default in the xz command line tool. If the .xz file needs to be decompressed + * with XZ-Embedded, use LZMA_CHECK_CRC32 instead. 
*/ -auto init_lzma_encoder(lzma_stream* stream, int compression_level, size_t dict_size) -> void; +auto init_lzma_encoder( + lzma_stream* stream, + int compression_level, + size_t dict_size, + lzma_check check = LZMA_CHECK_CRC64 +) -> void; + +auto attach_stream_output_buffer(lzma_stream* stream, Array& out_buffer) -> void { + stream->next_out = out_buffer.data(); + stream->avail_out = out_buffer.size(); +} + +auto detach_stream_input_src(lzma_stream* stream) -> void { + stream->next_in = nullptr; + stream->avail_in = 0; +} + +auto detach_stream_output_buffer(lzma_stream* stream) -> void { + stream->next_out = nullptr; + stream->avail_out = 0; +} auto is_flush_action(lzma_action action) -> bool { return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action || LZMA_FINISH == action; } -auto init_lzma_encoder(lzma_stream* stream, int compression_level, size_t dict_size) -> void { +auto init_lzma_encoder( + lzma_stream* stream, + int compression_level, + size_t dict_size, + lzma_check check +) -> void { lzma_options_lzma options; if (0 != lzma_lzma_preset(&options, compression_level)) { SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); @@ -45,18 +89,11 @@ auto init_lzma_encoder(lzma_stream* stream, int compression_level, size_t dict_s {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, }}; - // Initializes the encoder using a preset. Set the integrity to check to CRC64, which is the - // default in the xz command line tool. If the .xz file needs to be decompressed with - // XZ-Embedded, use LZMA_CHECK_CRC32 instead. - auto const rc = lzma_stream_encoder(stream, filters.data(), LZMA_CHECK_CRC64); - + auto const rc = lzma_stream_encoder(stream, filters.data(), check); if (LZMA_OK == rc) { return; } - // Something went wrong. The possible errors are documented in lzma/container.h - // (src/liblzma/api/lzma/container.h in the source package or e.g. /usr/include/lzma/container.h - // depending on the install prefix). 
char const* msg{nullptr}; switch (rc) { case LZMA_MEM_ERROR: @@ -97,17 +134,9 @@ auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { m_compression_stream = LZMA_STREAM_INIT; init_lzma_encoder(&m_compression_stream, compression_level, m_dict_size); - - // No input upon initialization - m_compression_stream.next_in = nullptr; - m_compression_stream.avail_in = 0; - - // Attach output buffer to LZMA stream - m_compression_stream.next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream.avail_out = m_compressed_stream_block_buffer.size(); - + detach_stream_input_src(&m_compression_stream); + attach_stream_output_buffer(&m_compression_stream, m_compressed_stream_block_buffer); m_compressed_stream_file_writer = &file_writer; - m_uncompressed_stream_pos = 0; } @@ -122,14 +151,9 @@ auto Compressor::close() -> void { } flush_lzma(LZMA_FINISH); - // Deallocates LZMA stream's internal data structures lzma_end(&m_compression_stream); - - // Detach output buffer from LZMA stream - m_compression_stream.next_out = nullptr; - m_compression_stream.avail_out = 0; - + detach_stream_output_buffer(&m_compression_stream); m_compressed_stream_file_writer = nullptr; } @@ -139,7 +163,6 @@ auto Compressor::write(char const* data, size_t data_length) -> void { } if (0 == data_length) { - // Nothing needs to be done because we do not need to compress anything return; } @@ -147,16 +170,10 @@ auto Compressor::write(char const* data, size_t data_length) -> void { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - // Attach input data to LZMA stream m_compression_stream.next_in = clp::size_checked_pointer_cast(data); m_compression_stream.avail_in = data_length; - encode_lzma(); - - // All input data have been encoded so detach input data - m_compression_stream.next_in = nullptr; - m_compression_stream.avail_in = 0; - + detach_stream_input_src(&m_compression_stream); m_uncompressed_stream_pos += data_length; } @@ -178,7 +195,6 @@ auto 
Compressor::try_get_pos(size_t& pos) const -> ErrorCode { auto Compressor::encode_lzma() -> void { while (m_compression_stream.avail_in > 0) { - // Write output buffer to file if it's full if (0 == m_compression_stream.avail_out) { flush_stream_output_block_buffer(); } @@ -187,8 +203,10 @@ auto Compressor::encode_lzma() -> void { switch (rc) { case LZMA_OK: break; - case LZMA_BUF_ERROR: // No encoding progress can be made - SPDLOG_ERROR("LZMA compressor input stream is corrupt."); + case LZMA_BUF_ERROR: + SPDLOG_ERROR( + "LZMA compressor input stream is corrupt. No encoding progress can be made." + ); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: SPDLOG_ERROR( @@ -209,14 +227,8 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - /** - * Once flushing starts, the workflow action needs to stay the same until flushing is signaled - * complete by LZMA (aka LZMA_STREAM_END is reached). - * See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 - */ bool flushed{false}; while (false == flushed) { - // Write output buffer to file if it's full if (0 == m_compression_stream.avail_out) { flush_stream_output_block_buffer(); } @@ -230,10 +242,12 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { // LZMA_FULL_BARRIER. For now, we skip this check. flushed = true; break; - case LZMA_BUF_ERROR: // No encoding progress can be made + case LZMA_BUF_ERROR: // NOTE: this can happen if we are using LZMA_FULL_FLUSH or LZMA_FULL_BARRIER. These // two actions keeps encoding input data alongside flushing buffered encoded data. - SPDLOG_ERROR("LZMA compressor input stream is corrupt."); + SPDLOG_ERROR( + "LZMA compressor input stream is corrupt. No encoding progress can be made." 
+ ); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: SPDLOG_ERROR( @@ -244,20 +258,17 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { } } - // Write the last chunk of output flush_stream_output_block_buffer(); } auto Compressor::flush_stream_output_block_buffer() -> void { if (cCompressedStreamBlockBufferSize == m_compression_stream.avail_out) { - // Nothing to flush return; } m_compressed_stream_file_writer->write( clp::size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream.avail_out ); - m_compression_stream.next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream.avail_out = cCompressedStreamBlockBufferSize; + attach_stream_output_buffer(&m_compression_stream, m_compressed_stream_block_buffer); } } // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index b4255cc1c..986137aa2 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -45,6 +45,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { /** * Initializes the compression stream with the given compression level + * * @param file_writer * @param compression_level */ @@ -105,6 +106,10 @@ class Compressor : public ::clp::streaming_compression::Compressor { * Invokes lzma_code() repeatedly with the given flushing action until all encoded data is made * available at the output block buffer * + * Once flushing starts, the workflow action needs to stay the same until flushing is signaled + * complete by LZMA (aka LZMA_STREAM_END is reached). + * See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 + * * Assumes input stream and output block buffer are both in valid states. 
* @param flush_action * @throw `OperationFailed` if the provided action is not an LZMA flush diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index a52a42ef7..4076eb88f 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -44,7 +44,15 @@ constexpr auto cCompressionChunkSizes = std::to_array( cBufferSize} ); -auto compress(std::unique_ptr compressor, char const* const src) -> void { +auto compress(std::unique_ptr compressor, char const* src) -> void; + +auto decompress_and_compare( + std::unique_ptr decompressor, + Array const& uncompressed_buffer, + Array& decompressed_buffer +) -> void; + +auto compress(std::unique_ptr compressor, char const* src) -> void { FileWriter file_writer; file_writer.open(string(cCompressedFilePath), FileWriter::OpenMode::CREATE_FOR_WRITING); compressor->open(file_writer); @@ -84,7 +92,6 @@ auto decompress_and_compare( num_uncompressed_bytes += chunk_size; } - // Sanity check REQUIRE( (std::accumulate( cCompressionChunkSizes.cbegin(), @@ -97,14 +104,11 @@ auto decompress_and_compare( } // namespace TEST_CASE("StreamingCompression", "[StreamingCompression]") { - // Initialize constants constexpr size_t cAlphabetLength{26}; - // Initialize compression devices std::unique_ptr compressor; std::unique_ptr decompressor; - // Initialize buffers Array decompressed_buffer{cBufferSize}; Array uncompressed_buffer{cBufferSize}; for (size_t i{0}; i < cBufferSize; ++i) { @@ -130,6 +134,5 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { compress(std::move(compressor), uncompressed_buffer.data()); } - // Cleanup boost::filesystem::remove(string(cCompressedFilePath)); } From 81e180795cccd1b1d7380f989f0068e669d19b6b Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 12 Dec 2024 12:07:56 -0500 Subject: [PATCH 34/65] Improve comment --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 7edd61ae9..36a5038b4 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -238,7 +238,7 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { case LZMA_OK: break; case LZMA_STREAM_END: - // NOTE: this might not be true when multithreaded encoder is used with + // NOTE: flush may not have completed if a multithreaded encoder is using action // LZMA_FULL_BARRIER. For now, we skip this check. flushed = true; break; From 09b73c7ff6413066aa3d98a5694a04c130b50a4f Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 17 Dec 2024 00:56:19 -0500 Subject: [PATCH 35/65] Refactor lzma stream related functions into a nested helper class --- .../streaming_compression/lzma/Compressor.cpp | 194 +++++++----------- .../streaming_compression/lzma/Compressor.hpp | 56 ++++- 2 files changed, 123 insertions(+), 127 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 36a5038b4..4a43e93e8 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -8,134 +8,25 @@ #include #include -#include "../../Array.hpp" #include "../../ErrorCode.hpp" #include "../../FileWriter.hpp" #include "../../TraceableException.hpp" #include "../../type_utils.hpp" #include "Constants.hpp" -namespace { -using clp::Array; -using clp::streaming_compression::lzma::Compressor; - -/** - * Attaches a pre-allocated block buffer to encoder's output stream - * - * Subsequent calls to this function resets the output buffer to its initial state. 
- * @param stream - * @param out_buffer - */ -auto attach_stream_output_buffer(lzma_stream* stream, Array& out_buffer) -> void; - -auto detach_stream_input_src(lzma_stream* stream) -> void; - -auto detach_stream_output_buffer(lzma_stream* stream) -> void; - -auto is_flush_action(lzma_action action) -> bool; - -/** - * Initializes an LZMA compression encoder and its streams - * - * @param stream A pre-allocated `lzma_stream` object that is to be initialized - * @param compression_level - * @param dict_size Dictionary size that specifies how many bytes of the - * recently processed uncompressed data to keep in the memory - * @param check Type of integrity check calculated from the uncompressed data. LZMA_CHECK_CRC64 is - * the default in the xz command line tool. If the .xz file needs to be decompressed - * with XZ-Embedded, use LZMA_CHECK_CRC32 instead. - */ -auto init_lzma_encoder( - lzma_stream* stream, - int compression_level, - size_t dict_size, - lzma_check check = LZMA_CHECK_CRC64 -) -> void; - -auto attach_stream_output_buffer(lzma_stream* stream, Array& out_buffer) -> void { - stream->next_out = out_buffer.data(); - stream->avail_out = out_buffer.size(); -} - -auto detach_stream_input_src(lzma_stream* stream) -> void { - stream->next_in = nullptr; - stream->avail_in = 0; -} - -auto detach_stream_output_buffer(lzma_stream* stream) -> void { - stream->next_out = nullptr; - stream->avail_out = 0; -} - -auto is_flush_action(lzma_action action) -> bool { - return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action - || LZMA_FINISH == action; -} - -auto init_lzma_encoder( - lzma_stream* stream, - int compression_level, - size_t dict_size, - lzma_check check -) -> void { - lzma_options_lzma options; - if (0 != lzma_lzma_preset(&options, compression_level)) { - SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); - throw Compressor::OperationFailed(clp::ErrorCode_BadParam, __FILENAME__, __LINE__); - } - 
options.dict_size = dict_size; - std::array filters{{ - {.id = LZMA_FILTER_LZMA2, .options = &options}, - {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, - }}; - - auto const rc = lzma_stream_encoder(stream, filters.data(), check); - if (LZMA_OK == rc) { - return; - } - - char const* msg{nullptr}; - switch (rc) { - case LZMA_MEM_ERROR: - msg = "Memory allocation failed"; - break; - - case LZMA_OPTIONS_ERROR: - msg = "Specified preset is not supported"; - break; - - case LZMA_UNSUPPORTED_CHECK: - msg = "Specified integrity check is not supported"; - break; - - case LZMA_PROG_ERROR: - msg = "Input arguments are not sane"; - break; - - default: - msg = "Unknown error"; - break; - } - - SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, static_cast(rc)); - throw Compressor::OperationFailed(clp::ErrorCode_BadParam, __FILENAME__, __LINE__); -} -} // namespace - namespace clp::streaming_compression::lzma { auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { if (nullptr != m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); } - if (compression_level < cMinCompressionLevel || compression_level > cMaxCompressionLevel) { throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); } + m_compression_level = compression_level; - m_compression_stream = LZMA_STREAM_INIT; - init_lzma_encoder(&m_compression_stream, compression_level, m_dict_size); - detach_stream_input_src(&m_compression_stream); - attach_stream_output_buffer(&m_compression_stream, m_compressed_stream_block_buffer); + m_lzma_ops.init_lzma_encoder(); + m_lzma_ops.detach_input_src(); + m_lzma_ops.attach_output_buffer(); m_compressed_stream_file_writer = &file_writer; m_uncompressed_stream_pos = 0; } @@ -153,7 +44,7 @@ auto Compressor::close() -> void { flush_lzma(LZMA_FINISH); // Deallocates LZMA stream's internal data structures lzma_end(&m_compression_stream); - 
detach_stream_output_buffer(&m_compression_stream); + m_lzma_ops.detach_output_buffer(); m_compressed_stream_file_writer = nullptr; } @@ -173,7 +64,7 @@ auto Compressor::write(char const* data, size_t data_length) -> void { m_compression_stream.next_in = clp::size_checked_pointer_cast(data); m_compression_stream.avail_in = data_length; encode_lzma(); - detach_stream_input_src(&m_compression_stream); + m_lzma_ops.detach_input_src(); m_uncompressed_stream_pos += data_length; } @@ -188,7 +79,6 @@ auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { if (nullptr == m_compressed_stream_file_writer) { return ErrorCode_NotInit; } - pos = m_uncompressed_stream_pos; return ErrorCode_Success; } @@ -198,7 +88,6 @@ auto Compressor::encode_lzma() -> void { if (0 == m_compression_stream.avail_out) { flush_stream_output_block_buffer(); } - auto const rc = lzma_code(&m_compression_stream, LZMA_RUN); switch (rc) { case LZMA_OK: @@ -219,7 +108,7 @@ auto Compressor::encode_lzma() -> void { } auto Compressor::flush_lzma(lzma_action flush_action) -> void { - if (false == is_flush_action(flush_action)) { + if (false == LzmaStreamOperations::is_flush_action(flush_action)) { SPDLOG_ERROR( "lzma_code() supplied with invalid flush action - {}.", static_cast(flush_action) @@ -232,7 +121,6 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { if (0 == m_compression_stream.avail_out) { flush_stream_output_block_buffer(); } - auto const rc = lzma_code(&m_compression_stream, flush_action); switch (rc) { case LZMA_OK: @@ -257,7 +145,6 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } } - flush_stream_output_block_buffer(); } @@ -269,6 +156,71 @@ auto Compressor::flush_stream_output_block_buffer() -> void { clp::size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream.avail_out ); - 
attach_stream_output_buffer(&m_compression_stream, m_compressed_stream_block_buffer); + m_lzma_ops.attach_output_buffer(); +} + +auto Compressor::LzmaStreamOperations::is_flush_action(lzma_action action) -> bool { + return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action + || LZMA_FINISH == action; +} + +auto Compressor::LzmaStreamOperations::attach_output_buffer() -> void { + m_p->m_compression_stream.next_out = m_p->m_compressed_stream_block_buffer.data(); + m_p->m_compression_stream.avail_out = m_p->m_compressed_stream_block_buffer.size(); +} + +auto Compressor::LzmaStreamOperations::detach_input_src() -> void { + m_p->m_compression_stream.next_in = nullptr; + m_p->m_compression_stream.avail_in = 0; +} + +auto Compressor::LzmaStreamOperations::detach_output_buffer() -> void { + m_p->m_compression_stream.next_out = nullptr; + m_p->m_compression_stream.avail_out = 0; +} + +auto Compressor::LzmaStreamOperations::init_lzma_encoder(lzma_check check) -> void { + lzma_options_lzma options; + if (0 != lzma_lzma_preset(&options, m_p->m_compression_level)) { + SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + } + options.dict_size = m_p->m_dict_size; + std::array filters{{ + {.id = LZMA_FILTER_LZMA2, .options = &options}, + {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, + }}; + + m_p->m_compression_stream = LZMA_STREAM_INIT; + auto const rc = lzma_stream_encoder(&m_p->m_compression_stream, filters.data(), check); + if (LZMA_OK == rc) { + return; + } + + char const* msg{nullptr}; + switch (rc) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_OPTIONS_ERROR: + msg = "Specified preset is not supported"; + break; + + case LZMA_UNSUPPORTED_CHECK: + msg = "Specified integrity check is not supported"; + break; + + case LZMA_PROG_ERROR: + msg = "Input arguments are not sane"; + break; + + default: + msg = "Unknown 
error"; + break; + } + + SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, static_cast(rc)); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } } // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 986137aa2..3e7af18ff 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -2,7 +2,7 @@ #define CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP #include -#include +#include #include @@ -89,6 +89,48 @@ class Compressor : public ::clp::streaming_compression::Compressor { } private: + class LzmaStreamOperations { + public: + // Constructor + LzmaStreamOperations(Compressor* parent) : m_p(parent) {} + + // Destructor + ~LzmaStreamOperations() = default; + + // Delete copy constructor and assignment operator + LzmaStreamOperations(LzmaStreamOperations const&) = delete; + auto operator=(LzmaStreamOperations const&) -> LzmaStreamOperations& = delete; + + // Default move constructor and assignment operator + LzmaStreamOperations(LzmaStreamOperations&&) noexcept = default; + auto operator=(LzmaStreamOperations&&) noexcept -> LzmaStreamOperations& = default; + + [[nodiscard]] static auto is_flush_action(lzma_action action) -> bool; + + /** + * Attaches a pre-allocated block buffer to the encoder's output stream + * + * Subsequent calls to this function resets the output buffer to its initial state. + */ + auto attach_output_buffer() -> void; + + auto detach_input_src() -> void; + + auto detach_output_buffer() -> void; + + /** + * Initializes an LZMA compression encoder and its streams + * + * @param check Type of integrity check calculated from the uncompressed data. + * LZMA_CHECK_CRC64 is the default in the xz command line tool. 
If the .xz file needs to be + * decompressed with XZ-Embedded, use LZMA_CHECK_CRC32 instead. + */ + auto init_lzma_encoder(lzma_check check = LZMA_CHECK_CRC64) -> void; + + private: + Compressor* m_p; + }; + static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB /** @@ -119,7 +161,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { /** * Flushes the current compressed data in the output block buffer to the output file handler. - * Reset the output block buffer to receive new data. + * + * Also resets the output block buffer to receive new data. */ auto flush_stream_output_block_buffer() -> void; @@ -127,11 +170,12 @@ class Compressor : public ::clp::streaming_compression::Compressor { FileWriter* m_compressed_stream_file_writer{nullptr}; // Compressed stream variables - lzma_stream m_compression_stream; - size_t m_dict_size{cDefaultDictionarySize}; - + LzmaStreamOperations m_lzma_ops{this}; Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; - + int m_compression_level{cDefaultCompressionLevel}; + lzma_stream m_compression_stream = LZMA_STREAM_INIT; + // Specifies how many bytes of the recently processed uncompressed data to keep in the memory + size_t m_dict_size{cDefaultDictionarySize}; size_t m_uncompressed_stream_pos{0}; }; } // namespace clp::streaming_compression::lzma From 7cedb25ef7831583219598374a14a433e80a9564 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 18 Dec 2024 21:27:35 -0500 Subject: [PATCH 36/65] Adress coderabbit suggestions --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 4a43e93e8..3e6bb0254 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -37,8 +37,9 @@ 
auto Compressor::close() -> void { } if (m_compression_stream.avail_in > 0) { - SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + SPDLOG_WARN("Trying to close LZMA compressor with unprocessed input data. Processing and " + "flushing remaining data."); + flush_lzma(LZMA_FULL_FLUSH); } flush_lzma(LZMA_FINISH); From 3dbe388342f6e4a8053f75bafcf75ed6a103b32b Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:54:47 -0500 Subject: [PATCH 37/65] feat(clp-package): Add support for deleting archives that are exclusively within a time range. (#594) Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../clp_package_utils/__init__.py | 13 ++ .../clp_package_utils/general.py | 2 +- .../clp_package_utils/scripts/compress.py | 10 +- .../clp_package_utils/scripts/decompress.py | 14 +- .../clp_package_utils/scripts/del_archives.py | 103 +++++++++++++ .../scripts/native/compress.py | 8 - .../scripts/native/decompress.py | 8 - .../scripts/native/del_archives.py | 139 ++++++++++++++++++ .../scripts/native/search.py | 8 - .../clp_package_utils/scripts/search.py | 10 +- .../clp_package_utils/scripts/start_clp.py | 10 +- .../clp_package_utils/scripts/stop_clp.py | 10 +- .../src/sbin/admin-tools/del-archives.sh | 9 ++ 13 files changed, 272 insertions(+), 72 deletions(-) create mode 100644 components/clp-package-utils/clp_package_utils/scripts/del_archives.py create mode 100644 components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py create mode 100755 components/package-template/src/sbin/admin-tools/del-archives.sh diff --git a/components/clp-package-utils/clp_package_utils/__init__.py b/components/clp-package-utils/clp_package_utils/__init__.py index e69de29bb..5253a87e5 100644 --- a/components/clp-package-utils/clp_package_utils/__init__.py +++ 
b/components/clp-package-utils/clp_package_utils/__init__.py @@ -0,0 +1,13 @@ +import logging + +# Set up console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter( + "%(asctime)s.%(msecs)03d %(levelname)s [%(module)s] %(message)s", datefmt="%Y-%m-%dT%H:%M:%S" +) +logging_console_handler.setFormatter(logging_formatter) + +# Set up root logger +root_logger = logging.getLogger() +root_logger.setLevel(logging.INFO) +root_logger.addHandler(logging_console_handler) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index f42542ebc..5fae8166f 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -107,7 +107,7 @@ def get_clp_home(): return clp_home.resolve() -def generate_container_name(job_type: JobType) -> str: +def generate_container_name(job_type: str) -> str: """ :param job_type: :return: A unique container name for the given job type. 
diff --git a/components/clp-package-utils/clp_package_utils/scripts/compress.py b/components/clp-package-utils/clp_package_utils/scripts/compress.py index d0aa30913..efd3180ae 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/compress.py @@ -18,15 +18,7 @@ validate_and_load_db_credentials_file, ) -# Setup logging -# Create logger logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) def main(argv): @@ -66,7 +58,7 @@ def main(argv): logger.exception("Failed to load config.") return -1 - container_name = generate_container_name(JobType.COMPRESSION) + container_name = generate_container_name(str(JobType.COMPRESSION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( diff --git a/components/clp-package-utils/clp_package_utils/scripts/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/decompress.py index 9085fb162..325f2add6 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/decompress.py @@ -25,15 +25,7 @@ validate_path_could_be_dir, ) -# Setup logging -# Create logger -logger = logging.getLogger("clp") -logger.setLevel(logging.DEBUG) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) +logger = logging.getLogger(__file__) def validate_and_load_config( @@ 
-89,7 +81,7 @@ def handle_extract_file_cmd( if clp_config is None: return -1 - container_name = generate_container_name(JobType.FILE_EXTRACTION) + container_name = generate_container_name(str(JobType.FILE_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( container_clp_config, clp_config, container_name @@ -164,7 +156,7 @@ def handle_extract_stream_cmd( if clp_config is None: return -1 - container_name = generate_container_name(JobType.IR_EXTRACTION) + container_name = generate_container_name(str(JobType.IR_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( container_clp_config, clp_config, container_name diff --git a/components/clp-package-utils/clp_package_utils/scripts/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py new file mode 100644 index 000000000..54d959771 --- /dev/null +++ b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py @@ -0,0 +1,103 @@ +import argparse +import logging +import subprocess +import sys +from pathlib import Path + +from clp_package_utils.general import ( + CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, + dump_container_config, + generate_container_config, + generate_container_name, + generate_container_start_cmd, + get_clp_home, + load_config_file, + validate_and_load_db_credentials_file, +) + +logger = logging.getLogger(__file__) + + +def main(argv): + clp_home = get_clp_home() + default_config_file_path = clp_home / CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH + + args_parser = argparse.ArgumentParser( + description="Deletes archives that fall within the specified time range." 
+ ) + args_parser.add_argument( + "--config", + "-c", + default=str(default_config_file_path), + help="CLP package configuration file.", + ) + args_parser.add_argument( + "--begin-ts", + type=int, + default=0, + help="Time-range lower-bound (inclusive) as milliseconds from the UNIX epoch.", + ) + args_parser.add_argument( + "--end-ts", + type=int, + required=True, + help="Time-range upper-bound (include) as milliseconds from the UNIX epoch.", + ) + parsed_args = args_parser.parse_args(argv[1:]) + + # Validate and load config file + try: + config_file_path = Path(parsed_args.config) + clp_config = load_config_file(config_file_path, default_config_file_path, clp_home) + clp_config.validate_logs_dir() + + # Validate and load necessary credentials + validate_and_load_db_credentials_file(clp_config, clp_home, False) + except: + logger.exception("Failed to load config.") + return -1 + + # Validate the input timestamp + begin_ts = parsed_args.begin_ts + end_ts = parsed_args.end_ts + if begin_ts > end_ts: + logger.error("begin-ts must be <= end-ts") + return -1 + if end_ts < 0 or begin_ts < 0: + logger.error("begin_ts and end_ts must be non-negative.") + return -1 + + container_name = generate_container_name("del-archives") + + container_clp_config, mounts = generate_container_config(clp_config, clp_home) + generated_config_path_on_container, generated_config_path_on_host = dump_container_config( + container_clp_config, clp_config, container_name + ) + + necessary_mounts = [mounts.clp_home, mounts.logs_dir, mounts.archives_output_dir] + container_start_cmd = generate_container_start_cmd( + container_name, necessary_mounts, clp_config.execution_container + ) + + # fmt: off + del_archive_cmd = [ + "python3", + "-m", "clp_package_utils.scripts.native.del_archives", + "--config", str(generated_config_path_on_container), + str(begin_ts), + str(end_ts) + + ] + # fmt: on + + cmd = container_start_cmd + del_archive_cmd + subprocess.run(cmd, check=True) + + # Remove generated files 
+ generated_config_path_on_host.unlink() + + return 0 + + +if "__main__" == __name__: + sys.exit(main(sys.argv)) diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py index cb495204f..b6d9bb7eb 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py @@ -23,15 +23,7 @@ load_config_file, ) -# Setup logging -# Create logger logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) def print_compression_job_status(job_row, current_time): diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py index 7cce5d92a..d16cdcb6f 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py @@ -32,15 +32,7 @@ wait_for_query_job, ) -# Setup logging -# Create logger logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) def get_orig_file_id(db_config: Database, path: str) -> Optional[str]: diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py new file mode 100644 index 
000000000..735bf299d --- /dev/null +++ b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py @@ -0,0 +1,139 @@ +import argparse +import logging +import shutil +import sys +from contextlib import closing +from pathlib import Path +from typing import List + +from clp_py_utils.clp_config import Database +from clp_py_utils.sql_adapter import SQL_Adapter + +from clp_package_utils.general import ( + CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, + get_clp_home, + load_config_file, +) + +logger = logging.getLogger(__file__) + + +def main(argv): + clp_home = get_clp_home() + default_config_file_path = clp_home / CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH + + args_parser = argparse.ArgumentParser( + description="Deletes archives that fall within the specified time range." + ) + args_parser.add_argument( + "--config", + "-c", + required=True, + default=str(default_config_file_path), + help="CLP configuration file.", + ) + args_parser.add_argument( + "begin_ts", + type=int, + help="Time-range lower-bound (inclusive) as milliseconds from the UNIX epoch.", + ) + args_parser.add_argument( + "end_ts", + type=int, + help="Time-range upper-bound (include) as milliseconds from the UNIX epoch.", + ) + parsed_args = args_parser.parse_args(argv[1:]) + + # Validate and load config file + config_file_path = Path(parsed_args.config) + try: + clp_config = load_config_file(config_file_path, default_config_file_path, clp_home) + clp_config.validate_logs_dir() + except: + logger.exception("Failed to load config.") + return -1 + + database_config = clp_config.database + archives_dir = clp_config.archive_output.directory + if not archives_dir.exists(): + logger.error("`archive_output.directory` doesn't exist.") + return -1 + + return _delete_archives( + archives_dir, + database_config, + parsed_args.begin_ts, + parsed_args.end_ts, + ) + + +def _delete_archives( + archives_dir: Path, + database_config: Database, + begin_ts: int, + end_ts: int, +) -> int: + """ + Deletes all archives 
where `begin_ts <= archive.begin_timestamp` and + `archive.end_timestamp <= end_ts` from both the metadata database and disk. + :param archives_dir: + :param database_config: + :param begin_ts: + :param end_ts: + :return: 0 on success, -1 otherwise. + """ + + archive_ids: List[str] + logger.info("Starting to delete archives from the database.") + try: + sql_adapter = SQL_Adapter(database_config) + clp_db_connection_params = database_config.get_clp_connection_params_and_type(True) + table_prefix = clp_db_connection_params["table_prefix"] + with closing(sql_adapter.create_connection(True)) as db_conn, closing( + db_conn.cursor(dictionary=True) + ) as db_cursor: + db_cursor.execute( + f""" + DELETE FROM `{table_prefix}archives` + WHERE begin_timestamp >= %s AND end_timestamp <= %s + RETURNING id + """, + (begin_ts, end_ts), + ) + results = db_cursor.fetchall() + + if 0 == len(results): + logger.info("No archives (exclusively) within the specified time range.") + return 0 + + archive_ids = [result["id"] for result in results] + db_cursor.execute( + f""" + DELETE FROM `{table_prefix}files` + WHERE archive_id in ({', '.join(['%s'] * len(archive_ids))}) + """, + archive_ids, + ) + db_conn.commit() + except Exception: + logger.exception("Failed to delete archives from the database. Aborting deletion.") + return -1 + + logger.info(f"Finished deleting archives from the database.") + + for archive_id in archive_ids: + archive_path = archives_dir / archive_id + if not archive_path.is_dir(): + logger.warning(f"Archive {archive_id} is not a directory. 
Skipping deletion.") + continue + + logger.info(f"Deleting archive {archive_id} from disk.") + shutil.rmtree(archive_path) + + logger.info(f"Finished deleting archives from disk.") + + return 0 + + +if "__main__" == __name__: + sys.exit(main(sys.argv)) diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/search.py b/components/clp-package-utils/clp_package_utils/scripts/native/search.py index 7dd247fa5..d166cf35f 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/search.py @@ -26,15 +26,7 @@ wait_for_query_job, ) -# Setup logging -# Create logger logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) def create_and_monitor_job_in_db( diff --git a/components/clp-package-utils/clp_package_utils/scripts/search.py b/components/clp-package-utils/clp_package_utils/scripts/search.py index f3f02046d..beb7fb0b0 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/search.py @@ -20,15 +20,7 @@ validate_and_load_db_credentials_file, ) -# Setup logging -# Create logger logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) def main(argv): @@ -82,7 +74,7 @@ def main(argv): logger.exception("Failed to load config.") return -1 - container_name = generate_container_name(JobType.SEARCH) + container_name = 
generate_container_name(str(JobType.SEARCH)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 6732ded0b..8097929f1 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -59,15 +59,7 @@ validate_worker_config, ) -# Setup logging -# Create logger -logger = logging.getLogger("clp") -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) +logger = logging.getLogger(__file__) def container_exists(container_name): diff --git a/components/clp-package-utils/clp_package_utils/scripts/stop_clp.py b/components/clp-package-utils/clp_package_utils/scripts/stop_clp.py index f100a098a..a55d7a795 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/stop_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/stop_clp.py @@ -31,15 +31,7 @@ validate_and_load_queue_credentials_file, ) -# Setup logging -# Create logger -logger = logging.getLogger("clp") -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) +logger = logging.getLogger(__file__) def stop_running_container(container_name: str, already_exited_containers: List[str], force: bool): diff --git 
a/components/package-template/src/sbin/admin-tools/del-archives.sh b/components/package-template/src/sbin/admin-tools/del-archives.sh new file mode 100755 index 000000000..4d7ebc6b7 --- /dev/null +++ b/components/package-template/src/sbin/admin-tools/del-archives.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +package_root="$script_dir/../.." + +PYTHONPATH=$(readlink -f "$package_root/lib/python3/site-packages") \ + python3 \ + -m clp_package_utils.scripts.del_archives \ + "$@" From 5d900544aff94dc0ab1c38205a9dbfb4acc169f6 Mon Sep 17 00:00:00 2001 From: wraymo <37269683+wraymo@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:36:34 -0500 Subject: [PATCH 38/65] feat(clp-s): Add the write path for single-file archives. (#563) Co-authored-by: Devin Gibson --- components/core/src/clp_s/ArchiveWriter.cpp | 192 +++++++++++++++--- components/core/src/clp_s/ArchiveWriter.hpp | 66 +++++- .../core/src/clp_s/CommandLineArguments.cpp | 4 + .../core/src/clp_s/CommandLineArguments.hpp | 3 + components/core/src/clp_s/JsonParser.cpp | 1 + components/core/src/clp_s/JsonParser.hpp | 1 + .../core/src/clp_s/SingleFileArchiveDefs.hpp | 59 ++++++ .../src/clp_s/TimestampDictionaryWriter.cpp | 60 ++---- .../src/clp_s/TimestampDictionaryWriter.hpp | 43 ++-- components/core/src/clp_s/TimestampEntry.cpp | 23 ++- components/core/src/clp_s/TimestampEntry.hpp | 6 +- components/core/src/clp_s/Utils.hpp | 12 ++ .../core/src/clp_s/archive_constants.hpp | 3 + components/core/src/clp_s/clp-s.cpp | 1 + 14 files changed, 359 insertions(+), 115 deletions(-) create mode 100644 components/core/src/clp_s/SingleFileArchiveDefs.hpp diff --git a/components/core/src/clp_s/ArchiveWriter.cpp b/components/core/src/clp_s/ArchiveWriter.cpp index 7118ce88b..d627479de 100644 --- a/components/core/src/clp_s/ArchiveWriter.cpp +++ b/components/core/src/clp_s/ArchiveWriter.cpp @@ -1,6 +1,8 @@ #include "ArchiveWriter.hpp" #include +#include 
+#include #include @@ -13,18 +15,23 @@ void ArchiveWriter::open(ArchiveWriterOption const& option) { m_id = boost::uuids::to_string(option.id); m_compression_level = option.compression_level; m_print_archive_stats = option.print_archive_stats; + m_single_file_archive = option.single_file_archive; m_min_table_size = option.min_table_size; - auto archive_path = boost::filesystem::path(option.archives_dir) / m_id; + m_archives_dir = option.archives_dir; + std::string working_dir_name = m_id; + if (option.single_file_archive) { + working_dir_name += constants::cTmpPostfix; + } + auto archive_path = std::filesystem::path(option.archives_dir) / working_dir_name; - boost::system::error_code boost_error_code; - bool path_exists = boost::filesystem::exists(archive_path, boost_error_code); - if (path_exists) { + std::error_code ec; + if (std::filesystem::exists(archive_path, ec)) { SPDLOG_ERROR("Archive path already exists: {}", archive_path.c_str()); throw OperationFailed(ErrorCodeUnsupported, __FILENAME__, __LINE__); } m_archive_path = archive_path.string(); - if (false == boost::filesystem::create_directory(m_archive_path)) { + if (false == std::filesystem::create_directory(m_archive_path, ec)) { throw OperationFailed(ErrorCodeErrno, __FILENAME__, __LINE__); } @@ -39,20 +46,42 @@ void ArchiveWriter::open(ArchiveWriterOption const& option) { std::string array_dict_path = m_archive_path + constants::cArchiveArrayDictFile; m_array_dict = std::make_shared(); m_array_dict->open(array_dict_path, m_compression_level, UINT64_MAX); - - std::string timestamp_dict_path = m_archive_path + constants::cArchiveTimestampDictFile; - m_timestamp_dict = std::make_shared(); - m_timestamp_dict->open(timestamp_dict_path, m_compression_level); } void ArchiveWriter::close() { - m_compressed_size += m_var_dict->close(); - m_compressed_size += m_log_dict->close(); - m_compressed_size += m_array_dict->close(); - m_compressed_size += m_timestamp_dict->close(); - m_compressed_size += 
m_schema_tree.store(m_archive_path, m_compression_level); - m_compressed_size += m_schema_map.store(m_archive_path, m_compression_level); - m_compressed_size += store_tables(); + auto var_dict_compressed_size = m_var_dict->close(); + auto log_dict_compressed_size = m_log_dict->close(); + auto array_dict_compressed_size = m_array_dict->close(); + auto schema_tree_compressed_size = m_schema_tree.store(m_archive_path, m_compression_level); + auto schema_map_compressed_size = m_schema_map.store(m_archive_path, m_compression_level); + auto [table_metadata_compressed_size, table_compressed_size] = store_tables(); + + if (m_single_file_archive) { + std::vector files{ + {constants::cArchiveSchemaTreeFile, schema_tree_compressed_size}, + {constants::cArchiveSchemaMapFile, schema_map_compressed_size}, + {constants::cArchiveTableMetadataFile, table_metadata_compressed_size}, + {constants::cArchiveVarDictFile, var_dict_compressed_size}, + {constants::cArchiveLogDictFile, log_dict_compressed_size}, + {constants::cArchiveArrayDictFile, array_dict_compressed_size}, + {constants::cArchiveTablesFile, table_compressed_size} + }; + uint64_t offset = 0; + for (auto& file : files) { + uint64_t original_size = file.o; + file.o = offset; + offset += original_size; + } + write_single_file_archive(files); + } else { + // Timestamp dictionary written separately here until we transition to moving it inside of + // the metadata region of multi-file archives. 
+ auto timestamp_dict_compressed_size = write_timestamp_dict(); + m_compressed_size = var_dict_compressed_size + log_dict_compressed_size + + array_dict_compressed_size + timestamp_dict_compressed_size + + schema_tree_compressed_size + schema_map_compressed_size + + table_metadata_compressed_size + table_compressed_size; + } if (m_metadata_db) { update_metadata_db(); @@ -65,12 +94,130 @@ void ArchiveWriter::close() { m_id_to_schema_writer.clear(); m_schema_tree.clear(); m_schema_map.clear(); + m_timestamp_dict.clear(); m_encoded_message_size = 0UL; m_uncompressed_size = 0UL; m_compressed_size = 0UL; m_next_log_event_id = 0; } +size_t ArchiveWriter::write_timestamp_dict() { + std::string timestamp_dict_path = m_archive_path + constants::cArchiveTimestampDictFile; + FileWriter timestamp_dict_file_writer; + ZstdCompressor timestamp_dict_compressor; + timestamp_dict_file_writer.open(timestamp_dict_path, FileWriter::OpenMode::CreateForWriting); + timestamp_dict_compressor.open(timestamp_dict_file_writer, m_compression_level); + std::stringstream timestamp_dict_stream; + m_timestamp_dict.write(timestamp_dict_stream); + std::string encoded_timestamp_dict = timestamp_dict_stream.str(); + timestamp_dict_compressor.write(encoded_timestamp_dict.data(), encoded_timestamp_dict.size()); + timestamp_dict_compressor.close(); + auto compressed_size = timestamp_dict_file_writer.get_pos(); + timestamp_dict_file_writer.close(); + return compressed_size; +} + +void ArchiveWriter::write_single_file_archive(std::vector const& files) { + std::string single_file_archive_path = (std::filesystem::path(m_archives_dir) / m_id).string(); + FileWriter archive_writer; + archive_writer.open(single_file_archive_path, FileWriter::OpenMode::CreateForWriting); + + write_archive_metadata(archive_writer, files); + size_t metadata_section_size = archive_writer.get_pos() - sizeof(ArchiveHeader); + write_archive_files(archive_writer, files); + m_compressed_size = archive_writer.get_pos(); + 
write_archive_header(archive_writer, metadata_section_size); + + archive_writer.close(); + std::error_code ec; + if (false == std::filesystem::remove(m_archive_path, ec)) { + throw OperationFailed(ErrorCodeFileExists, __FILENAME__, __LINE__); + } +} + +void ArchiveWriter::write_archive_metadata( + FileWriter& archive_writer, + std::vector const& files +) { + archive_writer.seek_from_begin(sizeof(ArchiveHeader)); + + ZstdCompressor compressor; + compressor.open(archive_writer, m_compression_level); + compressor.write_numeric_value(static_cast(3U)); // Number of packets + + // Write archive info + ArchiveInfoPacket archive_info{.num_segments = 1}; + std::stringstream msgpack_buffer; + msgpack::pack(msgpack_buffer, archive_info); + std::string archive_info_str = msgpack_buffer.str(); + compressor.write_numeric_value(ArchiveMetadataPacketType::ArchiveInfo); + compressor.write_numeric_value(static_cast(archive_info_str.size())); + compressor.write_string(archive_info_str); + + // Write archive file info + ArchiveFileInfoPacket archive_file_info{.files{files}}; + msgpack_buffer = std::stringstream{}; + msgpack::pack(msgpack_buffer, archive_file_info); + std::string archive_file_info_str = msgpack_buffer.str(); + compressor.write_numeric_value(ArchiveMetadataPacketType::ArchiveFileInfo); + compressor.write_numeric_value(static_cast(archive_file_info_str.size())); + compressor.write_string(archive_file_info_str); + + // Write timestamp dictionary + compressor.write_numeric_value(ArchiveMetadataPacketType::TimestampDictionary); + std::stringstream timestamp_dict_stream; + m_timestamp_dict.write(timestamp_dict_stream); + std::string encoded_timestamp_dict = timestamp_dict_stream.str(); + compressor.write_numeric_value(static_cast(encoded_timestamp_dict.size())); + compressor.write(encoded_timestamp_dict.data(), encoded_timestamp_dict.size()); + + compressor.close(); +} + +void ArchiveWriter::write_archive_files( + FileWriter& archive_writer, + std::vector const& files +) { + 
FileReader reader; + for (auto const& file : files) { + std::string file_path = m_archive_path + file.n; + reader.open(file_path); + char read_buffer[cReadBlockSize]; + while (true) { + size_t num_bytes_read{0}; + ErrorCode const error_code + = reader.try_read(read_buffer, cReadBlockSize, num_bytes_read); + if (ErrorCodeEndOfFile == error_code) { + break; + } else if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + archive_writer.write(read_buffer, num_bytes_read); + } + reader.close(); + if (false == std::filesystem::remove(file_path)) { + throw OperationFailed(ErrorCodeFileExists, __FILENAME__, __LINE__); + } + } +} + +void ArchiveWriter::write_archive_header(FileWriter& archive_writer, size_t metadata_section_size) { + ArchiveHeader header{ + .magic_number{0}, + .version + = (cArchiveMajorVersion << 24) | (cArchiveMinorVersion << 16) | cArchivePatchVersion, + .uncompressed_size = m_uncompressed_size, + .compressed_size = m_compressed_size, + .reserved_padding{0}, + .metadata_section_size = static_cast(metadata_section_size), + .compression_type = static_cast(ArchiveCompressionType::Zstd), + .padding = 0 + }; + std::memcpy(&header.magic_number, cStructuredSFAMagicNumber, sizeof(header.magic_number)); + archive_writer.seek_from_begin(0); + archive_writer.write(reinterpret_cast(&header), sizeof(header)); +} + void ArchiveWriter::append_message( int32_t schema_id, Schema const& schema, @@ -132,8 +279,7 @@ void ArchiveWriter::initialize_schema_writer(SchemaWriter* writer, Schema const& } } -size_t ArchiveWriter::store_tables() { - size_t compressed_size = 0; +std::pair ArchiveWriter::store_tables() { m_tables_file_writer.open( m_archive_path + constants::cArchiveTablesFile, FileWriter::OpenMode::CreateForWriting @@ -243,13 +389,13 @@ size_t ArchiveWriter::store_tables() { } m_table_metadata_compressor.close(); - compressed_size += m_table_metadata_file_writer.get_pos(); - compressed_size += 
m_tables_file_writer.get_pos(); + auto table_metadata_compressed_size = m_table_metadata_file_writer.get_pos(); + auto table_compressed_size = m_tables_file_writer.get_pos(); m_table_metadata_file_writer.close(); m_tables_file_writer.close(); - return compressed_size; + return {table_metadata_compressed_size, table_compressed_size}; } void ArchiveWriter::update_metadata_db() { @@ -262,8 +408,8 @@ void ArchiveWriter::update_metadata_db() { metadata.increment_static_compressed_size(m_compressed_size); metadata.increment_static_uncompressed_size(m_uncompressed_size); metadata.expand_time_range( - m_timestamp_dict->get_begin_timestamp(), - m_timestamp_dict->get_end_timestamp() + m_timestamp_dict.get_begin_timestamp(), + m_timestamp_dict.get_end_timestamp() ); m_metadata_db->add_archive(m_id, metadata); diff --git a/components/core/src/clp_s/ArchiveWriter.hpp b/components/core/src/clp_s/ArchiveWriter.hpp index 87e9d11e5..3b13f4426 100644 --- a/components/core/src/clp_s/ArchiveWriter.hpp +++ b/components/core/src/clp_s/ArchiveWriter.hpp @@ -14,6 +14,7 @@ #include "SchemaMap.hpp" #include "SchemaTree.hpp" #include "SchemaWriter.hpp" +#include "SingleFileArchiveDefs.hpp" #include "TimestampDictionaryWriter.hpp" namespace clp_s { @@ -22,6 +23,7 @@ struct ArchiveWriterOption { std::string archives_dir; int compression_level; bool print_archive_stats; + bool single_file_archive; size_t min_table_size; }; @@ -125,7 +127,7 @@ class ArchiveWriter { std::string const& timestamp, uint64_t& pattern_id ) { - return m_timestamp_dict->ingest_entry(key, node_id, timestamp, pattern_id); + return m_timestamp_dict.ingest_entry(key, node_id, timestamp, pattern_id); } /** @@ -135,21 +137,24 @@ class ArchiveWriter { * @param timestamp */ void ingest_timestamp_entry(std::string const& key, int32_t node_id, double timestamp) { - m_timestamp_dict->ingest_entry(key, node_id, timestamp); + m_timestamp_dict.ingest_entry(key, node_id, timestamp); } void ingest_timestamp_entry(std::string const& 
key, int32_t node_id, int64_t timestamp) { - m_timestamp_dict->ingest_entry(key, node_id, timestamp); + m_timestamp_dict.ingest_entry(key, node_id, timestamp); } /** - * Increments the size of the compressed data written to the archive + * Increments the size of the original (uncompressed) logs ingested into the archive. This size + * tracks the raw input size before any encoding or compression. * @param size */ void increment_uncompressed_size(size_t size) { m_uncompressed_size += size; } /** - * @return Size of the uncompressed data written to the archive + * @return The total size of the encoded (uncompressed) data written to the archive. This + * reflects the size of the data after encoding but before compression. + * TODO: Add the size of schema tree, schema map and timestamp dictionary */ size_t get_data_size(); @@ -162,10 +167,40 @@ class ArchiveWriter { void initialize_schema_writer(SchemaWriter* writer, Schema const& schema); /** - * Stores the tables - * @return Size of the compressed data in bytes + * Compresses and stores the tables. + * @return A pair containing: + * - The size of the compressed table metadata in bytes. + * - The size of the compressed tables in bytes. 
*/ - [[nodiscard]] size_t store_tables(); + [[nodiscard]] std::pair store_tables(); + + /** + * Writes the archive to a single file + * @param files + */ + void write_single_file_archive(std::vector const& files); + + /** + * Writes the metadata section of the single file archive + * @param archive_writer + * @param files + */ + void + write_archive_metadata(FileWriter& archive_writer, std::vector const& files); + + /** + * Writes the file section of the single file archive + * @param archive_writer + * @param files + */ + void write_archive_files(FileWriter& archive_writer, std::vector const& files); + + /** + * Writes the header section of the single file archive + * @param archive_writer + * @param metadata_section_size + */ + void write_archive_header(FileWriter& archive_writer, size_t metadata_section_size); /** * Updates the metadata db with the archive's metadata (id, size, timestamp ranges, etc.) @@ -177,6 +212,17 @@ class ArchiveWriter { */ void print_archive_stats(); + /** + * Write the timestamp dictionary as a dedicated file for multi-file archives. + * + * Note: the timestamp dictionary will be moved into the metadata region of multi-file archives + * in a follow-up PR. 
+ * @return the compressed size of the Timestamp Dictionary in bytes + */ + size_t write_timestamp_dict(); + + static constexpr size_t cReadBlockSize = 4 * 1024; + size_t m_encoded_message_size{}; size_t m_uncompressed_size{}; size_t m_compressed_size{}; @@ -184,16 +230,18 @@ class ArchiveWriter { std::string m_id; + std::string m_archives_dir; std::string m_archive_path; std::string m_encoded_messages_dir; std::shared_ptr m_var_dict; std::shared_ptr m_log_dict; std::shared_ptr m_array_dict; // log type dictionary for arrays - std::shared_ptr m_timestamp_dict; + TimestampDictionaryWriter m_timestamp_dict; std::shared_ptr m_metadata_db; int m_compression_level{}; bool m_print_archive_stats{}; + bool m_single_file_archive{}; size_t m_min_table_size{}; SchemaMap m_schema_map; diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index d174b4a23..99539b627 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -190,6 +190,10 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { "print-archive-stats", po::bool_switch(&m_print_archive_stats), "Print statistics (json) about the archive after it's compressed." + )( + "single-file-archive", + po::bool_switch(&m_single_file_archive), + "Create a single archive file instead of multiple files." 
)( "structurize-arrays", po::bool_switch(&m_structurize_arrays), diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp index 913e27fbc..a87e9b6bd 100644 --- a/components/core/src/clp_s/CommandLineArguments.hpp +++ b/components/core/src/clp_s/CommandLineArguments.hpp @@ -102,6 +102,8 @@ class CommandLineArguments { OutputHandlerType get_output_handler_type() const { return m_output_handler_type; } + bool get_single_file_archive() const { return m_single_file_archive; } + bool get_structurize_arrays() const { return m_structurize_arrays; } bool get_ordered_decompression() const { return m_ordered_decompression; } @@ -176,6 +178,7 @@ class CommandLineArguments { size_t m_target_encoded_size{8ULL * 1024 * 1024 * 1024}; // 8 GiB bool m_print_archive_stats{false}; size_t m_max_document_size{512ULL * 1024 * 1024}; // 512 MB + bool m_single_file_archive{false}; bool m_structurize_arrays{false}; bool m_ordered_decompression{false}; size_t m_target_ordered_chunk_size{}; diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 9e8293510..d14a221b3 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -37,6 +37,7 @@ JsonParser::JsonParser(JsonParserOption const& option) m_archive_options.archives_dir = option.archives_dir; m_archive_options.compression_level = option.compression_level; m_archive_options.print_archive_stats = option.print_archive_stats; + m_archive_options.single_file_archive = option.single_file_archive; m_archive_options.min_table_size = option.min_table_size; m_archive_options.id = m_generator(); diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index d7cc5a2fe..bfd423c22 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -38,6 +38,7 @@ struct JsonParserOption { bool print_archive_stats{}; bool 
structurize_arrays{}; bool record_log_order{true}; + bool single_file_archive{false}; std::shared_ptr metadata_db; }; diff --git a/components/core/src/clp_s/SingleFileArchiveDefs.hpp b/components/core/src/clp_s/SingleFileArchiveDefs.hpp new file mode 100644 index 000000000..7eabeb6db --- /dev/null +++ b/components/core/src/clp_s/SingleFileArchiveDefs.hpp @@ -0,0 +1,59 @@ +#ifndef CLP_S_ARCHIVEDEFS_HPP +#define CLP_S_ARCHIVEDEFS_HPP + +#include + +#include "msgpack.hpp" + +namespace clp_s { +// define the version +constexpr uint8_t cArchiveMajorVersion = 0; +constexpr uint8_t cArchiveMinorVersion = 2; +constexpr uint16_t cArchivePatchVersion = 0; + +// define the magic number +constexpr uint8_t cStructuredSFAMagicNumber[] = {0xFD, 0x2F, 0xC5, 0x30}; + +struct ArchiveHeader { + uint8_t magic_number[4]; + uint32_t version; + uint64_t uncompressed_size; + uint64_t compressed_size; + uint64_t reserved_padding[4]; + uint32_t metadata_section_size; + uint16_t compression_type; + uint16_t padding; +}; + +enum class ArchiveCompressionType : uint16_t { + Zstd = 0, +}; + +enum struct ArchiveMetadataPacketType : uint8_t { + ArchiveInfo = 0, + ArchiveFileInfo = 1, + TimestampDictionary = 2, +}; + +struct ArchiveInfoPacket { + uint64_t num_segments; + // TODO: Add more fields in the future + + MSGPACK_DEFINE_MAP(num_segments); +}; + +struct ArchiveFileInfo { + std::string n; // name + uint64_t o; // offset + + MSGPACK_DEFINE_MAP(n, o); +}; + +struct ArchiveFileInfoPacket { + std::vector files; + + MSGPACK_DEFINE_MAP(files); +}; +} // namespace clp_s + +#endif // CLP_S_ARCHIVEDEFS_HPP diff --git a/components/core/src/clp_s/TimestampDictionaryWriter.cpp b/components/core/src/clp_s/TimestampDictionaryWriter.cpp index 7b02fd3a5..39e66a6af 100644 --- a/components/core/src/clp_s/TimestampDictionaryWriter.cpp +++ b/components/core/src/clp_s/TimestampDictionaryWriter.cpp @@ -1,63 +1,34 @@ #include "TimestampDictionaryWriter.hpp" +#include + #include "Utils.hpp" namespace clp_s { void 
TimestampDictionaryWriter::write_timestamp_entries( std::map const& ranges, - ZstdCompressor& compressor + std::stringstream& stream ) { - compressor.write_numeric_value(ranges.size()); + write_numeric_value(stream, ranges.size()); for (auto const& range : ranges) { - range.second.write_to_file(compressor); + range.second.write_to_stream(stream); } } -void TimestampDictionaryWriter::write_and_flush_to_disk() { - write_timestamp_entries(m_column_key_to_range, m_dictionary_compressor); +void TimestampDictionaryWriter::write(std::stringstream& stream) { + merge_range(); + write_timestamp_entries(m_column_key_to_range, stream); - m_dictionary_compressor.write_numeric_value(m_pattern_to_id.size()); + write_numeric_value(stream, m_pattern_to_id.size()); for (auto& it : m_pattern_to_id) { // write pattern ID - m_dictionary_compressor.write_numeric_value(it.second); + write_numeric_value(stream, it.second); std::string const& pattern = it.first->get_format(); - m_dictionary_compressor.write_numeric_value(pattern.length()); - m_dictionary_compressor.write_string(pattern); - } - - m_dictionary_compressor.flush(); - m_dictionary_file_writer.flush(); -} - -void TimestampDictionaryWriter::open(std::string const& dictionary_path, int compression_level) { - if (m_is_open) { - throw OperationFailed(ErrorCodeNotReady, __FILENAME__, __LINE__); - } - - m_dictionary_file_writer.open(dictionary_path, FileWriter::OpenMode::CreateForWriting); - m_dictionary_compressor.open(m_dictionary_file_writer, compression_level); - - m_next_id = 0; - m_is_open = true; -} - -size_t TimestampDictionaryWriter::close() { - if (false == m_is_open) { - throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__); + write_numeric_value(stream, pattern.length()); + stream.write(pattern.data(), pattern.size()); } - - // merge before writing overall archive because this - // happens before the last sub-archive is written - merge_range(); - write_and_flush_to_disk(); - m_dictionary_compressor.close(); - 
size_t compressed_size = m_dictionary_file_writer.get_pos(); - m_dictionary_file_writer.close(); - - m_is_open = false; - return compressed_size; } uint64_t TimestampDictionaryWriter::get_pattern_id(TimestampPattern const* pattern) { @@ -180,4 +151,11 @@ epochtime_t TimestampDictionaryWriter::get_end_timestamp() const { return it->second.get_end_timestamp(); } + +void TimestampDictionaryWriter::clear() { + m_next_id = 0; + m_pattern_to_id.clear(); + m_column_key_to_range.clear(); + m_column_id_to_range.clear(); +} } // namespace clp_s diff --git a/components/core/src/clp_s/TimestampDictionaryWriter.hpp b/components/core/src/clp_s/TimestampDictionaryWriter.hpp index 81266b187..29288fd48 100644 --- a/components/core/src/clp_s/TimestampDictionaryWriter.hpp +++ b/components/core/src/clp_s/TimestampDictionaryWriter.hpp @@ -1,15 +1,15 @@ #ifndef CLP_S_TIMESTAMPDICTIONARYWRITER_HPP #define CLP_S_TIMESTAMPDICTIONARYWRITER_HPP +#include +#include #include #include #include -#include "FileWriter.hpp" #include "SchemaTree.hpp" #include "TimestampEntry.hpp" #include "TimestampPattern.hpp" -#include "ZstdCompressor.hpp" namespace clp_s { class TimestampDictionaryWriter { @@ -23,25 +23,13 @@ class TimestampDictionaryWriter { }; // Constructors - TimestampDictionaryWriter() : m_is_open(false) {} + TimestampDictionaryWriter() {} /** - * Opens the timestamp dictionary for writing - * @param dictionary_path - * @param compression_level + * Writes the timestamp dictionary to a buffered stream. 
+ * @param stream */ - void open(std::string const& dictionary_path, int compression_level); - - /** - * Closes the timestamp dictionary - * @return the compressed size of the global timestamp dictionary in bytes - */ - [[nodiscard]] size_t close(); - - /** - * Writes the timestamp dictionary to disk - */ - void write_and_flush_to_disk(); + void write(std::stringstream& stream); /** * Gets the pattern id for a given pattern @@ -91,33 +79,30 @@ class TimestampDictionaryWriter { */ epochtime_t get_end_timestamp() const; + /** + * Clears and resets all internal state. + */ + void clear(); + private: /** - * Merges timestamp ranges with the same key name + * Merges timestamp ranges with the same key name but different node ids. */ void merge_range(); /** - * Writes timestamp entries to the disk + * Writes timestamp entries to a buffered stream. * @param ranges * @param compressor */ static void write_timestamp_entries( std::map const& ranges, - ZstdCompressor& compressor + std::stringstream& stream ); using pattern_to_id_t = std::unordered_map; // Variables - bool m_is_open; - - // Variables related to on-disk storage - FileWriter m_dictionary_file_writer; - ZstdCompressor m_dictionary_compressor; - FileWriter m_dictionary_file_writer_local; - ZstdCompressor m_dictionary_compressor_local; - pattern_to_id_t m_pattern_to_id; uint64_t m_next_id{}; diff --git a/components/core/src/clp_s/TimestampEntry.cpp b/components/core/src/clp_s/TimestampEntry.cpp index 54b27d22e..19d422066 100644 --- a/components/core/src/clp_s/TimestampEntry.cpp +++ b/components/core/src/clp_s/TimestampEntry.cpp @@ -1,6 +1,9 @@ #include "TimestampEntry.hpp" #include +#include + +#include "Utils.hpp" namespace clp_s { void TimestampEntry::ingest_timestamp(epochtime_t timestamp) { @@ -54,21 +57,21 @@ void TimestampEntry::merge_range(TimestampEntry const& entry) { } } -void TimestampEntry::write_to_file(ZstdCompressor& compressor) const { - compressor.write_numeric_value(m_key_name.size()); - 
compressor.write_string(m_key_name); - compressor.write_numeric_value(m_column_ids.size()); +void TimestampEntry::write_to_stream(std::stringstream& stream) const { + write_numeric_value(stream, m_key_name.size()); + stream.write(m_key_name.data(), m_key_name.size()); + write_numeric_value(stream, m_column_ids.size()); for (auto const& id : m_column_ids) { - compressor.write_numeric_value(id); + write_numeric_value(stream, id); } - compressor.write_numeric_value(m_encoding); + write_numeric_value(stream, m_encoding); if (m_encoding == Epoch) { - compressor.write_numeric_value(m_epoch_start); - compressor.write_numeric_value(m_epoch_end); + write_numeric_value(stream, m_epoch_start); + write_numeric_value(stream, m_epoch_end); } else if (m_encoding == DoubleEpoch) { - compressor.write_numeric_value(m_epoch_start_double); - compressor.write_numeric_value(m_epoch_end_double); + write_numeric_value(stream, m_epoch_start_double); + write_numeric_value(stream, m_epoch_end_double); } } diff --git a/components/core/src/clp_s/TimestampEntry.hpp b/components/core/src/clp_s/TimestampEntry.hpp index ad40b4b89..326ed9d73 100644 --- a/components/core/src/clp_s/TimestampEntry.hpp +++ b/components/core/src/clp_s/TimestampEntry.hpp @@ -1,6 +1,7 @@ #ifndef CLP_S_TIMESTAMPENTRY_HPP #define CLP_S_TIMESTAMPENTRY_HPP +#include #include #include #include @@ -9,7 +10,6 @@ #include "ErrorCode.hpp" #include "search/FilterOperation.hpp" #include "Utils.hpp" -#include "ZstdCompressor.hpp" #include "ZstdDecompressor.hpp" using clp_s::search::FilterOperation; @@ -66,10 +66,10 @@ class TimestampEntry { void merge_range(TimestampEntry const& entry); /** - * Write the timestamp entry to a file + * Write the timestamp entry to a buffered stream. 
* @param compressor */ - void write_to_file(ZstdCompressor& compressor) const; + void write_to_stream(std::stringstream& stream) const; /** * Try to read the timestamp entry from a file diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index d6deb3280..553f7e608 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -254,6 +255,17 @@ inline T2 bit_cast(T1 t1) { return t2; } +/** + * Writes a numeric value to a stringstream. + * @param stream + * @param value + * @tparam ValueType + */ +template +void write_numeric_value(std::stringstream& stream, ValueType value) { + stream.write(reinterpret_cast(&value), sizeof(value)); +} + /** * A span of memory where the underlying memory may not be aligned correctly for type T. * diff --git a/components/core/src/clp_s/archive_constants.hpp b/components/core/src/clp_s/archive_constants.hpp index 604c97f66..b76af2944 100644 --- a/components/core/src/clp_s/archive_constants.hpp +++ b/components/core/src/clp_s/archive_constants.hpp @@ -4,6 +4,9 @@ #include namespace clp_s::constants { +// Single file archive +constexpr char cTmpPostfix[] = ".tmp"; + // Schema files constexpr char cArchiveSchemaMapFile[] = "/schema_ids"; constexpr char cArchiveSchemaTreeFile[] = "/schema_tree"; diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index a74693e33..b76683caf 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -95,6 +95,7 @@ bool compress(CommandLineArguments const& command_line_arguments) { option.compression_level = command_line_arguments.get_compression_level(); option.timestamp_key = command_line_arguments.get_timestamp_key(); option.print_archive_stats = command_line_arguments.print_archive_stats(); + option.single_file_archive = command_line_arguments.get_single_file_archive(); option.structurize_arrays = 
command_line_arguments.get_structurize_arrays(); option.record_log_order = command_line_arguments.get_record_log_order(); From 2b88c6fcf397ac3729303c1eff5ca0772a955e1e Mon Sep 17 00:00:00 2001 From: "Xiaochong(Eddy) Wei" <40865608+anlowee@users.noreply.github.com> Date: Thu, 28 Nov 2024 11:52:30 -0500 Subject: [PATCH 39/65] test: Allow multiple trials when unittesting http headers (#613) Co-authored-by: Xiaochong Wei --- components/core/tests/test-NetworkReader.cpp | 38 ++++++++++++-------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/components/core/tests/test-NetworkReader.cpp b/components/core/tests/test-NetworkReader.cpp index f2995f141..552775ea8 100644 --- a/components/core/tests/test-NetworkReader.cpp +++ b/components/core/tests/test-NetworkReader.cpp @@ -196,26 +196,36 @@ TEST_CASE("network_reader_with_valid_http_header_kv_pairs", "[NetworkReader]") { std::unordered_map valid_http_header_kv_pairs; // We use httpbin (https://httpbin.org/) to test the user-specified headers. On success, it is // supposed to respond all the user-specified headers as key-value pairs in JSON form. - constexpr int cNumHttpHeaderKeyValuePairs{10}; + constexpr size_t cNumHttpHeaderKeyValuePairs{10}; for (size_t i{0}; i < cNumHttpHeaderKeyValuePairs; ++i) { valid_http_header_kv_pairs.emplace( fmt::format("Unit-Test-Key{}", i), fmt::format("Unit-Test-Value{}", i) ); } - clp::NetworkReader reader{ - "https://httpbin.org/headers", - 0, - false, - clp::CurlDownloadHandler::cDefaultOverallTimeout, - clp::CurlDownloadHandler::cDefaultConnectionTimeout, - clp::NetworkReader::cDefaultBufferPoolSize, - clp::NetworkReader::cDefaultBufferSize, - valid_http_header_kv_pairs - }; - auto const content{get_content(reader)}; - REQUIRE(assert_curl_error_code(CURLE_OK, reader)); - auto const parsed_content = nlohmann::json::parse(content); + std::optional> optional_content; + // Retry the unit test a limited number of times to handle transient server-side HTTP errors. 
+ // This ensures the test is not marked as failed due to temporary issues beyond our control. + constexpr size_t cNumMaxTrials{10}; + for (size_t i{0}; i < cNumMaxTrials; ++i) { + clp::NetworkReader reader{ + "https://httpbin.org/headers", + 0, + false, + clp::CurlDownloadHandler::cDefaultOverallTimeout, + clp::CurlDownloadHandler::cDefaultConnectionTimeout, + clp::NetworkReader::cDefaultBufferPoolSize, + clp::NetworkReader::cDefaultBufferSize, + valid_http_header_kv_pairs + }; + auto const content = get_content(reader); + if (assert_curl_error_code(CURLE_OK, reader)) { + optional_content.emplace(content); + break; + } + } + REQUIRE(optional_content.has_value()); + auto const parsed_content = nlohmann::json::parse(optional_content.value()); auto const& headers{parsed_content.at("headers")}; for (auto const& [key, value] : valid_http_header_kv_pairs) { REQUIRE((value == headers.at(key).get())); From 290ede3cb30b4bfe4e813212cf78c2de85e10db4 Mon Sep 17 00:00:00 2001 From: Junhao Liao Date: Fri, 29 Nov 2024 03:54:23 -0500 Subject: [PATCH 40/65] chore(log-viewer-webui): Update `yscope-log-viewer` to the latest version. 
(#615) --- components/log-viewer-webui/yscope-log-viewer | 2 +- deps-tasks.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/log-viewer-webui/yscope-log-viewer b/components/log-viewer-webui/yscope-log-viewer index 4c69bc11d..969ff35b2 160000 --- a/components/log-viewer-webui/yscope-log-viewer +++ b/components/log-viewer-webui/yscope-log-viewer @@ -1 +1 @@ -Subproject commit 4c69bc11dbe8a5d87b5fbfb0e43a2f2a06f04866 +Subproject commit 969ff35b2387bcdc3580b441907e3656640ce16d diff --git a/deps-tasks.yml b/deps-tasks.yml index 64a218a47..3c60af001 100644 --- a/deps-tasks.yml +++ b/deps-tasks.yml @@ -421,8 +421,8 @@ tasks: vars: DEST: "{{.DEST}}" FLAGS: "--extract" - SRC_NAME: "yscope-log-viewer-4c69bc11dbe8a5d87b5fbfb0e43a2f2a06f04866" - SRC_URL: "https://github.com/y-scope/yscope-log-viewer/archive/4c69bc1.zip" + SRC_NAME: "yscope-log-viewer-969ff35b2387bcdc3580b441907e3656640ce16d" + SRC_URL: "https://github.com/y-scope/yscope-log-viewer/archive/969ff35.zip" # This command must be last - task: ":utils:compute-checksum" vars: From 2c0e053c938ca70cedd378bedd9b73f6b613ca2b Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Mon, 2 Dec 2024 00:48:07 -0500 Subject: [PATCH 41/65] test(clp-s): Add end-to-end test case for compression and extraction. 
(#595) --- components/core/CMakeLists.txt | 54 +++++- .../core/tests/test-clp_s-end_to_end.cpp | 158 ++++++++++++++++++ .../test_no_floats_sorted.jsonl | 4 + .../install-prebuilt-packages.sh | 2 + .../ubuntu-focal/install-prebuilt-packages.sh | 1 + .../ubuntu-jammy/install-prebuilt-packages.sh | 1 + 6 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 components/core/tests/test-clp_s-end_to_end.cpp create mode 100644 components/core/tests/test_log_files/test_no_floats_sorted.jsonl diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 160f6766d..9e14498b0 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -259,6 +259,42 @@ add_subdirectory(src/clp_s) add_subdirectory(src/reducer) set(SOURCE_FILES_clp_s_unitTest + src/clp_s/ArchiveReader.cpp + src/clp_s/ArchiveReader.hpp + src/clp_s/ArchiveWriter.cpp + src/clp_s/ArchiveWriter.hpp + src/clp_s/ColumnReader.cpp + src/clp_s/ColumnReader.hpp + src/clp_s/ColumnWriter.cpp + src/clp_s/ColumnWriter.hpp + src/clp_s/DictionaryEntry.cpp + src/clp_s/DictionaryEntry.hpp + src/clp_s/DictionaryWriter.cpp + src/clp_s/DictionaryWriter.hpp + src/clp_s/FileReader.cpp + src/clp_s/FileReader.hpp + src/clp_s/FileWriter.cpp + src/clp_s/FileWriter.hpp + src/clp_s/JsonConstructor.cpp + src/clp_s/JsonConstructor.hpp + src/clp_s/JsonFileIterator.cpp + src/clp_s/JsonFileIterator.hpp + src/clp_s/JsonParser.cpp + src/clp_s/JsonParser.hpp + src/clp_s/PackedStreamReader.cpp + src/clp_s/PackedStreamReader.hpp + src/clp_s/ReaderUtils.cpp + src/clp_s/ReaderUtils.hpp + src/clp_s/Schema.cpp + src/clp_s/Schema.hpp + src/clp_s/SchemaMap.cpp + src/clp_s/SchemaMap.hpp + src/clp_s/SchemaReader.cpp + src/clp_s/SchemaReader.hpp + src/clp_s/SchemaTree.cpp + src/clp_s/SchemaTree.hpp + src/clp_s/SchemaWriter.cpp + src/clp_s/SchemaWriter.hpp src/clp_s/search/AndExpr.cpp src/clp_s/search/AndExpr.hpp src/clp_s/search/BooleanLiteral.cpp @@ -291,11 +327,24 @@ set(SOURCE_FILES_clp_s_unitTest 
src/clp_s/search/StringLiteral.hpp src/clp_s/search/Transformation.hpp src/clp_s/search/Value.hpp - src/clp_s/SchemaTree.hpp + src/clp_s/TimestampDictionaryReader.cpp + src/clp_s/TimestampDictionaryReader.hpp + src/clp_s/TimestampDictionaryWriter.cpp + src/clp_s/TimestampDictionaryWriter.hpp + src/clp_s/TimestampEntry.cpp + src/clp_s/TimestampEntry.hpp src/clp_s/TimestampPattern.cpp src/clp_s/TimestampPattern.hpp src/clp_s/Utils.cpp src/clp_s/Utils.hpp + src/clp_s/VariableDecoder.cpp + src/clp_s/VariableDecoder.hpp + src/clp_s/VariableEncoder.cpp + src/clp_s/VariableEncoder.hpp + src/clp_s/ZstdCompressor.cpp + src/clp_s/ZstdCompressor.hpp + src/clp_s/ZstdDecompressor.cpp + src/clp_s/ZstdDecompressor.hpp ) set(SOURCE_FILES_unitTest @@ -520,6 +569,7 @@ set(SOURCE_FILES_unitTest tests/LogSuppressor.hpp tests/test-Array.cpp tests/test-BufferedFileReader.cpp + tests/test-clp_s-end_to_end.cpp tests/test-EncodedVariableInterpreter.cpp tests/test-encoding_methods.cpp tests/test-ffi_IrUnitHandlerInterface.cpp @@ -563,6 +613,8 @@ target_link_libraries(unitTest log_surgeon::log_surgeon LibArchive::LibArchive MariaDBClient::MariaDBClient + ${MONGOCXX_TARGET} + simdjson spdlog::spdlog OpenSSL::Crypto ${sqlite_LIBRARY_DEPENDENCIES} diff --git a/components/core/tests/test-clp_s-end_to_end.cpp b/components/core/tests/test-clp_s-end_to_end.cpp new file mode 100644 index 000000000..3f138b472 --- /dev/null +++ b/components/core/tests/test-clp_s-end_to_end.cpp @@ -0,0 +1,158 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include "../src/clp_s/JsonConstructor.hpp" +#include "../src/clp_s/JsonParser.hpp" + +constexpr std::string_view cTestEndToEndArchiveDirectory{"test-end-to-end-archive"}; +constexpr std::string_view cTestEndToEndOutputDirectory{"test-end-to-end-out"}; +constexpr std::string_view cTestEndToEndOutputSortedJson{"test-end-to-end_sorted.jsonl"}; +constexpr std::string_view cTestEndToEndInputFileDirectory{"test_log_files"}; 
+constexpr std::string_view cTestEndToEndInputFile{"test_no_floats_sorted.jsonl"}; + +namespace { +/** + * A class that deletes the directories and files created by test cases, both before and after each + * test case where the class is instantiated. + */ +class TestOutputCleaner { +public: + TestOutputCleaner() { delete_files(); } + + ~TestOutputCleaner() { delete_files(); } + + // Delete copy & move constructors and assignment operators + TestOutputCleaner(TestOutputCleaner const&) = delete; + TestOutputCleaner(TestOutputCleaner&&) = delete; + auto operator=(TestOutputCleaner const&) -> TestOutputCleaner& = delete; + auto operator=(TestOutputCleaner&&) -> TestOutputCleaner& = delete; + +private: + static void delete_files() { + std::filesystem::remove_all(cTestEndToEndArchiveDirectory); + std::filesystem::remove_all(cTestEndToEndOutputDirectory); + std::filesystem::remove(cTestEndToEndOutputSortedJson); + } +}; + +auto get_test_input_path_relative_to_tests_dir() -> std::filesystem::path; +auto get_test_input_local_path() -> std::string; +void compress(bool structurize_arrays); +auto extract() -> std::filesystem::path; +void compare(std::filesystem::path const& extracted_json_path); + +auto get_test_input_path_relative_to_tests_dir() -> std::filesystem::path { + return std::filesystem::path{cTestEndToEndInputFileDirectory} / cTestEndToEndInputFile; +} + +auto get_test_input_local_path() -> std::string { + std::filesystem::path const current_file_path{__FILE__}; + auto const tests_dir{current_file_path.parent_path()}; + return (tests_dir / get_test_input_path_relative_to_tests_dir()).string(); +} + +void compress(bool structurize_arrays) { + constexpr auto cDefaultTargetEncodedSize = 8ULL * 1024 * 1024 * 1024; // 8 GiB + constexpr auto cDefaultMaxDocumentSize = 512ULL * 1024 * 1024; // 512 MiB + constexpr auto cDefaultMinTableSize = 1ULL * 1024 * 1024; // 1 MiB + constexpr auto cDefaultCompressionLevel = 3; + constexpr auto cDefaultPrintArchiveStats = false; + + 
std::filesystem::create_directory(cTestEndToEndArchiveDirectory); + REQUIRE((std::filesystem::is_directory(cTestEndToEndArchiveDirectory))); + + clp_s::JsonParserOption parser_option{}; + parser_option.file_paths.push_back(get_test_input_local_path()); + parser_option.archives_dir = cTestEndToEndArchiveDirectory; + parser_option.target_encoded_size = cDefaultTargetEncodedSize; + parser_option.max_document_size = cDefaultMaxDocumentSize; + parser_option.min_table_size = cDefaultMinTableSize; + parser_option.compression_level = cDefaultCompressionLevel; + parser_option.print_archive_stats = cDefaultPrintArchiveStats; + parser_option.structurize_arrays = structurize_arrays; + + clp_s::JsonParser parser{parser_option}; + REQUIRE(parser.parse()); + parser.store(); + + REQUIRE((false == std::filesystem::is_empty(cTestEndToEndArchiveDirectory))); +} + +auto extract() -> std::filesystem::path { + constexpr auto cDefaultOrdered = false; + constexpr auto cDefaultTargetOrderedChunkSize = 0; + + std::filesystem::create_directory(cTestEndToEndOutputDirectory); + REQUIRE(std::filesystem::is_directory(cTestEndToEndOutputDirectory)); + + clp_s::JsonConstructorOption constructor_option{}; + constructor_option.archives_dir = cTestEndToEndArchiveDirectory; + constructor_option.output_dir = cTestEndToEndOutputDirectory; + constructor_option.ordered = cDefaultOrdered; + constructor_option.target_ordered_chunk_size = cDefaultTargetOrderedChunkSize; + for (auto const& entry : std::filesystem::directory_iterator(constructor_option.archives_dir)) { + if (false == entry.is_directory()) { + // Skip non-directories + continue; + } + + constructor_option.archive_id = entry.path().filename(); + clp_s::JsonConstructor constructor{constructor_option}; + constructor.store(); + } + std::filesystem::path extracted_json_path{cTestEndToEndOutputDirectory}; + extracted_json_path /= "original"; + REQUIRE(std::filesystem::exists(extracted_json_path)); + + return extracted_json_path; +} + +// Silence the 
checks below since our use of `std::system` is safe in the context of testing. +// NOLINTBEGIN(cert-env33-c,concurrency-mt-unsafe) +void compare(std::filesystem::path const& extracted_json_path) { + int result{std::system("command -v jq >/dev/null 2>&1")}; + REQUIRE((0 == result)); + auto command = fmt::format( + "jq --sort-keys --compact-output '.' {} | sort > {}", + extracted_json_path.string(), + cTestEndToEndOutputSortedJson + ); + result = std::system(command.c_str()); + REQUIRE((0 == result)); + + REQUIRE((false == std::filesystem::is_empty(cTestEndToEndOutputSortedJson))); + + result = std::system("command -v diff >/dev/null 2>&1"); + REQUIRE((0 == result)); + command = fmt::format( + "diff --unified {} {} > /dev/null", + cTestEndToEndOutputSortedJson, + get_test_input_local_path() + ); + result = std::system(command.c_str()); + REQUIRE((true == WIFEXITED(result))); + REQUIRE((0 == WEXITSTATUS(result))); +} + +// NOLINTEND(cert-env33-c,concurrency-mt-unsafe) +} // namespace + +TEST_CASE("clp-s-compress-extract-no-floats", "[clp-s][end-to-end]") { + auto structurize_arrays = GENERATE(true, false); + + TestOutputCleaner const test_cleanup; + + compress(structurize_arrays); + + auto extracted_json_path = extract(); + + compare(extracted_json_path); +} diff --git a/components/core/tests/test_log_files/test_no_floats_sorted.jsonl b/components/core/tests/test_log_files/test_no_floats_sorted.jsonl new file mode 100644 index 000000000..8dfcd85f6 --- /dev/null +++ b/components/core/tests/test_log_files/test_no_floats_sorted.jsonl @@ -0,0 +1,4 @@ +{"clp_string":"uid=0, CPU usage:99.99%, \"user_name\"=YScope","empty_array":[],"empty_object":{},"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"null":null,"string":"short_string","true":true} +{"clp_string":"uid=0, CPU usage:99.99%, 
\"user_name\"=YScope","empty_array":[],"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"nonempty_object":{"clp_string":"uid=0, CPU usage:99.99%, \"user_name\"=YScope","empty_array":[],"empty_object":{},"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"null":null,"string":"short_string","true":true},"null":null,"string":"short_string","true":true} +{"clp_string":"uid=0, CPU usage:99.99%, \"user_name\"=YScope","empty_array":[],"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"nonempty_object":{"clp_string":"uid=0, CPU usage:99.99%, \"user_name\"=YScope","empty_array":[],"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"non_empty_object2":{"clp_string":"uid=0, CPU usage:99.99%, \"user_name\"=YScope","empty_array":[],"empty_object":{},"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"null":null,"string":"short_string","true":true},"null":null,"string":"short_string","true":true},"null":null,"string":"short_string","true":true} +{"clp_string":"uid=0, CPU usage:99.99%, 
\"user_name\"=YScope","empty_object":{},"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"nonempty_array":[1,2,3,4,5],"null":null,"string":"short_string","true":true} diff --git a/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh index eede5e004..c51a521c1 100755 --- a/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh @@ -8,9 +8,11 @@ set -u dnf install -y \ cmake \ + diffutils \ gcc-c++ \ git \ java-11-openjdk \ + jq \ libarchive-devel \ libcurl-devel \ libzstd-devel \ diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh index b373cbe4d..3ea3b3ed5 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh @@ -17,6 +17,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ gcc \ gcc-10 \ git \ + jq \ libcurl4 \ libcurl4-openssl-dev \ liblzma-dev \ diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh index e2e17283b..ca1f5f59e 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh @@ -14,6 +14,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ curl \ 
build-essential \ git \ + jq \ libboost-filesystem-dev \ libboost-iostreams-dev \ libboost-program-options-dev \ From cbf8bf9224490bb46b70f97ff5218f773dd2c8ba Mon Sep 17 00:00:00 2001 From: Devin Gibson Date: Mon, 2 Dec 2024 15:16:04 -0500 Subject: [PATCH 42/65] docs(clp-json): Update list of characters that requires escaping in queries. (#617) Co-authored-by: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../reference-json-search-syntax.md | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/docs/src/user-guide/reference-json-search-syntax.md b/docs/src/user-guide/reference-json-search-syntax.md index ca6898984..18d0e4267 100644 --- a/docs/src/user-guide/reference-json-search-syntax.md +++ b/docs/src/user-guide/reference-json-search-syntax.md @@ -33,15 +33,18 @@ To search for a key or value with multiple words, you must quote the key/value w "multi-word key": "multi-word value" ``` -Queries for keys or values with the following literal characters must escape the characters using a -`\` (backslash): `\`, `(`, `)`, `:`, `<`, `>`, `"`, `*`, `{`, `}`. - :::{caution} Currently, a query that contains spaces is interpreted as a substring search, i.e., it will match log events that contain the value as a *substring*. In a future version of CLP, these queries will be interpreted as _exact_ searches unless they include [wildcards](#wildcards-in-values). ::: +:::{note} +Certain characters have special meanings when used in keys or values, so to search for the +characters literally, you must escape them. For a list of such characters, see +[Escaping special characters](#escaping-special-characters). +::: + ### Querying nested kv-pairs If the kv-pair is nested in one or more objects, you can specify the key in one of two ways: @@ -161,6 +164,33 @@ There are three supported boolean operators: You can use parentheses (`()`) to apply an operator to a group of expressions. 
+### Escaping special characters + +Keys containing the following literal characters must escape the characters using a `\` (backslash): + +* `\` +* `"` +* `.` + +Values containing the following literal characters must escape the characters using a `\` +(backslash): + +* `\` +* `"` +* `?` +* `*` + +_Unquoted_ keys or values containing the following literal characters must also escape the +characters using a `\` (backslash): + +* `(` +* `)` +* `:` +* `<` +* `>` +* `{` +* `}` + ## Examples **Search for log events that contain a specific key-value pair:** From 44b0f2b8ce3feb3be1757bea8b4c71ac87434fcb Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Mon, 2 Dec 2024 16:00:18 -0500 Subject: [PATCH 43/65] feat(core): Add `ErrorCode` template to standardize conversion of user-defined error code enums to `std::error_code`. (#486) --- components/core/CMakeLists.txt | 2 + .../core/src/clp/error_handling/ErrorCode.hpp | 150 ++++++++++++++++++ components/core/tests/test-error_handling.cpp | 141 ++++++++++++++++ 3 files changed, 293 insertions(+) create mode 100644 components/core/src/clp/error_handling/ErrorCode.hpp create mode 100644 components/core/tests/test-error_handling.cpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 9e14498b0..f974e5c7e 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -386,6 +386,7 @@ set(SOURCE_FILES_unitTest src/clp/DictionaryEntry.hpp src/clp/DictionaryReader.hpp src/clp/DictionaryWriter.hpp + src/clp/error_handling/ErrorCode.hpp src/clp/EncodedVariableInterpreter.cpp src/clp/EncodedVariableInterpreter.hpp src/clp/ErrorCode.hpp @@ -572,6 +573,7 @@ set(SOURCE_FILES_unitTest tests/test-clp_s-end_to_end.cpp tests/test-EncodedVariableInterpreter.cpp tests/test-encoding_methods.cpp + tests/test-error_handling.cpp tests/test-ffi_IrUnitHandlerInterface.cpp tests/test-ffi_KeyValuePairLogEvent.cpp tests/test-ffi_SchemaTree.cpp diff --git 
a/components/core/src/clp/error_handling/ErrorCode.hpp b/components/core/src/clp/error_handling/ErrorCode.hpp new file mode 100644 index 000000000..2612e7768 --- /dev/null +++ b/components/core/src/clp/error_handling/ErrorCode.hpp @@ -0,0 +1,150 @@ +#ifndef CLP_ERROR_HANDLING_ERRORCODE_HPP +#define CLP_ERROR_HANDLING_ERRORCODE_HPP + +#include +#include +#include +#include + +namespace clp::error_handling { +/** + * Concept that defines a template parameter of an integer-based error code enumeration. + * @tparam Type + */ +template +concept ErrorCodeEnumType = std::is_enum_v && requires(Type type) { + { + static_cast>(type) + } -> std::convertible_to; +}; + +/** + * Template that defines a `std::error_category` of the given set of error code enumeration. + * @tparam ErrorCodeEnum + */ +template +class ErrorCategory : public std::error_category { +public: + // Methods implementing `std::error_category` + /** + * Gets the error category name. + * Note: A specialization must be explicitly implemented for each valid `ErrorCodeEnum`. + * @return The name of the error category. + */ + [[nodiscard]] auto name() const noexcept -> char const* override; + + /** + * Gets the descriptive message associated with the given error. + * @param error_num + * @return The descriptive message for the error. + */ + [[nodiscard]] auto message(int error_num) const -> std::string override { + return message(static_cast(error_num)); + } + + /** + * @param error_num + * @param condition + * @return Whether the error condition of the given error matches the given condition. + */ + [[nodiscard]] auto equivalent( + int error_num, + std::error_condition const& condition + ) const noexcept -> bool override { + return equivalent(static_cast(error_num), condition); + } + + // Methods + /** + * Gets the descriptive message associated with the given error. + * Note: A specialization must be explicitly implemented for each valid `ErrorCodeEnum`. + * @param error_enum. 
+ * @return The descriptive message for the error. + */ + [[nodiscard]] auto message(ErrorCodeEnum error_enum) const -> std::string; + + /** + * Note: A specialization can be implemented to create error enum to error condition mappings. + * @param error_num + * @param condition + * @return Whether the error condition of the given error matches the given condition. + */ + [[nodiscard]] auto equivalent( + ErrorCodeEnum error_enum, + std::error_condition const& condition + ) const noexcept -> bool; +}; + +/** + * Template class that defines an error code. An error code is represented by a error enum value and + * the associated error category. This template class is designed to be `std::error_code` + * compatible, meaning that every instance of this class can be used to construct a corresponded + * `std::error_code` instance, or compare with a `std::error_code` instance to inspect a specific + * error. + * @tparam ErrorCodeEnum + */ +template +class ErrorCode { +public: + // Constructor + ErrorCode(ErrorCodeEnum error) : m_error{error} {} + + /** + * @return The underlying error code enum. + */ + [[nodiscard]] auto get_error() const -> ErrorCodeEnum { return m_error; } + + /** + * @return The error code as an error number. + */ + [[nodiscard]] auto get_error_num() const -> int { return static_cast(m_error); } + + /** + * @return The reference to the singleton of the corresponded error category. + */ + [[nodiscard]] constexpr static auto get_category() -> ErrorCategory const& { + return cCategory; + } + +private: + static inline ErrorCategory const cCategory; + + ErrorCodeEnum m_error; +}; + +/** + * @tparam ErrorCodeEnum + * @param error + * @return Constructed `std::error_code` from the given `ErrorCode` instance. 
+ */ +template +[[nodiscard]] auto make_error_code(ErrorCode error) -> std::error_code; + +template +auto ErrorCategory::equivalent( + ErrorCodeEnum error_enum, + std::error_condition const& condition +) const noexcept -> bool { + return std::error_category::default_error_condition(static_cast(error_enum)) == condition; +} + +template +auto make_error_code(ErrorCode error) -> std::error_code { + return {error.get_error_num(), ErrorCode::get_category()}; +} +} // namespace clp::error_handling + +/** + * The macro to create a specialization of `std::is_error_code_enum` for a given type T. Only types + * that are marked with this macro will be considered as a valid CLP error code enum, and thus used + * to specialize `ErrorCode` and `ErrorCategory` templates. + */ +// NOLINTBEGIN(bugprone-macro-parentheses, cppcoreguidelines-macro-usage) +#define CLP_ERROR_HANDLING_MARK_AS_ERROR_CODE_ENUM(T) \ + template <> \ + struct std::is_error_code_enum> : std::true_type { \ + static_assert(std::is_enum_v); \ + }; +// NOLINTEND(bugprone-macro-parentheses, cppcoreguidelines-macro-usage) + +#endif // CLP_ERROR_HANDLING_ERRORCODE_HPP diff --git a/components/core/tests/test-error_handling.cpp b/components/core/tests/test-error_handling.cpp new file mode 100644 index 000000000..2d640ed57 --- /dev/null +++ b/components/core/tests/test-error_handling.cpp @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../src/clp/error_handling/ErrorCode.hpp" + +using clp::error_handling::ErrorCategory; +using clp::error_handling::ErrorCode; +using std::string; +using std::string_view; + +namespace { +enum class AlwaysSuccessErrorCodeEnum : uint8_t { + Success = 0 +}; + +enum class BinaryErrorCodeEnum : uint8_t { + Success = 0, + Failure +}; + +using AlwaysSuccessErrorCode = ErrorCode; +using AlwaysSuccessErrorCategory = ErrorCategory; +using BinaryErrorCode = ErrorCode; +using BinaryErrorCategory = ErrorCategory; + +constexpr string_view 
cAlwaysSuccessErrorCategoryName{"Always Success Error Code"}; +constexpr string_view cBinaryTestErrorCategoryName{"Binary Error Code"}; +constexpr string_view cSuccessErrorMsg{"Success"}; +constexpr string_view cFailureErrorMsg{"Failure"}; +constexpr string_view cUnrecognizedErrorCode{"Unrecognized Error Code"}; +constexpr std::array cFailureConditions{std::errc::not_connected, std::errc::timed_out}; +constexpr std::array cNoneFailureConditions{std::errc::broken_pipe, std::errc::address_in_use}; +} // namespace + +CLP_ERROR_HANDLING_MARK_AS_ERROR_CODE_ENUM(AlwaysSuccessErrorCodeEnum); +CLP_ERROR_HANDLING_MARK_AS_ERROR_CODE_ENUM(BinaryErrorCodeEnum); + +template <> +auto AlwaysSuccessErrorCategory::name() const noexcept -> char const* { + return cAlwaysSuccessErrorCategoryName.data(); +} + +template <> +auto AlwaysSuccessErrorCategory::message(AlwaysSuccessErrorCodeEnum error_enum) const -> string { + switch (error_enum) { + case AlwaysSuccessErrorCodeEnum::Success: + return string{cSuccessErrorMsg}; + default: + return string{cUnrecognizedErrorCode}; + } +} + +template <> +auto BinaryErrorCategory::name() const noexcept -> char const* { + return cBinaryTestErrorCategoryName.data(); +} + +template <> +auto BinaryErrorCategory::message(BinaryErrorCodeEnum error_enum) const -> string { + switch (error_enum) { + case BinaryErrorCodeEnum::Success: + return string{cSuccessErrorMsg}; + case BinaryErrorCodeEnum::Failure: + return string{cFailureErrorMsg}; + default: + return string{cUnrecognizedErrorCode}; + } +} + +template <> +auto BinaryErrorCategory::equivalent( + BinaryErrorCodeEnum error_enum, + std::error_condition const& condition +) const noexcept -> bool { + switch (error_enum) { + case BinaryErrorCodeEnum::Failure: + return std::any_of( + cFailureConditions.cbegin(), + cFailureConditions.cend(), + [&](auto failure_condition) -> bool { return condition == failure_condition; } + ); + default: + return false; + } +} + +TEST_CASE("test_error_code_implementation", 
"[error_handling][ErrorCode]") { + // Test error codes within the same error category + BinaryErrorCode const success{BinaryErrorCodeEnum::Success}; + std::error_code const success_error_code{success}; + REQUIRE((success == success_error_code)); + REQUIRE((cSuccessErrorMsg == success_error_code.message())); + REQUIRE((BinaryErrorCode::get_category() == success_error_code.category())); + REQUIRE((cBinaryTestErrorCategoryName == success_error_code.category().name())); + + BinaryErrorCode const failure{BinaryErrorCodeEnum::Failure}; + std::error_code const failure_error_code{failure}; + REQUIRE((failure == failure_error_code)); + REQUIRE((cFailureErrorMsg == failure_error_code.message())); + REQUIRE((BinaryErrorCode::get_category() == failure_error_code.category())); + REQUIRE((cBinaryTestErrorCategoryName == failure_error_code.category().name())); + std::for_each( + cFailureConditions.cbegin(), + cFailureConditions.cend(), + [&](auto failure_condition) { REQUIRE((failure_error_code == failure_condition)); } + ); + std::for_each( + cNoneFailureConditions.cbegin(), + cNoneFailureConditions.cend(), + [&](auto none_failure_condition) { + REQUIRE((failure_error_code != none_failure_condition)); + } + ); + + REQUIRE((success_error_code != failure_error_code)); + REQUIRE((success_error_code.category() == failure_error_code.category())); + + AlwaysSuccessErrorCode const always_success{AlwaysSuccessErrorCodeEnum::Success}; + std::error_code const always_success_error_code{always_success}; + REQUIRE((always_success_error_code == always_success)); + REQUIRE((cSuccessErrorMsg == always_success_error_code.message())); + REQUIRE((AlwaysSuccessErrorCode::get_category() == always_success_error_code.category())); + REQUIRE((cAlwaysSuccessErrorCategoryName == always_success_error_code.category().name())); + + // Compare error codes from different error category + // Error codes that have the same value or message won't be the same with each other if they are + // from different error 
categories. + REQUIRE((success_error_code.value() == always_success_error_code.value())); + REQUIRE((success_error_code.message() == always_success_error_code.message())); + REQUIRE((success_error_code.category() != always_success_error_code.category())); + REQUIRE((success_error_code != always_success_error_code)); + REQUIRE((AlwaysSuccessErrorCode{AlwaysSuccessErrorCodeEnum::Success} != success_error_code)); + REQUIRE((BinaryErrorCode{BinaryErrorCodeEnum::Success} != always_success_error_code)); +} From 4d21d9b04c42e3f9de2ddfa19491240e9cb99ce4 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:58:02 -0500 Subject: [PATCH 44/65] revert(core): Remove temporary output directory option from `clp` and `clo`. (#619) --- components/core/src/clp/clo/CommandLineArguments.cpp | 8 -------- components/core/src/clp/clo/CommandLineArguments.hpp | 5 ----- components/core/src/clp/clo/clo.cpp | 2 +- components/core/src/clp/clp/CommandLineArguments.cpp | 11 ----------- components/core/src/clp/clp/CommandLineArguments.hpp | 3 --- components/core/src/clp/clp/decompression.cpp | 2 +- 6 files changed, 2 insertions(+), 29 deletions(-) diff --git a/components/core/src/clp/clo/CommandLineArguments.cpp b/components/core/src/clp/clo/CommandLineArguments.cpp index fffc3d783..4e187f985 100644 --- a/components/core/src/clp/clo/CommandLineArguments.cpp +++ b/components/core/src/clp/clo/CommandLineArguments.cpp @@ -181,10 +181,6 @@ auto CommandLineArguments::parse_ir_extraction_arguments( // clang-format off options_ir_extraction .add_options()( - "temp-output-dir", - po::value(&m_ir_temp_output_dir)->value_name("DIR"), - "Temporary output directory for IR chunks while they're being written" - )( "target-size", po::value(&m_ir_target_size)->value_name("SIZE"), "Target size (B) for each IR chunk before a new chunk is created" @@ -287,10 +283,6 @@ auto CommandLineArguments::parse_ir_extraction_arguments( if (m_ir_mongodb_collection.empty()) 
{ throw invalid_argument("COLLECTION not specified or empty."); } - - if (m_ir_temp_output_dir.empty()) { - m_ir_temp_output_dir = m_ir_output_dir; - } return ParsingResult::Success; } diff --git a/components/core/src/clp/clo/CommandLineArguments.hpp b/components/core/src/clp/clo/CommandLineArguments.hpp index 9e6d311c3..d84b96a18 100644 --- a/components/core/src/clp/clo/CommandLineArguments.hpp +++ b/components/core/src/clp/clo/CommandLineArguments.hpp @@ -54,10 +54,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { [[nodiscard]] auto get_ir_output_dir() const -> std::string const& { return m_ir_output_dir; } - [[nodiscard]] auto get_ir_temp_output_dir() const -> std::string const& { - return m_ir_temp_output_dir; - } - [[nodiscard]] auto get_ir_mongodb_uri() const -> std::string const& { return m_ir_mongodb_uri; } [[nodiscard]] auto get_ir_mongodb_collection() const -> std::string const& { @@ -187,7 +183,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { std::string m_file_split_id; size_t m_ir_target_size{128ULL * 1024 * 1024}; std::string m_ir_output_dir; - std::string m_ir_temp_output_dir; std::string m_ir_mongodb_uri; std::string m_ir_mongodb_collection; diff --git a/components/core/src/clp/clo/clo.cpp b/components/core/src/clp/clo/clo.cpp index f29df0306..23ff6f67e 100644 --- a/components/core/src/clp/clo/clo.cpp +++ b/components/core/src/clp/clo/clo.cpp @@ -224,7 +224,7 @@ bool extract_ir(CommandLineArguments const& command_line_args) { archive_reader, *file_metadata_ix_ptr, command_line_args.get_ir_target_size(), - command_line_args.get_ir_temp_output_dir(), + command_line_args.get_ir_output_dir(), ir_output_handler )) { diff --git a/components/core/src/clp/clp/CommandLineArguments.cpp b/components/core/src/clp/clp/CommandLineArguments.cpp index ccdc99793..cb44d96d8 100644 --- a/components/core/src/clp/clp/CommandLineArguments.cpp +++ b/components/core/src/clp/clp/CommandLineArguments.cpp @@ -255,13 +255,6 @@ 
CommandLineArguments::parse_arguments(int argc, char const* argv[]) { ->default_value(m_ir_target_size), "Target size (B) for each IR chunk before a new chunk is created" ); - options_ir.add_options()( - "temp-output-dir", - po::value(&m_ir_temp_output_dir) - ->value_name("DIR") - ->default_value(m_ir_temp_output_dir), - "Temporary output directory for IR chunks while they're being written" - ); po::options_description all_ir_options; all_ir_options.add(ir_positional_options); @@ -311,10 +304,6 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { if (m_orig_file_id.empty()) { throw invalid_argument("ORIG_FILE_ID cannot be empty."); } - - if (m_ir_temp_output_dir.empty()) { - m_ir_temp_output_dir = m_output_dir; - } } else if (Command::Compress == m_command) { // Define compression hidden positional options po::options_description compression_positional_options; diff --git a/components/core/src/clp/clp/CommandLineArguments.hpp b/components/core/src/clp/clp/CommandLineArguments.hpp index b9cf15740..6e14a4b3b 100644 --- a/components/core/src/clp/clp/CommandLineArguments.hpp +++ b/components/core/src/clp/clp/CommandLineArguments.hpp @@ -37,8 +37,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { std::string const& get_path_prefix_to_remove() const { return m_path_prefix_to_remove; } - std::string const& get_ir_temp_output_dir() const { return m_ir_temp_output_dir; } - std::string const& get_output_dir() const { return m_output_dir; } std::string const& get_schema_file_path() const { return m_schema_file_path; } @@ -91,7 +89,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { size_t m_ir_msg_ix{0}; size_t m_ir_target_size{128ULL * 1024 * 1024}; bool m_sort_input_files; - std::string m_ir_temp_output_dir; std::string m_output_dir; std::string m_schema_file_path; bool m_show_progress; diff --git a/components/core/src/clp/clp/decompression.cpp b/components/core/src/clp/clp/decompression.cpp index 6b87f6777..b8ae06350 
100644 --- a/components/core/src/clp/clp/decompression.cpp +++ b/components/core/src/clp/clp/decompression.cpp @@ -310,7 +310,7 @@ bool decompress_to_ir(CommandLineArguments& command_line_args) { archive_reader, *file_metadata_ix_ptr, command_line_args.get_ir_target_size(), - command_line_args.get_ir_temp_output_dir(), + command_line_args.get_output_dir(), ir_output_handler )) { From 36892c1560772176b6d7ace258a4a50ea70dcb18 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:34:52 -0500 Subject: [PATCH 45/65] refactor(clp-package): Unify the metadata schema for JSON and IR streams. (#620) --- components/core/src/clp/clo/OutputHandler.cpp | 2 +- components/core/src/clp/clo/clo.cpp | 12 ++++-------- components/core/src/clp/clo/constants.hpp | 7 +++---- .../core/src/clp/clp/FileDecompressor.hpp | 2 +- components/core/src/clp/clp/decompression.cpp | 2 +- components/core/src/clp_s/JsonConstructor.cpp | 4 ++-- .../core/src/clp_s/archive_constants.hpp | 4 ++-- .../log-viewer-webui/client/src/api/query.js | 19 ++++--------------- .../log-viewer-webui/server/src/DbManager.js | 2 +- 9 files changed, 19 insertions(+), 35 deletions(-) diff --git a/components/core/src/clp/clo/OutputHandler.cpp b/components/core/src/clp/clo/OutputHandler.cpp index bdf1bb1bd..1d92777c5 100644 --- a/components/core/src/clp/clo/OutputHandler.cpp +++ b/components/core/src/clp/clo/OutputHandler.cpp @@ -100,7 +100,7 @@ ErrorCode ResultsCacheOutputHandler::flush() { try { m_results.emplace_back(std::move(bsoncxx::builder::basic::make_document( bsoncxx::builder::basic::kvp( - cResultsCacheKeys::OrigFileId, + cResultsCacheKeys::SearchOutput::OrigFileId, std::move(result.orig_file_id) ), bsoncxx::builder::basic::kvp( diff --git a/components/core/src/clp/clo/clo.cpp b/components/core/src/clp/clo/clo.cpp index 23ff6f67e..d62049e6b 100644 --- a/components/core/src/clp/clo/clo.cpp +++ b/components/core/src/clp/clo/clo.cpp @@ -171,7 +171,7 @@ bool 
extract_ir(CommandLineArguments const& command_line_args) { string const& orig_file_id, size_t begin_message_ix, size_t end_message_ix, - bool is_last_ir_chunk) { + bool is_last_chunk) { auto dest_ir_file_name = orig_file_id; dest_ir_file_name += "_" + std::to_string(begin_message_ix); dest_ir_file_name += "_" + std::to_string(end_message_ix); @@ -195,13 +195,9 @@ bool extract_ir(CommandLineArguments const& command_line_args) { dest_ir_file_name ), bsoncxx::builder::basic::kvp( - clp::clo::cResultsCacheKeys::OrigFileId, + clp::clo::cResultsCacheKeys::IrOutput::StreamId, orig_file_id ), - bsoncxx::builder::basic::kvp( - clp::clo::cResultsCacheKeys::IrOutput::FileSplitId, - file_split_id - ), bsoncxx::builder::basic::kvp( clp::clo::cResultsCacheKeys::IrOutput::BeginMsgIx, static_cast(begin_message_ix) @@ -211,8 +207,8 @@ bool extract_ir(CommandLineArguments const& command_line_args) { static_cast(end_message_ix) ), bsoncxx::builder::basic::kvp( - clp::clo::cResultsCacheKeys::IrOutput::IsLastIrChunk, - is_last_ir_chunk + clp::clo::cResultsCacheKeys::IrOutput::IsLastChunk, + is_last_chunk ) ))); return true; diff --git a/components/core/src/clp/clo/constants.hpp b/components/core/src/clp/clo/constants.hpp index 86f7313f2..945bde83e 100644 --- a/components/core/src/clp/clo/constants.hpp +++ b/components/core/src/clp/clo/constants.hpp @@ -3,17 +3,16 @@ // NOLINTBEGIN(cppcoreguidelines-avoid-c-arrays, readability-identifier-naming) namespace clp::clo::cResultsCacheKeys { -constexpr char OrigFileId[]{"orig_file_id"}; - namespace IrOutput { constexpr char Path[]{"path"}; -constexpr char FileSplitId[]{"file_split_id"}; +constexpr char StreamId[]{"stream_id"}; constexpr char BeginMsgIx[]{"begin_msg_ix"}; constexpr char EndMsgIx[]{"end_msg_ix"}; -constexpr char IsLastIrChunk[]{"is_last_ir_chunk"}; +constexpr char IsLastChunk[]{"is_last_chunk"}; } // namespace IrOutput namespace SearchOutput { +constexpr char OrigFileId[]{"orig_file_id"}; constexpr char 
OrigFilePath[]{"orig_file_path"}; constexpr char LogEventIx[]{"log_event_ix"}; constexpr char Timestamp[]{"timestamp"}; diff --git a/components/core/src/clp/clp/FileDecompressor.hpp b/components/core/src/clp/clp/FileDecompressor.hpp index 932cab7c5..b08a21eb4 100644 --- a/components/core/src/clp/clp/FileDecompressor.hpp +++ b/components/core/src/clp/clp/FileDecompressor.hpp @@ -39,7 +39,7 @@ class FileDecompressor { * * @tparam IrOutputHandler Function to handle the resulting IR chunks. * Signature: (std::filesystem::path const& ir_file_path, string const& orig_file_id, - * size_t begin_message_ix, size_t end_message_ix, bool is_last_ir_chunk) -> bool; + * size_t begin_message_ix, size_t end_message_ix, bool is_last_chunk) -> bool; * The function returns whether it succeeded. * @param archive_reader * @param file_metadata_ix diff --git a/components/core/src/clp/clp/decompression.cpp b/components/core/src/clp/clp/decompression.cpp index b8ae06350..c42357334 100644 --- a/components/core/src/clp/clp/decompression.cpp +++ b/components/core/src/clp/clp/decompression.cpp @@ -282,7 +282,7 @@ bool decompress_to_ir(CommandLineArguments& command_line_args) { string const& orig_file_id, size_t begin_message_ix, size_t end_message_ix, - [[maybe_unused]] bool is_last_ir_chunk) { + [[maybe_unused]] bool is_last_chunk) { auto dest_ir_file_name = orig_file_id; dest_ir_file_name += "_" + std::to_string(begin_message_ix); dest_ir_file_name += "_" + std::to_string(end_message_ix); diff --git a/components/core/src/clp_s/JsonConstructor.cpp b/components/core/src/clp_s/JsonConstructor.cpp index 95e3fa2c5..8886f2074 100644 --- a/components/core/src/clp_s/JsonConstructor.cpp +++ b/components/core/src/clp_s/JsonConstructor.cpp @@ -122,7 +122,7 @@ void JsonConstructor::construct_in_order() { new_file_path.filename() ), bsoncxx::builder::basic::kvp( - constants::results_cache::decompression::cOrigFileId, + constants::results_cache::decompression::cStreamId, m_option.archive_id ), 
bsoncxx::builder::basic::kvp( @@ -134,7 +134,7 @@ void JsonConstructor::construct_in_order() { last_idx ), bsoncxx::builder::basic::kvp( - constants::results_cache::decompression::cIsLastIrChunk, + constants::results_cache::decompression::cIsLastChunk, false == open_new_writer ) ))); diff --git a/components/core/src/clp_s/archive_constants.hpp b/components/core/src/clp_s/archive_constants.hpp index b76af2944..6dd7b6928 100644 --- a/components/core/src/clp_s/archive_constants.hpp +++ b/components/core/src/clp_s/archive_constants.hpp @@ -29,10 +29,10 @@ constexpr char cLogEventIdxName[] = "log_event_idx"; namespace results_cache::decompression { constexpr char cPath[]{"path"}; -constexpr char cOrigFileId[]{"orig_file_id"}; +constexpr char cStreamId[]{"stream_id"}; constexpr char cBeginMsgIx[]{"begin_msg_ix"}; constexpr char cEndMsgIx[]{"end_msg_ix"}; -constexpr char cIsLastIrChunk[]{"is_last_ir_chunk"}; +constexpr char cIsLastChunk[]{"is_last_chunk"}; } // namespace results_cache::decompression namespace results_cache::search { diff --git a/components/log-viewer-webui/client/src/api/query.js b/components/log-viewer-webui/client/src/api/query.js index eda1db21c..f48f610a1 100644 --- a/components/log-viewer-webui/client/src/api/query.js +++ b/components/log-viewer-webui/client/src/api/query.js @@ -2,22 +2,11 @@ import axios from "axios"; /** - * @typedef {object} ExtractIrResp + * @typedef {object} ExtractStreamResp + * @property {string} stream_id * @property {number} begin_msg_ix * @property {number} end_msg_ix - * @property {string} file_split_id - * @property {boolean} is_last_ir_chunk - * @property {string} orig_file_id - * @property {string} path - * @property {string} _id - */ - -/** - * @typedef {object} ExtractJsonResp - * @property {number} begin_msg_ix - * @property {number} end_msg_ix - * @property {boolean} is_last_ir_chunk - * @property {string} orig_file_id + * @property {boolean} is_last_chunk * @property {string} path * @property {string} _id */ @@ 
-30,7 +19,7 @@ import axios from "axios"; * @param {string} streamId * @param {number} logEventIdx * @param {Function} onUploadProgress Callback to handle upload progress events. - * @return {Promise>} + * @return {Promise>} */ const submitExtractStreamJob = async (extractJobType, streamId, logEventIdx, onUploadProgress) => { return await axios.post( diff --git a/components/log-viewer-webui/server/src/DbManager.js b/components/log-viewer-webui/server/src/DbManager.js index e1ec00812..fc48ba5e8 100644 --- a/components/log-viewer-webui/server/src/DbManager.js +++ b/components/log-viewer-webui/server/src/DbManager.js @@ -171,7 +171,7 @@ class DbManager { */ async getExtractedStreamFileMetadata (streamId, logEventIdx) { return await this.#streamFilesCollection.findOne({ - orig_file_id: streamId, + stream_id: streamId, begin_msg_ix: {$lte: logEventIdx}, end_msg_ix: {$gt: logEventIdx}, }); From 604bd75d36657c1efb25233e7dd6611643a46cb9 Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Mon, 9 Dec 2024 11:35:29 -0500 Subject: [PATCH 46/65] feat(clp-s): Add command line options for stubbed out kv-pair-IR ingestion. 
(#618) --- .../core/src/clp_s/CommandLineArguments.cpp | 23 +++++++++++++++++++ .../core/src/clp_s/CommandLineArguments.hpp | 8 +++++++ components/core/src/clp_s/JsonParser.hpp | 2 ++ components/core/src/clp_s/clp-s.cpp | 13 +++++++++-- 4 files changed, 44 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 99539b627..c7fb9487e 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -148,6 +148,9 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { po::options_description compression_options("Compression options"); std::string metadata_db_config_file_path; std::string input_path_list_file_path; + constexpr std::string_view cJsonFileType{"json"}; + constexpr std::string_view cKeyValueIrFileType{"kv-ir"}; + std::string file_type{cJsonFileType}; // clang-format off compression_options.add_options()( "compression-level", @@ -202,6 +205,10 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { "disable-log-order", po::bool_switch(&m_disable_log_order), "Do not record log order at ingestion time." 
+ )( + "file-type", + po::value(&file_type)->value_name("FILE_TYPE")->default_value(file_type), + "The type of file being compressed (json or kv-ir)" ); // clang-format on @@ -255,6 +262,22 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { throw std::invalid_argument("No input paths specified."); } + if (cJsonFileType == file_type) { + m_file_type = FileType::Json; + } else if (cKeyValueIrFileType == file_type) { + m_file_type = FileType::KeyValueIr; + if (m_structurize_arrays) { + SPDLOG_ERROR( + "Invalid combination of arguments; --file-type {} and " + "--structurize-arrays can't be used together", + cKeyValueIrFileType + ); + return ParsingResult::Failure; + } + } else { + throw std::invalid_argument("Unknown FILE_TYPE: " + file_type); + } + // Parse and validate global metadata DB config if (false == metadata_db_config_file_path.empty()) { clp::GlobalMetadataDBConfig metadata_db_config; diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp index a87e9b6bd..47c244646 100644 --- a/components/core/src/clp_s/CommandLineArguments.hpp +++ b/components/core/src/clp_s/CommandLineArguments.hpp @@ -36,6 +36,11 @@ class CommandLineArguments { Stdout, }; + enum class FileType : uint8_t { + Json = 0, + KeyValueIr + }; + // Constructors explicit CommandLineArguments(std::string const& program_name) : m_program_name(program_name) {} @@ -116,6 +121,8 @@ class CommandLineArguments { bool get_record_log_order() const { return false == m_disable_log_order; } + [[nodiscard]] auto get_file_type() const -> FileType { return m_file_type; } + private: // Methods /** @@ -184,6 +191,7 @@ class CommandLineArguments { size_t m_target_ordered_chunk_size{}; size_t m_minimum_table_size{1ULL * 1024 * 1024}; // 1 MB bool m_disable_log_order{false}; + FileType m_file_type{FileType::Json}; // Metadata db variables std::optional m_metadata_db_config; diff --git a/components/core/src/clp_s/JsonParser.hpp 
b/components/core/src/clp_s/JsonParser.hpp index bfd423c22..c05ab9d60 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -12,6 +12,7 @@ #include "../clp/GlobalMySQLMetadataDB.hpp" #include "ArchiveWriter.hpp" +#include "CommandLineArguments.hpp" #include "DictionaryWriter.hpp" #include "FileReader.hpp" #include "FileWriter.hpp" @@ -29,6 +30,7 @@ using namespace simdjson; namespace clp_s { struct JsonParserOption { std::vector file_paths; + CommandLineArguments::FileType input_file_type{CommandLineArguments::FileType::Json}; std::string timestamp_key; std::string archives_dir; size_t target_encoded_size{}; diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index b76683caf..2c6639290 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -88,6 +88,7 @@ bool compress(CommandLineArguments const& command_line_arguments) { clp_s::JsonParserOption option{}; option.file_paths = command_line_arguments.get_file_paths(); + option.input_file_type = command_line_arguments.get_file_type(); option.archives_dir = archives_dir.string(); option.target_encoded_size = command_line_arguments.get_target_encoded_size(); option.max_document_size = command_line_arguments.get_max_document_size(); @@ -113,9 +114,17 @@ bool compress(CommandLineArguments const& command_line_arguments) { } clp_s::JsonParser parser(option); - if (false == parser.parse()) { - SPDLOG_ERROR("Encountered error while parsing input"); + if (CommandLineArguments::FileType::KeyValueIr == option.input_file_type) { + // Functionality Coming in later PR + // -->Call new parsing function in Json Parser to parse IRv2 to archive + // -->Check for error from parsing function + SPDLOG_ERROR("Compressing Key Value IR Files is not yet supported"); return false; + } else { + if (false == parser.parse()) { + SPDLOG_ERROR("Encountered error while parsing input"); + return false; + } } parser.store(); return 
true; From 0a9322b9e86921f26690a68684c11dee4f012efd Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Mon, 9 Dec 2024 11:44:26 -0500 Subject: [PATCH 47/65] feat(ffi): Add initial implementation of `IrErrorCode` (using the `ErrorCode` template) which will replace the `IRErrorCode` enum. (#623) --- components/core/CMakeLists.txt | 2 ++ .../src/clp/ffi/ir_stream/IrErrorCode.cpp | 26 +++++++++++++++++++ .../src/clp/ffi/ir_stream/IrErrorCode.hpp | 24 +++++++++++++++++ components/core/tests/test-error_handling.cpp | 15 +++++++++++ 4 files changed, 67 insertions(+) create mode 100644 components/core/src/clp/ffi/ir_stream/IrErrorCode.cpp create mode 100644 components/core/src/clp/ffi/ir_stream/IrErrorCode.hpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index f974e5c7e..f15d14405 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -400,6 +400,8 @@ set(SOURCE_FILES_unitTest src/clp/ffi/ir_stream/decoding_methods.inc src/clp/ffi/ir_stream/encoding_methods.cpp src/clp/ffi/ir_stream/encoding_methods.hpp + src/clp/ffi/ir_stream/IrErrorCode.cpp + src/clp/ffi/ir_stream/IrErrorCode.hpp src/clp/ffi/ir_stream/IrUnitHandlerInterface.hpp src/clp/ffi/ir_stream/IrUnitType.hpp src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp diff --git a/components/core/src/clp/ffi/ir_stream/IrErrorCode.cpp b/components/core/src/clp/ffi/ir_stream/IrErrorCode.cpp new file mode 100644 index 000000000..f9a00ca1e --- /dev/null +++ b/components/core/src/clp/ffi/ir_stream/IrErrorCode.cpp @@ -0,0 +1,26 @@ +#include "IrErrorCode.hpp" + +#include + +using IrErrorCategory = clp::error_handling::ErrorCategory; +using clp::ffi::ir_stream::IrErrorCodeEnum; + +template <> +auto IrErrorCategory::name() const noexcept -> char const* { + return "clp::ffi::ir_stream::IrErrorCode"; +} + +template <> +auto IrErrorCategory::message(IrErrorCodeEnum error_enum) const -> std::string { + switch (error_enum) { 
+ case IrErrorCodeEnum::DecodingMethodFailure: + return "The decoding method failed."; + case IrErrorCodeEnum::EndOfStream: + return "The end-of-stream IR unit has already been consumed."; + case IrErrorCodeEnum::IncompleteStream: + return "The IR stream ended with a truncated IR unit or did not terminate with an " + "end-of-stream IR unit."; + default: + return "Unknown error code enum."; + } +} diff --git a/components/core/src/clp/ffi/ir_stream/IrErrorCode.hpp b/components/core/src/clp/ffi/ir_stream/IrErrorCode.hpp new file mode 100644 index 000000000..8eaad4e16 --- /dev/null +++ b/components/core/src/clp/ffi/ir_stream/IrErrorCode.hpp @@ -0,0 +1,24 @@ +#ifndef CLP_IRERRORCODE_HPP +#define CLP_IRERRORCODE_HPP + +#include + +#include "../../error_handling/ErrorCode.hpp" + +namespace clp::ffi::ir_stream { +/** + * This enum class represents all possible error codes related to serializing or deserializing CLP + * IR streams. + */ +enum class IrErrorCodeEnum : uint8_t { + DecodingMethodFailure, + EndOfStream, + IncompleteStream, +}; + +using IrErrorCode = clp::error_handling::ErrorCode; +} // namespace clp::ffi::ir_stream + +CLP_ERROR_HANDLING_MARK_AS_ERROR_CODE_ENUM(clp::ffi::ir_stream::IrErrorCodeEnum); + +#endif // CLP_IRERRORCODE_HPP diff --git a/components/core/tests/test-error_handling.cpp b/components/core/tests/test-error_handling.cpp index 2d640ed57..44327c833 100644 --- a/components/core/tests/test-error_handling.cpp +++ b/components/core/tests/test-error_handling.cpp @@ -9,6 +9,7 @@ #include #include "../src/clp/error_handling/ErrorCode.hpp" +#include "../src/clp/ffi/ir_stream/IrErrorCode.hpp" using clp::error_handling::ErrorCategory; using clp::error_handling::ErrorCode; @@ -139,3 +140,17 @@ TEST_CASE("test_error_code_implementation", "[error_handling][ErrorCode]") { REQUIRE((AlwaysSuccessErrorCode{AlwaysSuccessErrorCodeEnum::Success} != success_error_code)); REQUIRE((BinaryErrorCode{BinaryErrorCodeEnum::Success} != always_success_error_code)); } + 
+TEST_CASE("test_ir_error_code", "[error_handling][ErrorCode][IrErrorCode]") { + using clp::ffi::ir_stream::IrErrorCode; + using clp::ffi::ir_stream::IrErrorCodeEnum; + + auto assert_error_code_matches_error_code_enum = [](IrErrorCodeEnum error_code_enum) -> bool { + std::error_code const error_code{IrErrorCode{error_code_enum}}; + return error_code == IrErrorCode{error_code_enum}; + }; + + REQUIRE(assert_error_code_matches_error_code_enum(IrErrorCodeEnum::DecodingMethodFailure)); + REQUIRE(assert_error_code_matches_error_code_enum(IrErrorCodeEnum::EndOfStream)); + REQUIRE(assert_error_code_matches_error_code_enum(IrErrorCodeEnum::IncompleteStream)); +} From 78a535cc079632a047ec34503f0782578af5be65 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Mon, 9 Dec 2024 18:44:42 -0500 Subject: [PATCH 48/65] feat(ffi): Add support for auto/user-generated KV-pairs in `KeyValuePairLogEvent`; Detect and invalidate duplicate keys among non-leaf nodes when constructing a `KeyValuePairLogEvent`. 
(#558) Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../core/src/clp/ffi/KeyValuePairLogEvent.cpp | 289 ++++++++++---- .../core/src/clp/ffi/KeyValuePairLogEvent.hpp | 101 +++-- components/core/src/clp/ffi/SchemaTree.hpp | 6 + .../src/clp/ffi/ir_stream/Deserializer.hpp | 17 +- .../ir_unit_deserialization_methods.cpp | 7 +- .../ir_unit_deserialization_methods.hpp | 11 +- .../tests/test-ffi_IrUnitHandlerInterface.cpp | 12 +- .../tests/test-ffi_KeyValuePairLogEvent.cpp | 360 ++++++++++++++---- .../core/tests/test-ir_encoding_methods.cpp | 6 +- 9 files changed, 611 insertions(+), 198 deletions(-) diff --git a/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp b/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp index a8a8cf617..8e8bb15f5 100644 --- a/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp +++ b/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp @@ -153,6 +153,20 @@ node_type_matches_value_type(SchemaTree::Node::Type type, Value const& value) -> KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs ) -> bool; +/** + * @param node_id_value_pairs + * @param schema_tree + * @return A result containing a bitmap where every bit corresponds to the ID of a node in the + * schema tree, and the set bits correspond to the nodes in the subtree defined by all paths from + * the root node to the nodes in `node_id_value_pairs`; or an error code indicating a failure: + * - std::errc::result_out_of_range if a node ID in `node_id_value_pairs` doesn't exist in the + * schema tree. + */ +[[nodiscard]] auto get_schema_subtree_bitmap( + KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, + SchemaTree const& schema_tree +) -> OUTCOME_V2_NAMESPACE::std_result>; + /** * Inserts the given key-value pair into the JSON object (map). * @param node The schema tree node of the key to insert. 
@@ -175,6 +189,34 @@ node_type_matches_value_type(SchemaTree::Node::Type type, Value const& value) -> */ [[nodiscard]] auto decode_as_encoded_text_ast(Value const& val) -> std::optional; +/** + * Serializes the given node-ID-value pairs into a `nlohmann::json` object. + * @param schema_tree + * @param node_id_value_pairs + * @param schema_subtree_bitmap + * @return A result containing the serialized JSON object or an error code indicating the failure: + * - std::errc::protocol_error if a value in the log event couldn't be decoded, or it couldn't be + * inserted into a JSON object. + */ +[[nodiscard]] auto serialize_node_id_value_pairs_to_json( + SchemaTree const& schema_tree, + KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, + vector const& schema_subtree_bitmap +) -> OUTCOME_V2_NAMESPACE::std_result; + +/** + * @param node A non-root schema tree node. + * @param parent_node_id_to_key_names + * @return true if `node`'s key is unique among its sibling nodes with `parent_node_id_to_key_names` + * updated to keep track of this unique key name. + * @return false if a sibling of `node` has the same key. + */ +[[nodiscard]] auto check_key_uniqueness_among_sibling_nodes( + SchemaTree::Node const& node, + std::unordered_map>& + parent_node_id_to_key_names +) -> bool; + auto node_type_matches_value_type(SchemaTree::Node::Type type, Value const& value) -> bool { switch (type) { case SchemaTree::Node::Type::Obj: @@ -202,6 +244,7 @@ auto validate_node_id_value_pairs( try { std::unordered_map> parent_node_id_to_key_names; + std::vector key_duplication_checked_node_id_bitmap(schema_tree.get_size(), false); for (auto const& [node_id, value] : node_id_value_pairs) { auto const& node{schema_tree.get_node(node_id)}; if (node.is_root()) { @@ -226,20 +269,38 @@ auto validate_node_id_value_pairs( return std::errc::operation_not_permitted; } - // We checked that the node isn't the root above, so we can query the underlying ID - // safely without a repeated check. 
- auto const parent_node_id{node.get_parent_id_unsafe()}; - auto const key_name{node.get_key_name()}; - if (parent_node_id_to_key_names.contains(parent_node_id)) { - auto const [it, new_key_inserted]{ - parent_node_id_to_key_names.at(parent_node_id).emplace(key_name) - }; - if (false == new_key_inserted) { - // The key is duplicated under the same parent + if (false + == check_key_uniqueness_among_sibling_nodes(node, parent_node_id_to_key_names)) + { + return std::errc::protocol_not_supported; + } + + // Iteratively check if there's any key duplication in the node's ancestors until: + // 1. The ancestor has already been checked. We only need to check an ancestor node + // once since if there are key duplications among its siblings, it would've been + // caught when the sibling was first checked (the order in which siblings get checked + // doesn't affect the results). + // 2. We reach the root node. + auto next_ancestor_node_id_to_check{node.get_parent_id_unsafe()}; + while (false == key_duplication_checked_node_id_bitmap[next_ancestor_node_id_to_check]) + { + auto const& node_to_check{schema_tree.get_node(next_ancestor_node_id_to_check)}; + if (node_to_check.is_root()) { + key_duplication_checked_node_id_bitmap[node_to_check.get_id()] = true; + break; + } + + if (false + == check_key_uniqueness_among_sibling_nodes( + node_to_check, + parent_node_id_to_key_names + )) + { return std::errc::protocol_not_supported; } - } else { - parent_node_id_to_key_names.emplace(parent_node_id, std::unordered_set{key_name}); + + key_duplication_checked_node_id_bitmap[next_ancestor_node_id_to_check] = true; + next_ancestor_node_id_to_check = node_to_check.get_parent_id_unsafe(); } } } catch (SchemaTree::OperationFailed const& ex) { @@ -269,6 +330,38 @@ auto is_leaf_node( return true; } +auto get_schema_subtree_bitmap( + KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, + SchemaTree const& schema_tree +) -> OUTCOME_V2_NAMESPACE::std_result> { + vector 
schema_subtree_bitmap(schema_tree.get_size(), false); + for (auto const& [node_id, val] : node_id_value_pairs) { + if (node_id >= schema_subtree_bitmap.size()) { + return std::errc::result_out_of_range; + } + schema_subtree_bitmap[node_id] = true; + + // Iteratively mark the parents as true + auto optional_parent_id{schema_tree.get_node(node_id).get_parent_id()}; + while (true) { + // Ideally, we'd use this if statement as the loop condition, but clang-tidy will + // complain about an unchecked `optional` access. + if (false == optional_parent_id.has_value()) { + // Reached the root + break; + } + auto const parent_id{optional_parent_id.value()}; + if (schema_subtree_bitmap[parent_id]) { + // Parent already set by other child + break; + } + schema_subtree_bitmap[parent_id] = true; + optional_parent_id = schema_tree.get_node(parent_id).get_parent_id(); + } + } + return schema_subtree_bitmap; +} + auto insert_kv_pair_into_json_obj( SchemaTree::Node const& node, std::optional const& optional_val, @@ -332,54 +425,13 @@ auto decode_as_encoded_text_ast(Value const& val) -> std::optional { ? 
val.get_immutable_view().decode_and_unparse() : val.get_immutable_view().decode_and_unparse(); } -} // namespace - -auto KeyValuePairLogEvent::create( - std::shared_ptr schema_tree, - NodeIdValuePairs node_id_value_pairs, - UtcOffset utc_offset -) -> OUTCOME_V2_NAMESPACE::std_result { - if (auto const ret_val{validate_node_id_value_pairs(*schema_tree, node_id_value_pairs)}; - std::errc{} != ret_val) - { - return ret_val; - } - return KeyValuePairLogEvent{std::move(schema_tree), std::move(node_id_value_pairs), utc_offset}; -} - -auto KeyValuePairLogEvent::get_schema_subtree_bitmap( -) const -> OUTCOME_V2_NAMESPACE::std_result> { - auto schema_subtree_bitmap{vector(m_schema_tree->get_size(), false)}; - for (auto const& [node_id, val] : m_node_id_value_pairs) { - if (node_id >= schema_subtree_bitmap.size()) { - return std::errc::result_out_of_range; - } - schema_subtree_bitmap[node_id] = true; - - // Iteratively mark the parents as true - auto optional_parent_id{m_schema_tree->get_node(node_id).get_parent_id()}; - while (true) { - // Ideally, we'd use this if statement as the loop condition, but clang-tidy will - // complain about an unchecked `optional` access. 
- if (false == optional_parent_id.has_value()) { - // Reached the root - break; - } - auto const parent_id{optional_parent_id.value()}; - if (schema_subtree_bitmap[parent_id]) { - // Parent already set by other child - break; - } - schema_subtree_bitmap[parent_id] = true; - optional_parent_id = m_schema_tree->get_node(parent_id).get_parent_id(); - } - } - return schema_subtree_bitmap; -} -auto KeyValuePairLogEvent::serialize_to_json( -) const -> OUTCOME_V2_NAMESPACE::std_result { - if (m_node_id_value_pairs.empty()) { +auto serialize_node_id_value_pairs_to_json( + SchemaTree const& schema_tree, + KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, + vector const& schema_subtree_bitmap +) -> OUTCOME_V2_NAMESPACE::std_result { + if (node_id_value_pairs.empty()) { return nlohmann::json::object(); } @@ -393,12 +445,6 @@ auto KeyValuePairLogEvent::serialize_to_json( // vector grows). std::stack dfs_stack; - auto const schema_subtree_bitmap_ret{get_schema_subtree_bitmap()}; - if (schema_subtree_bitmap_ret.has_error()) { - return schema_subtree_bitmap_ret.error(); - } - auto const& schema_subtree_bitmap{schema_subtree_bitmap_ret.value()}; - // Traverse the schema tree in DFS order, but only traverse the nodes that are set in // `schema_subtree_bitmap`. // @@ -408,7 +454,7 @@ auto KeyValuePairLogEvent::serialize_to_json( // // On the way up, add the current node's `nlohmann::json::object_t` to the parent's // `nlohmann::json::object_t`. 
- auto const& root_schema_tree_node{m_schema_tree->get_root()}; + auto const& root_schema_tree_node{schema_tree.get_root()}; auto root_json_obj = nlohmann::json::object_t(); dfs_stack.emplace( @@ -424,13 +470,13 @@ auto KeyValuePairLogEvent::serialize_to_json( continue; } auto const child_schema_tree_node_id{top.get_next_child_schema_tree_node()}; - auto const& child_schema_tree_node{m_schema_tree->get_node(child_schema_tree_node_id)}; - if (m_node_id_value_pairs.contains(child_schema_tree_node_id)) { + auto const& child_schema_tree_node{schema_tree.get_node(child_schema_tree_node_id)}; + if (node_id_value_pairs.contains(child_schema_tree_node_id)) { // Handle leaf node if (false == insert_kv_pair_into_json_obj( child_schema_tree_node, - m_node_id_value_pairs.at(child_schema_tree_node_id), + node_id_value_pairs.at(child_schema_tree_node_id), top.get_json_obj() )) { @@ -452,4 +498,109 @@ auto KeyValuePairLogEvent::serialize_to_json( return root_json_obj; } + +auto check_key_uniqueness_among_sibling_nodes( + SchemaTree::Node const& node, + std::unordered_map>& + parent_node_id_to_key_names +) -> bool { + // The caller checks that the given node is not the root, so we can query the underlying + // parent ID safely without a check. 
+ auto const parent_node_id{node.get_parent_id_unsafe()}; + auto const key_name{node.get_key_name()}; + auto const parent_node_id_to_key_names_it{parent_node_id_to_key_names.find(parent_node_id)}; + if (parent_node_id_to_key_names_it != parent_node_id_to_key_names.end()) { + auto const [it, new_key_inserted]{parent_node_id_to_key_names_it->second.emplace(key_name)}; + if (false == new_key_inserted) { + // The key is duplicated under the same parent + return false; + } + } else { + parent_node_id_to_key_names.emplace(parent_node_id, std::unordered_set{key_name}); + } + return true; +} +} // namespace + +auto KeyValuePairLogEvent::create( + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + NodeIdValuePairs auto_gen_node_id_value_pairs, + NodeIdValuePairs user_gen_node_id_value_pairs, + UtcOffset utc_offset +) -> OUTCOME_V2_NAMESPACE::std_result { + if (nullptr == auto_gen_keys_schema_tree || nullptr == user_gen_keys_schema_tree) { + return std::errc::invalid_argument; + } + + if (auto const ret_val{validate_node_id_value_pairs( + *auto_gen_keys_schema_tree, + auto_gen_node_id_value_pairs + )}; + std::errc{} != ret_val) + { + return ret_val; + } + + if (auto const ret_val{validate_node_id_value_pairs( + *user_gen_keys_schema_tree, + user_gen_node_id_value_pairs + )}; + std::errc{} != ret_val) + { + return ret_val; + } + + return KeyValuePairLogEvent{ + std::move(auto_gen_keys_schema_tree), + std::move(user_gen_keys_schema_tree), + std::move(auto_gen_node_id_value_pairs), + std::move(user_gen_node_id_value_pairs), + utc_offset + }; +} + +auto KeyValuePairLogEvent::get_auto_gen_keys_schema_subtree_bitmap( +) const -> OUTCOME_V2_NAMESPACE::std_result> { + return get_schema_subtree_bitmap(m_auto_gen_node_id_value_pairs, *m_auto_gen_keys_schema_tree); +} + +auto KeyValuePairLogEvent::get_user_gen_keys_schema_subtree_bitmap( +) const -> outcome_v2::std_result> { + return get_schema_subtree_bitmap(m_user_gen_node_id_value_pairs, 
*m_user_gen_keys_schema_tree); +} + +auto KeyValuePairLogEvent::serialize_to_json( +) const -> OUTCOME_V2_NAMESPACE::std_result> { + auto const auto_gen_keys_schema_subtree_bitmap_result{get_auto_gen_keys_schema_subtree_bitmap() + }; + if (auto_gen_keys_schema_subtree_bitmap_result.has_error()) { + return auto_gen_keys_schema_subtree_bitmap_result.error(); + } + auto serialized_auto_gen_kv_pairs_result{serialize_node_id_value_pairs_to_json( + *m_auto_gen_keys_schema_tree, + m_auto_gen_node_id_value_pairs, + auto_gen_keys_schema_subtree_bitmap_result.value() + )}; + if (serialized_auto_gen_kv_pairs_result.has_error()) { + return serialized_auto_gen_kv_pairs_result.error(); + } + + auto const user_gen_keys_schema_subtree_bitmap_result{get_user_gen_keys_schema_subtree_bitmap() + }; + if (user_gen_keys_schema_subtree_bitmap_result.has_error()) { + return user_gen_keys_schema_subtree_bitmap_result.error(); + } + auto serialized_user_gen_kv_pairs_result{serialize_node_id_value_pairs_to_json( + *m_user_gen_keys_schema_tree, + m_user_gen_node_id_value_pairs, + user_gen_keys_schema_subtree_bitmap_result.value() + )}; + if (serialized_user_gen_kv_pairs_result.has_error()) { + return serialized_user_gen_kv_pairs_result.error(); + } + + return {std::move(serialized_auto_gen_kv_pairs_result.value()), + std::move(serialized_user_gen_kv_pairs_result.value())}; +} } // namespace clp::ffi diff --git a/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp b/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp index f6334d378..2929c7498 100644 --- a/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp +++ b/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp @@ -17,10 +17,13 @@ namespace clp::ffi { /** * A log event containing key-value pairs. Each event contains: - * - A collection of node-ID & value pairs, where each pair represents a leaf `SchemaTreeNode` in - * the `SchemaTree`. 
- * - A reference to the `SchemaTree` - * - The UTC offset of the current log event + * - A reference to the schema tree for auto-generated keys. + * - A reference to the schema tree for user-generated keys. + * - A collection of auto-generated node-ID & value pairs, where each pair represents a leaf + * `SchemaTree::Node` in the schema tree for auto-generated keys. + * - A collection of user-generated node-ID & value pairs, where each pair represents a leaf + * `SchemaTree::Node` in the schema tree for user-generated keys. + * - The UTC offset of the current log event. */ class KeyValuePairLogEvent { public: @@ -29,15 +32,21 @@ class KeyValuePairLogEvent { // Factory functions /** - * @param schema_tree - * @param node_id_value_pairs + * @param auto_gen_keys_schema_tree + * @param user_gen_keys_schema_tree + * @param auto_gen_node_id_value_pairs + * @param user_gen_node_id_value_pairs * @param utc_offset * @return A result containing the key-value pair log event or an error code indicating the - * failure. See `validate_node_id_value_pairs` for the possible error codes. + * failure: + * - std::errc::invalid_argument if any of the given schema tree pointers are null. + * - Forwards `validate_node_id_value_pairs`'s return values. 
*/ [[nodiscard]] static auto create( - std::shared_ptr schema_tree, - NodeIdValuePairs node_id_value_pairs, + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + NodeIdValuePairs auto_gen_node_id_value_pairs, + NodeIdValuePairs user_gen_node_id_value_pairs, UtcOffset utc_offset ) -> OUTCOME_V2_NAMESPACE::std_result; @@ -53,51 +62,77 @@ class KeyValuePairLogEvent { ~KeyValuePairLogEvent() = default; // Methods - [[nodiscard]] auto get_schema_tree() const -> SchemaTree const& { return *m_schema_tree; } + [[nodiscard]] auto get_auto_gen_keys_schema_tree() const -> SchemaTree const& { + return *m_auto_gen_keys_schema_tree; + } - [[nodiscard]] auto get_node_id_value_pairs() const -> NodeIdValuePairs const& { - return m_node_id_value_pairs; + [[nodiscard]] auto get_user_gen_keys_schema_tree() const -> SchemaTree const& { + return *m_user_gen_keys_schema_tree; } - [[nodiscard]] auto get_utc_offset() const -> UtcOffset { return m_utc_offset; } + [[nodiscard]] auto get_auto_gen_node_id_value_pairs() const -> NodeIdValuePairs const& { + return m_auto_gen_node_id_value_pairs; + } + + [[nodiscard]] auto get_user_gen_node_id_value_pairs() const -> NodeIdValuePairs const& { + return m_user_gen_node_id_value_pairs; + } /** * @return A result containing a bitmap where every bit corresponds to the ID of a node in the - * schema tree, and the set bits correspond to the nodes in the subtree defined by all paths - * from the root node to the nodes in `node_id_value_pairs`; or an error code indicating a - * failure: - * - std::errc::result_out_of_range if a node ID in `node_id_value_pairs` doesn't exist in the - * schema tree. + * schema tree for auto-generated keys, and the set bits correspond to the nodes in the subtree + * defined by all paths from the root node to the nodes in `m_auto_gen_node_id_value_pairs`; or + * an error code indicating a failure: + * - Forwards `get_schema_subtree_bitmap`'s return values. 
*/ - [[nodiscard]] auto get_schema_subtree_bitmap( + [[nodiscard]] auto get_auto_gen_keys_schema_subtree_bitmap( ) const -> OUTCOME_V2_NAMESPACE::std_result>; /** - * Serializes the log event into a `nlohmann::json` object. - * @return A result containing the serialized JSON object or an error code indicating the - * failure: - * - std::errc::protocol_error if a value in the log event couldn't be decoded or it couldn't be - * inserted into a JSON object. - * - std::errc::result_out_of_range if a node ID in the log event doesn't exist in the schema - * tree. + * @return A result containing a bitmap where every bit corresponds to the ID of a node in the + * schema tree for user-generated keys, and the set bits correspond to the nodes in the subtree + * defined by all paths from the root node to the nodes in `m_user_gen_node_id_value_pairs`; or + * an error code indicating a failure: + * - Forwards `get_schema_subtree_bitmap`'s return values. + */ + [[nodiscard]] auto get_user_gen_keys_schema_subtree_bitmap( + ) const -> OUTCOME_V2_NAMESPACE::std_result>; + + [[nodiscard]] auto get_utc_offset() const -> UtcOffset { return m_utc_offset; } + + /** + * Serializes the log event into `nlohmann::json` objects. + * @return A result containing a pair or an error code indicating the failure: + * - The pair: + * - Serialized auto-generated key-value pairs as a JSON object + * - Serialized user-generated key-value pairs as a JSON object + * - The possible error codes: + * - Forwards `get_auto_gen_keys_schema_subtree_bitmap`'s return values on failure. + * - Forwards `serialize_node_id_value_pairs_to_json`'s return values on failure. 
*/ [[nodiscard]] auto serialize_to_json( - ) const -> OUTCOME_V2_NAMESPACE::std_result; + ) const -> OUTCOME_V2_NAMESPACE::std_result>; private: // Constructor KeyValuePairLogEvent( - std::shared_ptr schema_tree, - NodeIdValuePairs node_id_value_pairs, + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + NodeIdValuePairs auto_gen_node_id_value_pairs, + NodeIdValuePairs user_gen_node_id_value_pairs, UtcOffset utc_offset ) - : m_schema_tree{std::move(schema_tree)}, - m_node_id_value_pairs{std::move(node_id_value_pairs)}, + : m_auto_gen_keys_schema_tree{std::move(auto_gen_keys_schema_tree)}, + m_user_gen_keys_schema_tree{std::move(user_gen_keys_schema_tree)}, + m_auto_gen_node_id_value_pairs{std::move(auto_gen_node_id_value_pairs)}, + m_user_gen_node_id_value_pairs{std::move(user_gen_node_id_value_pairs)}, m_utc_offset{utc_offset} {} // Variables - std::shared_ptr m_schema_tree; - NodeIdValuePairs m_node_id_value_pairs; + std::shared_ptr m_auto_gen_keys_schema_tree; + std::shared_ptr m_user_gen_keys_schema_tree; + NodeIdValuePairs m_auto_gen_node_id_value_pairs; + NodeIdValuePairs m_user_gen_node_id_value_pairs; UtcOffset m_utc_offset{0}; }; } // namespace clp::ffi diff --git a/components/core/src/clp/ffi/SchemaTree.hpp b/components/core/src/clp/ffi/SchemaTree.hpp index 46494fa71..4efbbf81e 100644 --- a/components/core/src/clp/ffi/SchemaTree.hpp +++ b/components/core/src/clp/ffi/SchemaTree.hpp @@ -128,6 +128,8 @@ class SchemaTree { ~Node() = default; // Methods + [[nodiscard]] auto operator==(Node const& rhs) const -> bool = default; + [[nodiscard]] auto get_id() const -> id_t { return m_id; } [[nodiscard]] auto is_root() const -> bool { return false == m_parent_id.has_value(); } @@ -249,6 +251,10 @@ class SchemaTree { ~SchemaTree() = default; // Methods + [[nodiscard]] auto operator==(SchemaTree const& rhs) const -> bool { + return m_tree_nodes == rhs.m_tree_nodes; + } + [[nodiscard]] auto get_size() const -> size_t { return 
m_tree_nodes.size(); } [[nodiscard]] auto get_root() const -> Node const& { return m_tree_nodes[cRootId]; } diff --git a/components/core/src/clp/ffi/ir_stream/Deserializer.hpp b/components/core/src/clp/ffi/ir_stream/Deserializer.hpp index 3418a39ae..d31699cd2 100644 --- a/components/core/src/clp/ffi/ir_stream/Deserializer.hpp +++ b/components/core/src/clp/ffi/ir_stream/Deserializer.hpp @@ -115,7 +115,8 @@ class Deserializer { Deserializer(IrUnitHandler ir_unit_handler) : m_ir_unit_handler{std::move(ir_unit_handler)} {} // Variables - std::shared_ptr m_schema_tree{std::make_shared()}; + std::shared_ptr m_auto_gen_keys_schema_tree{std::make_shared()}; + std::shared_ptr m_user_gen_keys_schema_tree{std::make_shared()}; UtcOffset m_utc_offset{0}; IrUnitHandler m_ir_unit_handler; bool m_is_complete{false}; @@ -183,9 +184,13 @@ auto Deserializer::deserialize_next_ir_unit(ReaderInterface& read auto const ir_unit_type{optional_ir_unit_type.value()}; switch (ir_unit_type) { case IrUnitType::LogEvent: { - auto result{ - deserialize_ir_unit_kv_pair_log_event(reader, tag, m_schema_tree, m_utc_offset) - }; + auto result{deserialize_ir_unit_kv_pair_log_event( + reader, + tag, + m_auto_gen_keys_schema_tree, + m_user_gen_keys_schema_tree, + m_utc_offset + )}; if (result.has_error()) { return result.error(); } @@ -207,7 +212,7 @@ auto Deserializer::deserialize_next_ir_unit(ReaderInterface& read } auto const node_locator{result.value()}; - if (m_schema_tree->has_node(node_locator)) { + if (m_user_gen_keys_schema_tree->has_node(node_locator)) { return std::errc::protocol_error; } @@ -217,7 +222,7 @@ auto Deserializer::deserialize_next_ir_unit(ReaderInterface& read return ir_error_code_to_errc(err); } - std::ignore = m_schema_tree->insert_node(node_locator); + std::ignore = m_user_gen_keys_schema_tree->insert_node(node_locator); break; } diff --git a/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp 
b/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp index 5e1813a3e..cea4a1b84 100644 --- a/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp +++ b/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp @@ -551,7 +551,8 @@ auto deserialize_ir_unit_utc_offset_change(ReaderInterface& reader auto deserialize_ir_unit_kv_pair_log_event( ReaderInterface& reader, encoded_tag_t tag, - std::shared_ptr schema_tree, + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, UtcOffset utc_offset ) -> OUTCOME_V2_NAMESPACE::std_result { auto const schema_result{deserialize_schema(reader, tag)}; @@ -579,7 +580,9 @@ auto deserialize_ir_unit_kv_pair_log_event( } return KeyValuePairLogEvent::create( - std::move(schema_tree), + std::move(auto_gen_keys_schema_tree), + std::move(user_gen_keys_schema_tree), + {}, std::move(node_id_value_pairs), utc_offset ); diff --git a/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp b/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp index 68ed4408b..451f627db 100644 --- a/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp +++ b/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp @@ -57,10 +57,12 @@ namespace clp::ffi::ir_stream { * Deserializes a key-value pair log event IR unit. * @param reader * @param tag - * @param schema_tree Schema tree used to construct the KV-pair log event. + * @param auto_gen_keys_schema_tree Schema tree for auto-generated keys, used to construct the + * KV-pair log event. + * @param user_gen_keys_schema_tree Schema tree for user-generated keys, used to construct the + * KV-pair log event. * @param utc_offset UTC offset used to construct the KV-pair log event. 
- * @return A result containing the deserialized log event or an error code indicating the - * failure: + * @return A result containing the deserialized log event or an error code indicating the failure: * - std::errc::result_out_of_range if the IR stream is truncated. * - std::errc::protocol_error if the IR stream is corrupted. * - std::errc::protocol_not_supported if the IR stream contains an unsupported metadata format @@ -72,7 +74,8 @@ namespace clp::ffi::ir_stream { [[nodiscard]] auto deserialize_ir_unit_kv_pair_log_event( ReaderInterface& reader, encoded_tag_t tag, - std::shared_ptr schema_tree, + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, UtcOffset utc_offset ) -> OUTCOME_V2_NAMESPACE::std_result; } // namespace clp::ffi::ir_stream diff --git a/components/core/tests/test-ffi_IrUnitHandlerInterface.cpp b/components/core/tests/test-ffi_IrUnitHandlerInterface.cpp index 5b8ad82cd..8f76a2f1a 100644 --- a/components/core/tests/test-ffi_IrUnitHandlerInterface.cpp +++ b/components/core/tests/test-ffi_IrUnitHandlerInterface.cpp @@ -87,9 +87,13 @@ auto test_ir_unit_handler_interface(clp::ffi::ir_stream::IrUnitHandlerInterface auto test_ir_unit_handler_interface(clp::ffi::ir_stream::IrUnitHandlerInterface auto& handler ) -> void { - auto test_log_event_result{ - KeyValuePairLogEvent::create(std::make_shared(), {}, cTestUtcOffset) - }; + auto test_log_event_result{KeyValuePairLogEvent::create( + std::make_shared(), + std::make_shared(), + {}, + {}, + cTestUtcOffset + )}; REQUIRE( (false == test_log_event_result.has_error() && IRErrorCode::IRErrorCode_Success @@ -127,7 +131,7 @@ TEMPLATE_TEST_CASE( REQUIRE( (optional_log_event.has_value() && optional_log_event.value().get_utc_offset() == cTestUtcOffset - && optional_log_event.value().get_node_id_value_pairs().empty()) + && optional_log_event.value().get_user_gen_node_id_value_pairs().empty()) ); auto const& optional_schema_tree_locator{handler.get_schema_tree_node_locator()}; 
REQUIRE( diff --git a/components/core/tests/test-ffi_KeyValuePairLogEvent.cpp b/components/core/tests/test-ffi_KeyValuePairLogEvent.cpp index 2e9cfb691..9ffee4f68 100644 --- a/components/core/tests/test-ffi_KeyValuePairLogEvent.cpp +++ b/components/core/tests/test-ffi_KeyValuePairLogEvent.cpp @@ -11,6 +11,7 @@ #include #include +#include #include "../src/clp/ffi/encoding_methods.hpp" #include "../src/clp/ffi/KeyValuePairLogEvent.hpp" @@ -81,6 +82,25 @@ auto insert_invalid_node_id_value_pairs_with_node_type_errors( KeyValuePairLogEvent::NodeIdValuePairs& invalid_node_id_value_pairs ) -> void; +/** + * Asserts that `KeyValuePairLogEvent` creation fails with the expected error code. + * @param auto_gen_keys_schema_tree + * @param user_gen_keys_schema_tree + * @param auto_gen_node_id_value_pairs + * @param user_gen_node_id_value_pairs + * @param utc_offset + * @param expected_error_code + * @return Whether the assertion succeeded. + */ +[[nodiscard]] auto assert_kv_pair_log_event_creation_failure( + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + KeyValuePairLogEvent::NodeIdValuePairs auto_gen_node_id_value_pairs, + KeyValuePairLogEvent::NodeIdValuePairs user_gen_node_id_value_pairs, + UtcOffset utc_offset, + std::errc expected_error_code +) -> bool; + template requires(std::is_same_v || std::is_same_v) @@ -197,6 +217,24 @@ auto insert_invalid_node_id_value_pairs_with_node_type_errors( invalid_node_id_value_pairs.emplace(node_id, Value{}); } } + +auto assert_kv_pair_log_event_creation_failure( + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + KeyValuePairLogEvent::NodeIdValuePairs auto_gen_node_id_value_pairs, + KeyValuePairLogEvent::NodeIdValuePairs user_gen_node_id_value_pairs, + UtcOffset utc_offset, + std::errc expected_error_code +) -> bool { + auto const result{KeyValuePairLogEvent::create( + std::move(auto_gen_keys_schema_tree), + std::move(user_gen_keys_schema_tree), + 
std::move(auto_gen_node_id_value_pairs), + std::move(user_gen_node_id_value_pairs), + utc_offset + )}; + return result.has_error() && result.error() == expected_error_code; +} } // namespace TEST_CASE("ffi_Value_basic", "[ffi][Value]") { @@ -250,22 +288,23 @@ TEST_CASE("ffi_KeyValuePairLogEvent_create", "[ffi]") { * | * |------------> <1:a:Obj> * | | - * |--> <2:a:Int> |--> <3:b:Obj> - * | - * |------------> <4:c:Obj> - * | | - * |--> <5:d:Str> |--> <7:a:UnstructuredArray> - * | | - * |--> <6:d:Bool> |--> <8:d:Str> - * | | - * |--> <10:e:Obj> |--> <9:d:Float> - * | - * |--> <11:f:Obj> + * |--> <2:b:Int> |--> <3:b:Obj> + * | | | + * |--> <12:a:Int> | |------------> <4:c:Obj> + * | | | + * | |--> <5:d:Str> |--> <7:a:UnstructuredArray> + * | | | + * | |--> <6:d:Bool> |--> <8:d:Str> + * | | | + * | |--> <10:e:Obj> |--> <9:d:Float> + * | | + * |--> <13:b:Bool> |--> <11:f:Obj> */ - auto const schema_tree{std::make_shared()}; + auto const auto_gen_keys_schema_tree{std::make_shared()}; + auto const user_gen_keys_schema_tree{std::make_shared()}; std::vector const locators{ {SchemaTree::cRootId, "a", SchemaTree::Node::Type::Obj}, - {SchemaTree::cRootId, "a", SchemaTree::Node::Type::Int}, + {SchemaTree::cRootId, "b", SchemaTree::Node::Type::Int}, {1, "b", SchemaTree::Node::Type::Obj}, {3, "c", SchemaTree::Node::Type::Obj}, {3, "d", SchemaTree::Node::Type::Str}, @@ -274,63 +313,88 @@ TEST_CASE("ffi_KeyValuePairLogEvent_create", "[ffi]") { {4, "d", SchemaTree::Node::Type::Str}, {4, "d", SchemaTree::Node::Type::Float}, {3, "e", SchemaTree::Node::Type::Obj}, - {4, "f", SchemaTree::Node::Type::Obj} + {4, "f", SchemaTree::Node::Type::Obj}, + {SchemaTree::cRootId, "a", SchemaTree::Node::Type::Int}, + {1, "b", SchemaTree::Node::Type::Bool} }; for (auto const& locator : locators) { - REQUIRE_NOTHROW(schema_tree->insert_node(locator)); + REQUIRE_NOTHROW(auto_gen_keys_schema_tree->insert_node(locator)); + REQUIRE_NOTHROW(user_gen_keys_schema_tree->insert_node(locator)); } + 
REQUIRE((*auto_gen_keys_schema_tree == *user_gen_keys_schema_tree)); + SECTION("Test empty ID-value pairs") { - KeyValuePairLogEvent::NodeIdValuePairs node_id_value_pairs; auto const result{KeyValuePairLogEvent::create( - schema_tree, - std::move(node_id_value_pairs), + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + {}, + {}, UtcOffset{0} )}; REQUIRE_FALSE(result.has_error()); } + SECTION("Test schema tree pointers being null") { + REQUIRE(assert_kv_pair_log_event_creation_failure( + nullptr, + user_gen_keys_schema_tree, + {}, + {}, + UtcOffset{0}, + std::errc::invalid_argument + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + nullptr, + {}, + {}, + UtcOffset{0}, + std::errc::invalid_argument + )); + } + SECTION("Test mismatched types") { KeyValuePairLogEvent::NodeIdValuePairs invalid_node_id_value_pairs; // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) // Int: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 2, invalid_node_id_value_pairs ); // Float: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 9, invalid_node_id_value_pairs ); // Bool: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 6, invalid_node_id_value_pairs ); // Str: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 5, invalid_node_id_value_pairs ); // UnstructuredArray: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 7, invalid_node_id_value_pairs ); // Obj: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 3, invalid_node_id_value_pairs ); @@ -343,26 +407,37 @@ TEST_CASE("ffi_KeyValuePairLogEvent_create", "[ffi]") { } else { node_id_value_pair_to_test.emplace(node_id, std::nullopt); } - auto 
const result{KeyValuePairLogEvent::create( - schema_tree, - std::move(node_id_value_pair_to_test), - UtcOffset{0} - )}; - REQUIRE(result.has_error()); - auto const& err{result.error()}; - REQUIRE((std::errc::protocol_error == err)); + + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + node_id_value_pair_to_test, + {}, + UtcOffset{0}, + std::errc::protocol_error + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + {}, + node_id_value_pair_to_test, + UtcOffset{0}, + std::errc::protocol_error + )); } } SECTION("Test valid ID-value pairs") { - KeyValuePairLogEvent::NodeIdValuePairs node_id_value_pairs; + constexpr std::string_view cJsonArrayToEncode{"[\"a\", 1, 0.1, null]"}; + constexpr std::string_view cStaticText{"Test"}; + KeyValuePairLogEvent::NodeIdValuePairs valid_node_id_value_pairs; /* * The sub schema tree of `node_id_value_pairs`: * <0:root:Obj> * | * |------------> <1:a:Obj> * | | - * |--> <2:a:Int> |--> <3:b:Obj> + * |--> <2:b:Int> |--> <3:b:Obj> * | * |------------> <4:c:Obj> * | | @@ -375,77 +450,206 @@ TEST_CASE("ffi_KeyValuePairLogEvent_create", "[ffi]") { * |--> <11:f:Obj> */ // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - node_id_value_pairs.emplace(2, Value{static_cast(0)}); - node_id_value_pairs.emplace(5, Value{string{"Test"}}); - node_id_value_pairs.emplace( + valid_node_id_value_pairs.emplace(2, Value{static_cast(0)}); + valid_node_id_value_pairs.emplace(5, Value{string{cStaticText}}); + valid_node_id_value_pairs.emplace( 8, Value{get_encoded_text_ast(cStringToEncode)} ); - node_id_value_pairs.emplace( + valid_node_id_value_pairs.emplace( 7, - Value{get_encoded_text_ast(cStringToEncode)} + Value{get_encoded_text_ast(cJsonArrayToEncode)} ); - node_id_value_pairs.emplace(10, Value{}); - node_id_value_pairs.emplace(11, std::nullopt); + valid_node_id_value_pairs.emplace(10, Value{}); + 
valid_node_id_value_pairs.emplace(11, std::nullopt); // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) - }; + auto const result{KeyValuePairLogEvent::create( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0} + )}; REQUIRE_FALSE(result.has_error()); - SECTION("Test duplicated key conflict on node #3") { - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - node_id_value_pairs.emplace(6, Value{static_cast(false)}); - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) + SECTION("Test JSON serialization") { + nlohmann::json const subtree_rooted_at_node_4 + = {{"a", nlohmann::json::parse(cJsonArrayToEncode)}, + {"d", cStringToEncode}, + {"f", nlohmann::json::object_t()}}; + nlohmann::json const subtree_rooted_at_node_3 + = {{"c", subtree_rooted_at_node_4}, {"d", cStaticText}, {"e", nullptr}}; + nlohmann::json const expected = { + {"a", {{"b", subtree_rooted_at_node_3}}}, + {"b", 0}, }; - REQUIRE(result.has_error()); - REQUIRE((std::errc::protocol_not_supported == result.error())); + + auto const& kv_pair_log_event{result.value()}; + auto const serialized_json_result{kv_pair_log_event.serialize_to_json()}; + REQUIRE_FALSE(serialized_json_result.has_error()); + auto const& [serialized_auto_gen_kv_pairs, serialized_user_gen_kv_pairs]{ + serialized_json_result.value() + }; + REQUIRE((serialized_auto_gen_kv_pairs == expected)); + REQUIRE((serialized_user_gen_kv_pairs == expected)); } - SECTION("Test duplicated key conflict on node #4") { + SECTION("Test duplicated key conflict under node #3") { + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - node_id_value_pairs.emplace(9, 
Value{static_cast(0.0)}); - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) - }; - REQUIRE(result.has_error()); - REQUIRE((std::errc::protocol_not_supported == result.error())); + invalid_node_id_value_pairs.emplace(6, Value{static_cast(false)}); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + } + + SECTION("Test duplicated key conflict under node #4") { + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + invalid_node_id_value_pairs.emplace(9, Value{static_cast(0.0)}); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + } + + SECTION("Test duplicated keys among siblings of node #1") { + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + invalid_node_id_value_pairs.emplace(12, static_cast(0)); + // Node #12 has the same key as its sibling node #1 + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + 
UtcOffset{0}, + std::errc::protocol_not_supported + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + } + + SECTION("Test duplicated keys among siblings of node #3") { + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + invalid_node_id_value_pairs.emplace(13, false); + // Node #13 has the same key as its sibling node #3 + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); } SECTION("Test invalid sub-tree on node #3") { - node_id_value_pairs.emplace(3, std::nullopt); - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) - }; + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + invalid_node_id_value_pairs.emplace(3, std::nullopt); // Node #3 is empty, but its descendants appear in the sub schema tree (node #5 & #10) - REQUIRE(result.has_error()); - REQUIRE((std::errc::operation_not_permitted == result.error())); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::operation_not_permitted + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + 
std::errc::operation_not_permitted + )); } SECTION("Test invalid sub-tree on node #4") { - node_id_value_pairs.emplace(4, Value{}); - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) - }; + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + invalid_node_id_value_pairs.emplace(4, Value{}); // Node #4 is null, but its descendants appear in the sub schema tree (node #5 & #10) - REQUIRE(result.has_error()); - REQUIRE((std::errc::operation_not_permitted == result.error())); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::operation_not_permitted + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::operation_not_permitted + )); } } SECTION("Test out-of-bound node ID") { KeyValuePairLogEvent::NodeIdValuePairs node_id_value_pairs_out_of_bound; node_id_value_pairs_out_of_bound.emplace( - static_cast(schema_tree->get_size()), + static_cast(user_gen_keys_schema_tree->get_size()), Value{} ); - auto const out_of_bound_result{KeyValuePairLogEvent::create( - schema_tree, - std::move(node_id_value_pairs_out_of_bound), - UtcOffset{0} - )}; - REQUIRE(out_of_bound_result.has_error()); - REQUIRE((std::errc::operation_not_permitted == out_of_bound_result.error())); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + node_id_value_pairs_out_of_bound, + {}, + UtcOffset{0}, + std::errc::operation_not_permitted + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + {}, + node_id_value_pairs_out_of_bound, + UtcOffset{0}, + std::errc::operation_not_permitted + )); } } diff --git 
a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index 1ee1e3542..347dadb7a 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp +++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -1246,12 +1246,14 @@ TEMPLATE_TEST_CASE( auto const& deserialized_log_event{deserialized_log_events.at(idx)}; auto const num_leaves_in_json_obj{count_num_leaves(expect)}; - auto const num_kv_pairs{deserialized_log_event.get_node_id_value_pairs().size()}; + auto const num_kv_pairs{deserialized_log_event.get_user_gen_node_id_value_pairs().size()}; REQUIRE((num_leaves_in_json_obj == num_kv_pairs)); auto const serialized_json_result{deserialized_log_event.serialize_to_json()}; REQUIRE_FALSE(serialized_json_result.has_error()); - REQUIRE((expect == serialized_json_result.value())); + auto const& [auto_generated, user_generated]{serialized_json_result.value()}; + REQUIRE(auto_generated.empty()); + REQUIRE((expect == user_generated)); } auto const eof_result{deserializer.deserialize_next_ir_unit(reader)}; From 42db88c34e9336941cadaa212f1f30884fd6705c Mon Sep 17 00:00:00 2001 From: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:00:04 -0500 Subject: [PATCH 49/65] build(docs): Update dependencies to latest versions. 
(#631) --- docs/requirements.txt | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 84466dcae..dd8ca3593 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,10 +1,6 @@ -myst-parser>=2.0.0 -# Locked to avoid pydata/pydata-sphinx-theme#1676 until its fix is released in a version above -# 0.15.2 -pydata-sphinx-theme==0.14.4 -# Locked to avoid the following issue until a fix is released: -# https://github.com/sphinx-doc/sphinx/issues/13002 -sphinx==8.0.2 -sphinx_design>=0.5.0 +myst-parser>=4.0.0 +pydata-sphinx-theme>=0.16.0 +sphinx>=8.1.3 +sphinx_design>=0.6.1 sphinx-copybutton>=0.5.2 -sphinxcontrib-mermaid>=0.9.2 +sphinxcontrib-mermaid>=1.0.0 From 13c752801bf28427bfdcd9e8ab942a28e7dbbea5 Mon Sep 17 00:00:00 2001 From: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Thu, 12 Dec 2024 11:02:56 -0500 Subject: [PATCH 50/65] ci(pr-title-checks): Remove default GH workflow permissions and document risk of `pull_request_target` workflow trigger. (#633) --- .github/workflows/clp-pr-title-checks.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/clp-pr-title-checks.yaml b/.github/workflows/clp-pr-title-checks.yaml index 428e9f21d..1c8ced072 100644 --- a/.github/workflows/clp-pr-title-checks.yaml +++ b/.github/workflows/clp-pr-title-checks.yaml @@ -2,9 +2,16 @@ name: "clp-pr-title-checks" on: pull_request_target: + # NOTE: Workflows triggered by this event give the workflow access to secrets and grant the + # `GITHUB_TOKEN` read/write repository access by default. So we need to ensure: + # - This workflow doesn't inadvertently check out, build, or execute untrusted code from the + # pull request triggered by this event. + # - Each job has `permissions` set to only those necessary. 
types: ["edited", "opened", "reopened"] branches: ["main"] +permissions: {} + concurrency: group: "${{github.workflow}}-${{github.ref}}" From 8b34dac702ade914935cc5624982afba0e345efc Mon Sep 17 00:00:00 2001 From: Devin Gibson Date: Fri, 13 Dec 2024 15:40:43 -0500 Subject: [PATCH 51/65] feat(core-clp): Add `BoundedReader` to prevent out-of-bound reads in segmented input streams. (#624) Co-authored-by: haiqi96 <14502009+haiqi96@users.noreply.github.com> Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- components/core/CMakeLists.txt | 3 + components/core/src/clp/BoundedReader.cpp | 43 +++++++++ components/core/src/clp/BoundedReader.hpp | 89 ++++++++++++++++++ components/core/src/clp/StringReader.cpp | 4 + components/core/tests/test-BoundedReader.cpp | 99 ++++++++++++++++++++ 5 files changed, 238 insertions(+) create mode 100644 components/core/src/clp/BoundedReader.cpp create mode 100644 components/core/src/clp/BoundedReader.hpp create mode 100644 components/core/tests/test-BoundedReader.cpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index f15d14405..7509efebd 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -352,6 +352,8 @@ set(SOURCE_FILES_unitTest src/clp/aws/AwsAuthenticationSigner.cpp src/clp/aws/AwsAuthenticationSigner.hpp src/clp/aws/constants.hpp + src/clp/BoundedReader.cpp + src/clp/BoundedReader.hpp src/clp/BufferedFileReader.cpp src/clp/BufferedFileReader.hpp src/clp/BufferReader.cpp @@ -571,6 +573,7 @@ set(SOURCE_FILES_unitTest submodules/sqlite3/sqlite3ext.h tests/LogSuppressor.hpp tests/test-Array.cpp + tests/test-BoundedReader.cpp tests/test-BufferedFileReader.cpp tests/test-clp_s-end_to_end.cpp tests/test-EncodedVariableInterpreter.cpp diff --git a/components/core/src/clp/BoundedReader.cpp b/components/core/src/clp/BoundedReader.cpp new file mode 100644 index 000000000..9bca08f71 --- /dev/null +++ b/components/core/src/clp/BoundedReader.cpp @@ -0,0 
+1,43 @@ +#include "BoundedReader.hpp" + +#include + +#include "ErrorCode.hpp" + +namespace clp { +auto BoundedReader::try_seek_from_begin(size_t pos) -> ErrorCode { + auto const next_pos = pos > m_bound ? m_bound : pos; + if (auto const rc = m_reader->try_seek_from_begin(next_pos); ErrorCode_Success != rc) { + m_curr_pos = ErrorCode_EndOfFile == rc ? next_pos : m_curr_pos; + return rc; + } + m_curr_pos = next_pos; + if (m_curr_pos >= m_bound) { + return ErrorCode_EndOfFile; + } + return ErrorCode_Success; +} + +auto BoundedReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) + -> ErrorCode { + if (m_curr_pos == m_bound) { + num_bytes_read = 0; + return ErrorCode_EndOfFile; + } + + if ((m_curr_pos + num_bytes_to_read) > m_bound) { + num_bytes_to_read = m_bound - m_curr_pos; + } + + auto const rc = m_reader->try_read(buf, num_bytes_to_read, num_bytes_read); + m_curr_pos += num_bytes_read; + if (ErrorCode_EndOfFile == rc) { + if (0 == num_bytes_read) { + return ErrorCode_EndOfFile; + } + } else if (ErrorCode_Success != rc) { + return rc; + } + return ErrorCode_Success; +} +} // namespace clp diff --git a/components/core/src/clp/BoundedReader.hpp b/components/core/src/clp/BoundedReader.hpp new file mode 100644 index 000000000..cfcb07422 --- /dev/null +++ b/components/core/src/clp/BoundedReader.hpp @@ -0,0 +1,89 @@ +#ifndef CLP_BOUNDEDREADER_HPP +#define CLP_BOUNDEDREADER_HPP + +#include +#include + +#include "ErrorCode.hpp" +#include "ReaderInterface.hpp" + +namespace clp { +/** + * BoundedReader is a ReaderInterface designed to wrap other ReaderInterfaces and prevent users + * from reading or seeking beyond a certain point in the underlying input stream. + * + * This is useful when the underlying input stream is divided into several logical segments and we + * want to prevent a reader for an earlier segment consuming any bytes from a later segment. 
In + * particular, reading part of a later segment may force the reader for that later segment to seek + * backwards, which can be either inefficient or impossible for certain kinds of input streams. + */ +class BoundedReader : public ReaderInterface { +public: + // Constructor + explicit BoundedReader(ReaderInterface* reader, size_t bound) + : m_reader{reader}, + m_bound{bound} { + if (nullptr == m_reader) { + throw ReaderInterface::OperationFailed(ErrorCode_BadParam, __FILE__, __LINE__); + } + m_curr_pos = m_reader->get_pos(); + if (m_curr_pos > m_bound) { + throw ReaderInterface::OperationFailed(ErrorCode_BadParam, __FILE__, __LINE__); + } + } + + // Methods implementing the ReaderInterface + /** + * Tries to get the current position of the read head in the underlying reader. + * @param pos Returns the position of the underlying reader's head + * @return ErrorCode_Success on success + * @return ErrorCode_errno on failure + */ + [[nodiscard]] auto try_get_pos(size_t& pos) -> ErrorCode override { + return m_reader->try_get_pos(pos); + } + + /** + * Tries to seek to the given position, limited by the bound. + * @param pos + * @return ErrorCode_Success on success + * @return ErrorCode_EndOfFile on EOF or if trying to seek beyond the checkpoint + * @return ErrorCode_errno on failure + */ + [[nodiscard]] auto try_seek_from_begin(size_t pos) -> ErrorCode override; + + /** + * Tries to read up to a given number of bytes from the file, limited by the bound. 
+ * @param buf + * @param num_bytes_to_read The number of bytes to try and read + * @param num_bytes_read The actual number of bytes read + * @return ErrorCode_errno on error + * @return ErrorCode_EndOfFile on EOF or trying to read after hitting checkpoint + * @return ErrorCode_Success on success + */ + [[nodiscard]] auto + try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) -> ErrorCode override; + + /** + * This function is unsupported because BoundedReader can not delegate to a potentially + * efficient implementation in the underlying reader, as the underlying reader's implementation + * will not respect the bound. + * @return ErrorCode_Unsupported + */ + [[nodiscard]] auto try_read_to_delimiter( + [[maybe_unused]] char delim, + [[maybe_unused]] bool keep_delimiter, + [[maybe_unused]] bool append, + [[maybe_unused]] std::string& str + ) -> ErrorCode override { + return ErrorCode_Unsupported; + } + +private: + ReaderInterface* m_reader{nullptr}; + size_t m_bound{}; + size_t m_curr_pos{}; +}; +} // namespace clp + +#endif // CLP_BOUNDEDREADER_HPP diff --git a/components/core/src/clp/StringReader.cpp b/components/core/src/clp/StringReader.cpp index 9fa2c27d3..8dd0a3793 100644 --- a/components/core/src/clp/StringReader.cpp +++ b/components/core/src/clp/StringReader.cpp @@ -41,6 +41,10 @@ ErrorCode StringReader::try_read(char* buf, size_t num_bytes_to_read, size_t& nu } ErrorCode StringReader::try_seek_from_begin(size_t pos) { + if (pos > input_string.size()) { + this->pos = input_string.size(); + return ErrorCode_EndOfFile; + } this->pos = pos; return ErrorCode_Success; } diff --git a/components/core/tests/test-BoundedReader.cpp b/components/core/tests/test-BoundedReader.cpp new file mode 100644 index 000000000..9d1a9d2c0 --- /dev/null +++ b/components/core/tests/test-BoundedReader.cpp @@ -0,0 +1,99 @@ +#include +#include +#include +#include + +#include + +#include "../src/clp/BoundedReader.hpp" +#include "../src/clp/ErrorCode.hpp" +#include 
"../src/clp/StringReader.hpp" + +TEST_CASE("Test Bounded Reader", "[BoundedReader]") { + constexpr std::string_view cTestString{"0123456789"}; + + SECTION("BoundedReader does not support try_read_to_delimiter") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, cTestString.size()}; + std::string tmp; + REQUIRE(clp::ErrorCode_Unsupported + == bounded_reader.try_read_to_delimiter('0', false, false, tmp)); + } + + SECTION("BoundedReader does not allow reads beyond end of underlying stream.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, cTestString.size() + 1}; + std::array buf{}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_read(buf.data(), cTestString.size() + 1, num_bytes_read); + REQUIRE(clp::ErrorCode_Success == rc); + REQUIRE(num_bytes_read == cTestString.size()); + REQUIRE(cTestString.size() == string_reader.get_pos()); + REQUIRE(cTestString.size() == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does not allow reads beyond checkpoint.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, 1}; + std::array buf{}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_read(buf.data(), cTestString.size(), num_bytes_read); + REQUIRE(clp::ErrorCode_Success == rc); + REQUIRE(1 == num_bytes_read); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + rc = bounded_reader.try_read(buf.data(), 1, num_bytes_read); + REQUIRE(clp::ErrorCode_EndOfFile == rc); + REQUIRE(0 == num_bytes_read); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does allow reads before checkpoint.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader 
bounded_reader{&string_reader, 1}; + char buf{}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_read(&buf, 1, num_bytes_read); + REQUIRE(clp::ErrorCode_Success == rc); + REQUIRE(1 == num_bytes_read); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does not allow seeks beyond end of underlying stream.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, cTestString.size() + 1}; + auto rc = bounded_reader.try_seek_from_begin(cTestString.size() + 1); + REQUIRE(clp::ErrorCode_EndOfFile == rc); + REQUIRE(cTestString.size() == string_reader.get_pos()); + REQUIRE(cTestString.size() == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does not allow seeks beyond checkpoint.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, 1}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_seek_from_begin(cTestString.size()); + REQUIRE(clp::ErrorCode_EndOfFile == rc); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does allow seeks before checkpoint.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, 2}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_seek_from_begin(1); + REQUIRE(clp::ErrorCode_Success == rc); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + } +} From 02d8956db32ffcf2978ce85bd38c3b7522e583fe Mon Sep 17 00:00:00 2001 From: Devin Gibson Date: Mon, 16 Dec 2024 12:57:47 -0500 Subject: [PATCH 52/65] build(core): Update Boost to v1.87.0 in order to pull in boost::urls; Replace calls to boost::asio's deprecated `expires_from_now` with `expires_after`. 
(#636) --- components/core/CMakeLists.txt | 2 +- components/core/src/reducer/reducer_server.cpp | 7 +++---- .../centos-stream-9/install-packages-from-source.sh | 2 +- components/core/tools/scripts/lib_install/install-boost.sh | 2 +- .../ubuntu-focal/install-packages-from-source.sh | 2 +- .../ubuntu-jammy/install-packages-from-source.sh | 3 +++ .../lib_install/ubuntu-jammy/install-prebuilt-packages.sh | 3 --- 7 files changed, 10 insertions(+), 11 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 7509efebd..0995a0afb 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -101,7 +101,7 @@ endif() if(CLP_USE_STATIC_LIBS) set(Boost_USE_STATIC_LIBS ON) endif() -find_package(Boost 1.74 REQUIRED iostreams program_options filesystem system regex) +find_package(Boost 1.81 REQUIRED iostreams program_options filesystem system regex url) if(Boost_FOUND) message(STATUS "Found Boost ${Boost_VERSION}") else() diff --git a/components/core/src/reducer/reducer_server.cpp b/components/core/src/reducer/reducer_server.cpp index ab35b7396..a243c763c 100644 --- a/components/core/src/reducer/reducer_server.cpp +++ b/components/core/src/reducer/reducer_server.cpp @@ -121,7 +121,7 @@ void PeriodicUpsertTask::operator()([[maybe_unused]] boost::system::error_code c } auto& upsert_timer = m_server_ctx->get_upsert_timer(); - upsert_timer.expires_from_now(std::chrono::milliseconds(m_server_ctx->get_upsert_interval())); + upsert_timer.expires_after(std::chrono::milliseconds(m_server_ctx->get_upsert_interval())); upsert_timer.async_wait(PeriodicUpsertTask(m_server_ctx)); } @@ -205,9 +205,8 @@ void SchedulerUpdateListenerTask::operator()( if (m_server_ctx->is_timeline_aggregation()) { auto& upsert_timer = m_server_ctx->get_upsert_timer(); - upsert_timer.expires_from_now( - std::chrono::milliseconds(m_server_ctx->get_upsert_interval()) - ); + 
upsert_timer.expires_after(std::chrono::milliseconds(m_server_ctx->get_upsert_interval() + )); upsert_timer.async_wait(PeriodicUpsertTask(m_server_ctx)); } diff --git a/components/core/tools/scripts/lib_install/centos-stream-9/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/centos-stream-9/install-packages-from-source.sh index f2965f9fd..e6b6b3579 100755 --- a/components/core/tools/scripts/lib_install/centos-stream-9/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/centos-stream-9/install-packages-from-source.sh @@ -10,7 +10,7 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" lib_install_scripts_dir="${script_dir}/.." # NOTE: The remaining installation scripts depend on boost, so we install it beforehand. -"${lib_install_scripts_dir}/install-boost.sh" 1.76.0 +"${lib_install_scripts_dir}/install-boost.sh" 1.87.0 "${lib_install_scripts_dir}/fmtlib.sh" 8.0.1 "${lib_install_scripts_dir}/spdlog.sh" 1.9.2 diff --git a/components/core/tools/scripts/lib_install/install-boost.sh b/components/core/tools/scripts/lib_install/install-boost.sh index 9e5f9a1c5..40232caf8 100755 --- a/components/core/tools/scripts/lib_install/install-boost.sh +++ b/components/core/tools/scripts/lib_install/install-boost.sh @@ -34,7 +34,7 @@ tar xzf ${tar_filename} cd boost_${version_with_underscores} # Build -./bootstrap.sh --with-libraries=filesystem,iostreams,program_options,regex,system +./bootstrap.sh --with-libraries=filesystem,iostreams,program_options,regex,system,url ./b2 -j${num_cpus} # Install diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh index 10a2b0482..839f6d3c3 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh @@ 
-10,7 +10,7 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" lib_install_scripts_dir=$script_dir/.. # NOTE: boost must be installed first since the remaining packages depend on it -"$lib_install_scripts_dir"/install-boost.sh 1.74.0 +"$lib_install_scripts_dir"/install-boost.sh 1.87.0 "$lib_install_scripts_dir"/fmtlib.sh 8.0.1 "$lib_install_scripts_dir"/libarchive.sh 3.5.1 diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh index 97aaf7093..839f6d3c3 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh @@ -9,6 +9,9 @@ set -u script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" lib_install_scripts_dir=$script_dir/.. +# NOTE: boost must be installed first since the remaining packages depend on it +"$lib_install_scripts_dir"/install-boost.sh 1.87.0 + "$lib_install_scripts_dir"/fmtlib.sh 8.0.1 "$lib_install_scripts_dir"/libarchive.sh 3.5.1 "$lib_install_scripts_dir"/liblzma.sh 5.4.6 diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh index ca1f5f59e..ea055ffdf 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh @@ -15,9 +15,6 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ build-essential \ git \ jq \ - libboost-filesystem-dev \ - libboost-iostreams-dev \ - libboost-program-options-dev \ libcurl4 \ libcurl4-openssl-dev \ liblzma-dev \ From da66fbf833eff06aec0feb1c20706a9c766b7e2f Mon Sep 17 00:00:00 2001 From: Devin Gibson Date: Mon, 16 Dec 2024 13:00:39 
-0500 Subject: [PATCH 53/65] refactor(clp-s): Replace instances of `std::string const&` with `std::string_view` where it would remove unnecessary conversions to and from `std::string`. (#635) Co-authored-by: haiqi96 <14502009+haiqi96@users.noreply.github.com> --- components/core/src/clp_s/ArchiveWriter.hpp | 8 +++--- components/core/src/clp_s/ParsedMessage.hpp | 10 +++++++ .../src/clp_s/TimestampDictionaryWriter.cpp | 10 ++++--- .../src/clp_s/TimestampDictionaryWriter.hpp | 10 ++++--- components/core/src/clp_s/TimestampEntry.hpp | 3 +- .../core/src/clp_s/TimestampPattern.cpp | 28 +++++++++---------- .../core/src/clp_s/TimestampPattern.hpp | 6 ++-- .../core/src/clp_s/search/StringLiteral.hpp | 16 ++--------- 8 files changed, 49 insertions(+), 42 deletions(-) diff --git a/components/core/src/clp_s/ArchiveWriter.hpp b/components/core/src/clp_s/ArchiveWriter.hpp index 3b13f4426..82a0122bc 100644 --- a/components/core/src/clp_s/ArchiveWriter.hpp +++ b/components/core/src/clp_s/ArchiveWriter.hpp @@ -122,9 +122,9 @@ class ArchiveWriter { * @return the epoch time corresponding to the string timestamp */ epochtime_t ingest_timestamp_entry( - std::string const& key, + std::string_view key, int32_t node_id, - std::string const& timestamp, + std::string_view timestamp, uint64_t& pattern_id ) { return m_timestamp_dict.ingest_entry(key, node_id, timestamp, pattern_id); @@ -136,11 +136,11 @@ class ArchiveWriter { * @param node_id * @param timestamp */ - void ingest_timestamp_entry(std::string const& key, int32_t node_id, double timestamp) { + void ingest_timestamp_entry(std::string_view key, int32_t node_id, double timestamp) { m_timestamp_dict.ingest_entry(key, node_id, timestamp); } - void ingest_timestamp_entry(std::string const& key, int32_t node_id, int64_t timestamp) { + void ingest_timestamp_entry(std::string_view key, int32_t node_id, int64_t timestamp) { m_timestamp_dict.ingest_entry(key, node_id, timestamp); } diff --git a/components/core/src/clp_s/ParsedMessage.hpp 
b/components/core/src/clp_s/ParsedMessage.hpp index c843e2b7b..c1b6d7a35 100644 --- a/components/core/src/clp_s/ParsedMessage.hpp +++ b/components/core/src/clp_s/ParsedMessage.hpp @@ -1,8 +1,10 @@ #ifndef CLP_S_PARSEDMESSAGE_HPP #define CLP_S_PARSEDMESSAGE_HPP +#include #include #include +#include #include #include @@ -34,6 +36,10 @@ class ParsedMessage { m_message.emplace(node_id, value); } + inline void add_value(int32_t node_id, std::string_view value) { + m_message.emplace(node_id, std::string{value}); + } + /** * Adds a timestamp value and its encoding to the message for a given MST node ID. * @param node_id @@ -55,6 +61,10 @@ class ParsedMessage { m_unordered_message.emplace_back(value); } + inline void add_unordered_value(std::string_view value) { + m_unordered_message.emplace_back(std::string{value}); + } + /** * Clears the message */ diff --git a/components/core/src/clp_s/TimestampDictionaryWriter.cpp b/components/core/src/clp_s/TimestampDictionaryWriter.cpp index 39e66a6af..952bc36db 100644 --- a/components/core/src/clp_s/TimestampDictionaryWriter.cpp +++ b/components/core/src/clp_s/TimestampDictionaryWriter.cpp @@ -1,6 +1,8 @@ #include "TimestampDictionaryWriter.hpp" +#include #include +#include #include "Utils.hpp" @@ -42,9 +44,9 @@ uint64_t TimestampDictionaryWriter::get_pattern_id(TimestampPattern const* patte } epochtime_t TimestampDictionaryWriter::ingest_entry( - std::string const& key, + std::string_view key, int32_t node_id, - std::string const& timestamp, + std::string_view timestamp, uint64_t& pattern_id ) { epochtime_t ret; @@ -88,7 +90,7 @@ epochtime_t TimestampDictionaryWriter::ingest_entry( } void TimestampDictionaryWriter::ingest_entry( - std::string const& key, + std::string_view key, int32_t node_id, double timestamp ) { @@ -103,7 +105,7 @@ void TimestampDictionaryWriter::ingest_entry( } void TimestampDictionaryWriter::ingest_entry( - std::string const& key, + std::string_view key, int32_t node_id, int64_t timestamp ) { diff --git 
a/components/core/src/clp_s/TimestampDictionaryWriter.hpp b/components/core/src/clp_s/TimestampDictionaryWriter.hpp index 29288fd48..7c214a39e 100644 --- a/components/core/src/clp_s/TimestampDictionaryWriter.hpp +++ b/components/core/src/clp_s/TimestampDictionaryWriter.hpp @@ -1,9 +1,11 @@ #ifndef CLP_S_TIMESTAMPDICTIONARYWRITER_HPP #define CLP_S_TIMESTAMPDICTIONARYWRITER_HPP +#include #include #include #include +#include #include #include @@ -47,9 +49,9 @@ class TimestampDictionaryWriter { * @return the epoch time corresponding to the string timestamp */ epochtime_t ingest_entry( - std::string const& key, + std::string_view key, int32_t node_id, - std::string const& timestamp, + std::string_view timestamp, uint64_t& pattern_id ); @@ -59,9 +61,9 @@ class TimestampDictionaryWriter { * @param node_id * @param timestamp */ - void ingest_entry(std::string const& key, int32_t node_id, double timestamp); + void ingest_entry(std::string_view key, int32_t node_id, double timestamp); - void ingest_entry(std::string const& key, int32_t node_id, int64_t timestamp); + void ingest_entry(std::string_view key, int32_t node_id, int64_t timestamp); /** * TODO: guarantee epoch milliseconds. 
The current clp-s approach to encoding timestamps and diff --git a/components/core/src/clp_s/TimestampEntry.hpp b/components/core/src/clp_s/TimestampEntry.hpp index 326ed9d73..47a26fd9e 100644 --- a/components/core/src/clp_s/TimestampEntry.hpp +++ b/components/core/src/clp_s/TimestampEntry.hpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -43,7 +44,7 @@ class TimestampEntry { m_epoch_start(cEpochTimeMax), m_epoch_end(cEpochTimeMin) {} - TimestampEntry(std::string const& key_name) + TimestampEntry(std::string_view key_name) : m_encoding(UnkownTimestampEncoding), m_epoch_start_double(cDoubleEpochTimeMax), m_epoch_end_double(cDoubleEpochTimeMin), diff --git a/components/core/src/clp_s/TimestampPattern.cpp b/components/core/src/clp_s/TimestampPattern.cpp index 4ddb5648e..11fab3480 100644 --- a/components/core/src/clp_s/TimestampPattern.cpp +++ b/components/core/src/clp_s/TimestampPattern.cpp @@ -4,6 +4,8 @@ #include #include +#include +#include #include #include @@ -12,6 +14,7 @@ using clp::string_utils::convert_string_to_int; using std::string; +using std::string_view; using std::to_string; using std::vector; @@ -71,7 +74,7 @@ append_padded_value_notz(int value, char padding_character, size_t max_length, s * @return true if conversion succeeds, false otherwise */ static bool convert_string_to_number( - string const& str, + string_view str, size_t begin_ix, size_t end_ix, char padding_character, @@ -89,7 +92,7 @@ static bool convert_string_to_number( * @return true if conversion succeeds, false otherwise */ static bool convert_string_to_number_notz( - string const& str, + string_view str, size_t max_digits, size_t begin_ix, size_t& end_ix, @@ -125,7 +128,7 @@ append_padded_value_notz(int value, char padding_character, size_t max_length, s } static bool convert_string_to_number( - string const& str, + string_view str, size_t begin_ix, size_t end_ix, char padding_character, @@ -154,7 +157,7 @@ static bool convert_string_to_number( } static bool 
convert_string_to_number_notz( - string const& str, + string_view str, size_t max_digits, size_t begin_ix, size_t& end_ix, @@ -306,7 +309,7 @@ void TimestampPattern::init() { } TimestampPattern const* TimestampPattern::search_known_ts_patterns( - string const& line, + string_view line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos @@ -342,7 +345,7 @@ void TimestampPattern::clear() { } bool TimestampPattern::parse_timestamp( - string const& line, + string_view line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos @@ -827,23 +830,20 @@ bool TimestampPattern::parse_timestamp( } auto dot_position = line.find('.'); auto nanosecond_start = dot_position + 1; - if (std::string::npos == dot_position || 0 == dot_position + if (string::npos == dot_position || 0 == dot_position || cNanosecondDigits != (line.length() - nanosecond_start)) { return false; } - auto timestamp_view = std::string_view(line); - if (false - == convert_string_to_int(timestamp_view.substr(0, dot_position), timestamp)) - { + if (false == convert_string_to_int(line.substr(0, dot_position), timestamp)) { return false; } epochtime_t timestamp_nanoseconds; if (false == convert_string_to_int( - timestamp_view.substr(nanosecond_start, cNanosecondDigits), + line.substr(nanosecond_start, cNanosecondDigits), timestamp_nanoseconds )) { @@ -1070,14 +1070,14 @@ void TimestampPattern::insert_formatted_timestamp(epochtime_t timestamp, string& case 'E': // UNIX epoch milliseconds // Note: this timestamp format is required to make up the entire timestamp, so // this is safe - new_msg = std::to_string(timestamp); + new_msg = to_string(timestamp); break; case 'F': { // Nanosecond precision floating point UNIX epoch timestamp constexpr auto cNanosecondDigits = 9; // Note: this timestamp format is required to make up the entire timestamp, so // this is safe - new_msg = std::to_string(timestamp); + new_msg = to_string(timestamp); new_msg.insert(new_msg.end() - 
cNanosecondDigits, '.'); break; } diff --git a/components/core/src/clp_s/TimestampPattern.hpp b/components/core/src/clp_s/TimestampPattern.hpp index 9219d33bb..278bb82e1 100644 --- a/components/core/src/clp_s/TimestampPattern.hpp +++ b/components/core/src/clp_s/TimestampPattern.hpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include "Defs.hpp" @@ -83,7 +85,7 @@ class TimestampPattern { * @return pointer to the timestamp pattern if found, nullptr otherwise */ static TimestampPattern const* search_known_ts_patterns( - std::string const& line, + std::string_view line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos @@ -121,7 +123,7 @@ class TimestampPattern { * @return true if parsed successfully, false otherwise */ bool parse_timestamp( - std::string const& line, + std::string_view line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos diff --git a/components/core/src/clp_s/search/StringLiteral.hpp b/components/core/src/clp_s/search/StringLiteral.hpp index 4ac6b9f2f..67c902a29 100644 --- a/components/core/src/clp_s/search/StringLiteral.hpp +++ b/components/core/src/clp_s/search/StringLiteral.hpp @@ -4,6 +4,7 @@ #include #include +#include "../Utils.hpp" #include "Literal.hpp" namespace clp_s::search { @@ -68,19 +69,8 @@ class StringLiteral : public Literal { m_string_type = LiteralType::VarStringT; } - // If '?' and '*' are not escaped, we add LiteralType::ClpStringT to m_string_type - bool escape = false; - for (char const c : m_v) { - if ('\\' == c) { - escape = !escape; - } else if ('?' 
== c || '*' == c) { - if (false == escape) { - m_string_type |= LiteralType::ClpStringT; - break; - } - } else { - escape = false; - } + if (StringUtils::has_unescaped_wildcards(m_v)) { + m_string_type |= LiteralType::ClpStringT; } } }; From b7741c079b6f8dfe05343b055b6d84b5b019f19e Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Tue, 17 Dec 2024 02:10:21 +0800 Subject: [PATCH 54/65] docs(core): Indicate dependency install scripts should be run with elevated privileges. (#637) --- .../dev-guide/components-core/centos-stream-9-deps-install.md | 2 +- docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md | 2 +- docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/dev-guide/components-core/centos-stream-9-deps-install.md b/docs/src/dev-guide/components-core/centos-stream-9-deps-install.md index 654b9bf5a..1bc90910a 100644 --- a/docs/src/dev-guide/components-core/centos-stream-9-deps-install.md +++ b/docs/src/dev-guide/components-core/centos-stream-9-deps-install.md @@ -10,7 +10,7 @@ Before you run any commands below, you should review the scripts to ensure they any dependencies or apply any configurations that you don't expect. ::: -To install all dependencies, run: +To install all dependencies, run the following with elevated privileges: :::{note} The packages built from source ([install-packages-from-source.sh][src-install-script]) are installed diff --git a/docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md b/docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md index 53ee0ecbd..776c2d43e 100644 --- a/docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md +++ b/docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md @@ -10,7 +10,7 @@ Before you run any commands below, you should review the scripts to ensure they any dependencies or apply any configurations that you don't expect. 
::: -To install all dependencies, run: +To install all dependencies, run the following with elevated privileges: ```shell components/core/tools/scripts/lib_install/ubuntu-focal/install-all.sh diff --git a/docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md b/docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md index 186098446..2e5d4eb3c 100644 --- a/docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md +++ b/docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md @@ -10,7 +10,7 @@ Before you run any commands below, you should review the scripts to ensure they any dependencies or apply any configurations that you don't expect. ::: -To install all dependencies, run: +To install all dependencies, run the following with elevated privileges: ```shell components/core/tools/scripts/lib_install/ubuntu-jammy/install-all.sh From 1edc16e2f2165aafa856bdaf4d07d8a773fa8adf Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 20:23:36 -0500 Subject: [PATCH 55/65] feat(package)!: Add support for writing clp-s single file archives to S3. 
(#634) Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../clp_package_utils/general.py | 26 ++- .../clp_package_utils/scripts/decompress.py | 14 +- .../clp_package_utils/scripts/del_archives.py | 7 + .../scripts/native/decompress.py | 4 +- .../scripts/native/del_archives.py | 2 +- .../clp_package_utils/scripts/search.py | 6 + .../clp_package_utils/scripts/start_clp.py | 45 ++--- .../clp-py-utils/clp_py_utils/clp_config.py | 154 +++++++++++++++-- .../initialize-orchestration-db.py | 2 +- .../clp-py-utils/clp_py_utils/s3_utils.py | 51 ++++++ components/clp-py-utils/pyproject.toml | 2 + .../executor/compress/fs_compression_task.py | 163 ++++++++++++------ .../executor/query/extract_stream_task.py | 47 +++-- .../executor/query/fs_search_task.py | 39 +++-- .../job_orchestration/executor/query/utils.py | 5 +- .../job_orchestration/executor/utils.py | 23 +++ .../compress/compression_scheduler.py | 22 +-- .../package-template/src/etc/clp-config.yml | 4 +- 18 files changed, 470 insertions(+), 146 deletions(-) create mode 100644 components/clp-py-utils/clp_py_utils/s3_utils.py create mode 100644 components/job-orchestration/job_orchestration/executor/utils.py diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 5fae8166f..60f1053f8 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -20,7 +20,9 @@ REDIS_COMPONENT_NAME, REDUCER_COMPONENT_NAME, RESULTS_CACHE_COMPONENT_NAME, + StorageType, WEBUI_COMPONENT_NAME, + WorkerConfig, ) from clp_py_utils.core import ( get_config_value, @@ -239,17 +241,17 @@ def generate_container_config( DockerMountType.BIND, clp_config.logs_directory, container_clp_config.logs_directory ) - container_clp_config.archive_output.directory = pathlib.Path("/") / "mnt" / "archive-output" + 
container_clp_config.archive_output.set_directory(pathlib.Path("/") / "mnt" / "archive-output") if not is_path_already_mounted( clp_home, CONTAINER_CLP_HOME, - clp_config.archive_output.directory, - container_clp_config.archive_output.directory, + clp_config.archive_output.get_directory(), + container_clp_config.archive_output.get_directory(), ): docker_mounts.archives_output_dir = DockerMount( DockerMountType.BIND, - clp_config.archive_output.directory, - container_clp_config.archive_output.directory, + clp_config.archive_output.get_directory(), + container_clp_config.archive_output.get_directory(), ) container_clp_config.stream_output.directory = pathlib.Path("/") / "mnt" / "stream-output" @@ -268,6 +270,18 @@ def generate_container_config( return container_clp_config, docker_mounts +def generate_worker_config(clp_config: CLPConfig) -> WorkerConfig: + worker_config = WorkerConfig() + worker_config.package = clp_config.package.copy(deep=True) + worker_config.archive_output = clp_config.archive_output.copy(deep=True) + worker_config.data_directory = clp_config.data_directory + + worker_config.stream_output_dir = clp_config.stream_output.directory + worker_config.stream_collection_name = clp_config.results_cache.stream_collection_name + + return worker_config + + def dump_container_config( container_clp_config: CLPConfig, clp_config: CLPConfig, container_name: str ) -> Tuple[pathlib.Path, pathlib.Path]: @@ -482,7 +496,7 @@ def validate_results_cache_config( def validate_worker_config(clp_config: CLPConfig): clp_config.validate_input_logs_dir() - clp_config.validate_archive_output_dir() + clp_config.validate_archive_output_config() clp_config.validate_stream_output_dir() diff --git a/components/clp-package-utils/clp_package_utils/scripts/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/decompress.py index 325f2add6..092c339a6 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/decompress.py +++ 
b/components/clp-package-utils/clp_package_utils/scripts/decompress.py @@ -5,7 +5,7 @@ import sys from typing import Optional -from clp_py_utils.clp_config import CLPConfig +from clp_py_utils.clp_config import CLPConfig, StorageType from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, @@ -81,6 +81,11 @@ def handle_extract_file_cmd( if clp_config is None: return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"File extraction is not supported for archive storage type: {storage_type}.") + return -1 + container_name = generate_container_name(str(JobType.FILE_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( @@ -156,6 +161,13 @@ def handle_extract_stream_cmd( if clp_config is None: return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error( + f"Stream extraction is not supported for archive storage type: {storage_type}." 
+ ) + return -1 + container_name = generate_container_name(str(JobType.IR_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( diff --git a/components/clp-package-utils/clp_package_utils/scripts/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py index 54d959771..5b9bc6d97 100644 --- a/components/clp-package-utils/clp_package_utils/scripts/del_archives.py +++ b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py @@ -4,6 +4,8 @@ import sys from pathlib import Path +from clp_py_utils.clp_config import StorageType + from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, dump_container_config, @@ -57,6 +59,11 @@ def main(argv): logger.exception("Failed to load config.") return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"Archive deletion is not supported for storage type: {storage_type}.") + return -1 + # Validate the input timestamp begin_ts = parsed_args.begin_ts end_ts = parsed_args.end_ts diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py index d16cdcb6f..7e3c7da6e 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py @@ -167,7 +167,7 @@ def validate_and_load_config_file( """ try: clp_config = load_config_file(config_file_path, default_config_file_path, clp_home) - clp_config.validate_archive_output_dir() + clp_config.validate_archive_output_config() clp_config.validate_logs_dir() return clp_config except Exception: @@ -207,7 +207,7 @@ def handle_extract_file_cmd( list_path = parsed_args.files_from logs_dir = clp_config.logs_directory - archives_dir = 
clp_config.archive_output.directory + archives_dir = clp_config.archive_output.get_directory() # Generate database config file for clp db_config_file_path = logs_dir / f".decompress-db-config-{uuid.uuid4()}.yml" diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py index 735bf299d..c489c3806 100644 --- a/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py @@ -54,7 +54,7 @@ def main(argv): return -1 database_config = clp_config.database - archives_dir = clp_config.archive_output.directory + archives_dir = clp_config.archive_output.get_directory() if not archives_dir.exists(): logger.error("`archive_output.directory` doesn't exist.") return -1 diff --git a/components/clp-package-utils/clp_package_utils/scripts/search.py b/components/clp-package-utils/clp_package_utils/scripts/search.py index beb7fb0b0..38d528528 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/search.py @@ -7,6 +7,7 @@ import uuid import yaml +from clp_py_utils.clp_config import StorageType from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, @@ -74,6 +75,11 @@ def main(argv): logger.exception("Failed to load config.") return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"Search is not supported for archive storage type: {storage_type}.") + return -1 + container_name = generate_container_name(str(JobType.SEARCH)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 8097929f1..6de3174ff 100755 --- 
a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -29,6 +29,7 @@ REDIS_COMPONENT_NAME, REDUCER_COMPONENT_NAME, RESULTS_CACHE_COMPONENT_NAME, + StorageType, WEBUI_COMPONENT_NAME, ) from job_orchestration.scheduler.constants import QueueName @@ -42,6 +43,7 @@ DockerMount, DockerMountType, generate_container_config, + generate_worker_config, get_clp_home, is_container_exited, is_container_running, @@ -626,6 +628,7 @@ def start_compression_worker( ): celery_method = "job_orchestration.executor.compress" celery_route = f"{QueueName.COMPRESSION}" + compression_worker_mounts = [mounts.archives_output_dir] generic_start_worker( COMPRESSION_WORKER_COMPONENT_NAME, instance_id, @@ -637,8 +640,7 @@ def start_compression_worker( clp_config.redis.compression_backend_database, num_cpus, mounts, - None, - None, + compression_worker_mounts, ) @@ -652,11 +654,9 @@ def start_query_worker( celery_method = "job_orchestration.executor.query" celery_route = f"{QueueName.QUERY}" - query_worker_mount = [mounts.stream_output_dir] - query_worker_env = { - "CLP_STREAM_OUTPUT_DIR": container_clp_config.stream_output.directory, - "CLP_STREAM_COLLECTION_NAME": clp_config.results_cache.stream_collection_name, - } + query_worker_mounts = [mounts.stream_output_dir] + if clp_config.archive_output.storage.type == StorageType.FS: + query_worker_mounts.append(mounts.archives_output_dir) generic_start_worker( QUERY_WORKER_COMPONENT_NAME, @@ -669,8 +669,7 @@ def start_query_worker( clp_config.redis.query_backend_database, num_cpus, mounts, - query_worker_env, - query_worker_mount, + query_worker_mounts, ) @@ -685,8 +684,7 @@ def generic_start_worker( redis_database: int, num_cpus: int, mounts: CLPDockerMounts, - worker_specific_env: Dict[str, Any], - worker_specific_mount: List[Optional[DockerMount]], + worker_specific_mount: Optional[List[Optional[DockerMount]]], ): logger.info(f"Starting 
{component_name}...") @@ -694,14 +692,18 @@ def generic_start_worker( if container_exists(container_name): return - validate_worker_config(clp_config) + container_config_filename = f"{container_name}.yml" + container_config_file_path = clp_config.logs_directory / container_config_filename + container_worker_config = generate_worker_config(container_clp_config) + with open(container_config_file_path, "w") as f: + yaml.safe_dump(container_worker_config.dump_to_primitive_dict(), f) logs_dir = clp_config.logs_directory / component_name logs_dir.mkdir(parents=True, exist_ok=True) container_logs_dir = container_clp_config.logs_directory / component_name # Create necessary directories - clp_config.archive_output.directory.mkdir(parents=True, exist_ok=True) + clp_config.archive_output.get_directory().mkdir(parents=True, exist_ok=True) clp_config.stream_output.directory.mkdir(parents=True, exist_ok=True) clp_site_packages_dir = CONTAINER_CLP_HOME / "lib" / "python3" / "site-packages" @@ -724,24 +726,17 @@ def generic_start_worker( f"{container_clp_config.redis.host}:{container_clp_config.redis.port}/{redis_database}" ), "-e", f"CLP_HOME={CONTAINER_CLP_HOME}", - "-e", f"CLP_DATA_DIR={container_clp_config.data_directory}", - "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.directory}", + "-e", f"CLP_CONFIG_PATH={container_clp_config.logs_directory / container_config_filename}", "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", - "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", "-u", f"{os.getuid()}:{os.getgid()}", ] - if worker_specific_env: - for env_name, env_value in worker_specific_env.items(): - container_start_cmd.append("-e") - container_start_cmd.append(f"{env_name}={env_value}") - # fmt: on + necessary_mounts = [ mounts.clp_home, mounts.data_dir, mounts.logs_dir, - mounts.archives_output_dir, mounts.input_logs_dir, ] if worker_specific_mount: @@ -1125,6 +1120,12 @@ def main(argv): 
QUERY_WORKER_COMPONENT_NAME, ): validate_and_load_redis_credentials_file(clp_config, clp_home, True) + if target in ( + ALL_TARGET_NAME, + COMPRESSION_WORKER_COMPONENT_NAME, + QUERY_WORKER_COMPONENT_NAME, + ): + validate_worker_config(clp_config) clp_config.validate_data_dir() clp_config.validate_logs_dir() diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 79a94505d..f59de7647 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -1,10 +1,10 @@ import pathlib -import typing from enum import auto +from typing import Literal, Optional, Union from dotenv import dotenv_values from pydantic import BaseModel, PrivateAttr, validator -from strenum import KebabCaseStrEnum +from strenum import KebabCaseStrEnum, LowercaseStrEnum from .clp_logging import get_valid_logging_level, is_valid_logging_level from .core import ( @@ -48,6 +48,11 @@ class StorageEngine(KebabCaseStrEnum): CLP_S = auto() +class StorageType(LowercaseStrEnum): + FS = auto() + S3 = auto() + + VALID_STORAGE_ENGINES = [storage_engine.value for storage_engine in StorageEngine] @@ -69,12 +74,12 @@ class Database(BaseModel): host: str = "localhost" port: int = 3306 name: str = "clp-db" - ssl_cert: typing.Optional[str] = None + ssl_cert: Optional[str] = None auto_commit: bool = False compress: bool = True - username: typing.Optional[str] = None - password: typing.Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None @validator("type") def validate_database_type(cls, field): @@ -227,7 +232,7 @@ class Redis(BaseModel): query_backend_database: int = 0 compression_backend_database: int = 1 # redis can perform authentication without a username - password: typing.Optional[str] + password: Optional[str] @validator("host") def validate_host(cls, field): @@ -300,12 +305,80 @@ class Queue(BaseModel): host: str = "localhost" port: int = 5672 - 
username: typing.Optional[str] - password: typing.Optional[str] + username: Optional[str] + password: Optional[str] -class ArchiveOutput(BaseModel): +class S3Config(BaseModel): + region_code: str + bucket: str + key_prefix: str + + access_key_id: Optional[str] = None + secret_access_key: Optional[str] = None + + @validator("region_code") + def validate_region_code(cls, field): + if field == "": + raise ValueError("region_code cannot be empty") + return field + + @validator("bucket") + def validate_bucket(cls, field): + if field == "": + raise ValueError("bucket cannot be empty") + return field + + @validator("key_prefix") + def validate_key_prefix(cls, field): + if field == "": + raise ValueError("key_prefix cannot be empty") + if not field.endswith("/"): + raise ValueError('key_prefix must end with "/"') + return field + + +class FsStorage(BaseModel): + type: Literal[StorageType.FS.value] = StorageType.FS.value directory: pathlib.Path = pathlib.Path("var") / "data" / "archives" + + @validator("directory") + def validate_directory(cls, field): + if "" == field: + raise ValueError("directory cannot be empty") + return field + + def make_config_paths_absolute(self, clp_home: pathlib.Path): + self.directory = make_config_path_absolute(clp_home, self.directory) + + def dump_to_primitive_dict(self): + d = self.dict() + d["directory"] = str(d["directory"]) + return d + + +class S3Storage(BaseModel): + type: Literal[StorageType.S3.value] = StorageType.S3.value + staging_directory: pathlib.Path = pathlib.Path("var") / "data" / "staged_archives" + s3_config: S3Config + + @validator("staging_directory") + def validate_staging_directory(cls, field): + if "" == field: + raise ValueError("staging_directory cannot be empty") + return field + + def make_config_paths_absolute(self, clp_home: pathlib.Path): + self.staging_directory = make_config_path_absolute(clp_home, self.staging_directory) + + def dump_to_primitive_dict(self): + d = self.dict() + d["staging_directory"] = 
str(d["staging_directory"]) + return d + + +class ArchiveOutput(BaseModel): + storage: Union[FsStorage, S3Storage] = FsStorage() target_archive_size: int = 256 * 1024 * 1024 # 256 MB target_dictionaries_size: int = 32 * 1024 * 1024 # 32 MB target_encoded_file_size: int = 256 * 1024 * 1024 # 256 MB @@ -335,13 +408,30 @@ def validate_target_segment_size(cls, field): raise ValueError("target_segment_size must be greater than 0") return field - def make_config_paths_absolute(self, clp_home: pathlib.Path): - self.directory = make_config_path_absolute(clp_home, self.directory) + def set_directory(self, directory: pathlib.Path): + storage_config = self.storage + storage_type = storage_config.type + if StorageType.FS == storage_type: + storage_config.directory = directory + elif StorageType.S3 == storage_type: + storage_config.staging_directory = directory + else: + raise NotImplementedError(f"storage.type {storage_type} is not supported") + + def get_directory(self) -> pathlib.Path: + storage_config = self.storage + storage_type = storage_config.type + if StorageType.FS == storage_config.type: + return storage_config.directory + elif StorageType.S3 == storage_type: + return storage_config.staging_directory + else: + raise NotImplementedError(f"storage.type {storage_type} is not supported") def dump_to_primitive_dict(self): d = self.dict() # Turn directory (pathlib.Path) into a primitive string - d["directory"] = str(d["directory"]) + d["storage"] = self.storage.dump_to_primitive_dict() return d @@ -352,7 +442,7 @@ class StreamOutput(BaseModel): @validator("directory") def validate_directory(cls, field): if "" == field: - raise ValueError("directory can not be empty") + raise ValueError("directory cannot be empty") return field @validator("target_uncompressed_size") @@ -408,7 +498,7 @@ def validate_port(cls, field): class CLPConfig(BaseModel): - execution_container: typing.Optional[str] + execution_container: Optional[str] = None input_logs_directory: pathlib.Path = 
pathlib.Path("/") @@ -436,7 +526,7 @@ class CLPConfig(BaseModel): def make_config_paths_absolute(self, clp_home: pathlib.Path): self.input_logs_directory = make_config_path_absolute(clp_home, self.input_logs_directory) self.credentials_file_path = make_config_path_absolute(clp_home, self.credentials_file_path) - self.archive_output.make_config_paths_absolute(clp_home) + self.archive_output.storage.make_config_paths_absolute(clp_home) self.stream_output.make_config_paths_absolute(clp_home) self.data_directory = make_config_path_absolute(clp_home, self.data_directory) self.logs_directory = make_config_path_absolute(clp_home, self.logs_directory) @@ -451,11 +541,19 @@ def validate_input_logs_dir(self): if not input_logs_dir.is_dir(): raise ValueError(f"input_logs_directory '{input_logs_dir}' is not a directory.") - def validate_archive_output_dir(self): + def validate_archive_output_config(self): + if ( + StorageType.S3 == self.archive_output.storage.type + and StorageEngine.CLP_S != self.package.storage_engine + ): + raise ValueError( + f"archive_output.storage.type = 's3' is only supported with package.storage_engine" + f" = '{StorageEngine.CLP_S}'" + ) try: - validate_path_could_be_dir(self.archive_output.directory) + validate_path_could_be_dir(self.archive_output.get_directory()) except ValueError as ex: - raise ValueError(f"archive_output.directory is invalid: {ex}") + raise ValueError(f"archive_output.storage's directory is invalid: {ex}") def validate_stream_output_dir(self): try: @@ -537,3 +635,23 @@ def dump_to_primitive_dict(self): d["data_directory"] = str(self.data_directory) d["logs_directory"] = str(self.logs_directory) return d + + +class WorkerConfig(BaseModel): + package: Package = Package() + archive_output: ArchiveOutput = ArchiveOutput() + data_directory: pathlib.Path = CLPConfig().data_directory + + # Only needed by query workers. 
+ stream_output_dir: pathlib.Path = StreamOutput().directory + stream_collection_name: str = ResultsCache().stream_collection_name + + def dump_to_primitive_dict(self): + d = self.dict() + d["archive_output"] = self.archive_output.dump_to_primitive_dict() + + # Turn paths into primitive strings + d["data_directory"] = str(self.data_directory) + d["stream_output_dir"] = str(self.stream_output_dir) + + return d diff --git a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py index 1ed727367..2c8133e8a 100644 --- a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py +++ b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py @@ -52,7 +52,7 @@ def main(argv): CREATE TABLE IF NOT EXISTS `{COMPRESSION_JOBS_TABLE_NAME}` ( `id` INT NOT NULL AUTO_INCREMENT, `status` INT NOT NULL DEFAULT '{CompressionJobStatus.PENDING}', - `status_msg` VARCHAR(255) NOT NULL DEFAULT '', + `status_msg` VARCHAR(512) NOT NULL DEFAULT '', `creation_time` DATETIME(3) NOT NULL DEFAULT CURRENT_TIMESTAMP(3), `start_time` DATETIME(3) NULL DEFAULT NULL, `duration` FLOAT NULL DEFAULT NULL, diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py new file mode 100644 index 000000000..03717a445 --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -0,0 +1,51 @@ +from pathlib import Path + +import boto3 +from botocore.config import Config +from botocore.exceptions import ClientError +from result import Err, Ok, Result + +from clp_py_utils.clp_config import S3Config + + +def s3_put( + s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 +) -> Result[bool, str]: + """ + Uploads a local file to an S3 bucket using AWS's PutObject operation. + :param s3_config: S3 configuration specifying the upload destination and credentials. + :param src_file: Local file to upload. 
+ :param dest_file_name: The name for the uploaded file in the S3 bucket. + :param total_max_attempts: Maximum number of retry attempts for the upload. + :return: Result.OK(bool) on success, or Result.Err(str) with the error message otherwise. + """ + if not src_file.exists(): + return Err(f"{src_file} doesn't exist") + if not src_file.is_file(): + return Err(f"{src_file} is not a file") + if src_file.stat().st_size > 5 * 1024 * 1024 * 1024: + return Err(f"{src_file} is larger than the limit (5GiB) for a single PutObject operation.") + + config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) + + my_s3_client = boto3.client( + "s3", + region_name=s3_config.region_code, + aws_access_key_id=s3_config.access_key_id, + aws_secret_access_key=s3_config.secret_access_key, + config=config, + ) + + with open(src_file, "rb") as file_data: + try: + my_s3_client.put_object( + Bucket=s3_config.bucket, Body=file_data, Key=s3_config.key_prefix + dest_file_name + ) + except ClientError as e: + error_code = e.response["Error"]["Code"] + error_message = e.response["Error"]["Message"] + return Err(f"ClientError: {error_code} - {error_message}") + except Exception as e: + return Err(f"An unexpected error occurred: {e}") + + return Ok(True) diff --git a/components/clp-py-utils/pyproject.toml b/components/clp-py-utils/pyproject.toml index 4e827b926..6d68ceebe 100644 --- a/components/clp-py-utils/pyproject.toml +++ b/components/clp-py-utils/pyproject.toml @@ -10,6 +10,7 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.8 || ^3.10" +boto3 = "^1.35.81" # mariadb version must be compatible with libmariadev installed in runtime env. 
# See https://mariadb.com/docs/server/connect/programming-languages/python/install/#Dependencies mariadb = "~1.0.11" @@ -19,6 +20,7 @@ python-dotenv = "^1.0.1" python-Levenshtein = "~0.22" sqlalchemy = "~2.0" PyYAML = "^6.0.1" +result = "^0.17.0" StrEnum = "^0.4.15" [build-system] diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index ce88ad185..a5dbc0e35 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -4,6 +4,7 @@ import pathlib import subprocess from contextlib import closing +from typing import Any, Dict, Optional import yaml from celery.app.task import Task @@ -12,9 +13,14 @@ COMPRESSION_JOBS_TABLE_NAME, COMPRESSION_TASKS_TABLE_NAME, Database, + S3Config, StorageEngine, + StorageType, + WorkerConfig, ) from clp_py_utils.clp_logging import set_logging_level +from clp_py_utils.core import read_yaml_config_file +from clp_py_utils.s3_utils import s3_put from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.compress.celery import app from job_orchestration.scheduler.constants import CompressionTaskStatus @@ -108,6 +114,7 @@ def make_clp_s_command( archive_output_dir: pathlib.Path, clp_config: ClpIoConfig, db_config_file_path: pathlib.Path, + enable_s3_write: bool, ): # fmt: off compression_cmd = [ @@ -120,6 +127,9 @@ def make_clp_s_command( ] # fmt: on + if enable_s3_write: + compression_cmd.append("--single-file-archive") + if clp_config.input.timestamp_key is not None: compression_cmd.append("--timestamp-key") compression_cmd.append(clp_config.input.timestamp_key) @@ -128,10 +138,9 @@ def make_clp_s_command( def run_clp( + worker_config: WorkerConfig, clp_config: ClpIoConfig, clp_home: pathlib.Path, - data_dir: pathlib.Path, - archive_output_dir: 
pathlib.Path, logs_dir: pathlib.Path, job_id: int, task_id: int, @@ -143,10 +152,9 @@ def run_clp( """ Compresses files from an FS into archives on an FS + :param worker_config: WorkerConfig :param clp_config: ClpIoConfig :param clp_home: - :param data_dir: - :param archive_output_dir: :param logs_dir: :param job_id: :param task_id: @@ -156,16 +164,31 @@ def run_clp( :param clp_metadata_db_connection_config :return: tuple -- (whether compression was successful, output messages) """ - clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) - instance_id_str = f"compression-job-{job_id}-task-{task_id}" + clp_storage_engine = worker_config.package.storage_engine + data_dir = worker_config.data_directory + archive_output_dir = worker_config.archive_output.get_directory() + # Generate database config file for clp db_config_file_path = data_dir / f"{instance_id_str}-db-config.yml" db_config_file = open(db_config_file_path, "w") yaml.safe_dump(clp_metadata_db_connection_config, db_config_file) db_config_file.close() + # Get s3 config + s3_config: S3Config + enable_s3_write = False + storage_type = worker_config.archive_output.storage.type + if StorageType.S3 == storage_type: + if StorageEngine.CLP_S != clp_storage_engine: + error_msg = f"S3 storage is not supported for storage engine: {clp_storage_engine}." 
+ logger.error(error_msg) + return False, {"error_message": error_msg} + + s3_config = worker_config.archive_output.storage.s3_config + enable_s3_write = True + if StorageEngine.CLP == clp_storage_engine: compression_cmd = make_clp_command( clp_home=clp_home, @@ -179,6 +202,7 @@ def run_clp( archive_output_dir=archive_output_dir, clp_config=clp_config, db_config_file_path=db_config_file_path, + enable_s3_write=enable_s3_write, ) else: logger.error(f"Unsupported storage engine {clp_storage_engine}") @@ -212,48 +236,65 @@ def run_clp( # Compute the total amount of data compressed last_archive_stats = None + last_line_decoded = False total_uncompressed_size = 0 total_compressed_size = 0 - while True: + + # Handle job metadata update and s3 write if enabled + s3_error = None + while not last_line_decoded: line = proc.stdout.readline() - if not line: - break - stats = json.loads(line.decode("ascii")) - if last_archive_stats is not None and stats["id"] != last_archive_stats["id"]: - # We've started a new archive so add the previous archive's last - # reported size to the total - total_uncompressed_size += last_archive_stats["uncompressed_size"] - total_compressed_size += last_archive_stats["size"] - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - update_job_metadata_and_tags( - db_cursor, - job_id, - clp_metadata_db_connection_config["table_prefix"], - tag_ids, - last_archive_stats, - ) - db_conn.commit() + stats: Optional[Dict[str, Any]] = None + if "" == line: + # Skip empty lines that could be caused by potential errors in printing archive stats + continue + + if line is not None: + stats = json.loads(line.decode("ascii")) + else: + last_line_decoded = True + + if last_archive_stats is not None and ( + None is stats or stats["id"] != last_archive_stats["id"] + ): + if enable_s3_write: + archive_id = last_archive_stats["id"] + archive_path = archive_output_dir / archive_id + + if s3_error is 
None: + logger.info(f"Uploading archive {archive_id} to S3...") + result = s3_put(s3_config, archive_path, archive_id) + + if result.is_err(): + logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") + s3_error = result.err_value + # NOTE: It's possible `proc` finishes before we call `terminate` on it, in + # which case the process will still return success. + proc.terminate() + else: + logger.info(f"Finished uploading archive {archive_id} to S3.") + + archive_path.unlink() + + if s3_error is None: + # We've started a new archive so add the previous archive's last reported size to + # the total + total_uncompressed_size += last_archive_stats["uncompressed_size"] + total_compressed_size += last_archive_stats["size"] + with closing(sql_adapter.create_connection(True)) as db_conn, closing( + db_conn.cursor(dictionary=True) + ) as db_cursor: + update_job_metadata_and_tags( + db_cursor, + job_id, + clp_metadata_db_connection_config["table_prefix"], + tag_ids, + last_archive_stats, + ) + db_conn.commit() last_archive_stats = stats - if last_archive_stats is not None: - # Add the last archive's last reported size - total_uncompressed_size += last_archive_stats["uncompressed_size"] - total_compressed_size += last_archive_stats["size"] - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - update_job_metadata_and_tags( - db_cursor, - job_id, - clp_metadata_db_connection_config["table_prefix"], - tag_ids, - last_archive_stats, - ) - db_conn.commit() - # Wait for compression to finish return_code = proc.wait() if 0 != return_code: @@ -274,10 +315,16 @@ def run_clp( "total_uncompressed_size": total_uncompressed_size, "total_compressed_size": total_compressed_size, } - if compression_successful: + + if compression_successful and s3_error is None: return CompressionTaskStatus.SUCCEEDED, worker_output else: - worker_output["error_message"] = f"See logs {stderr_log_path}" + error_msgs = [] 
+ if compression_successful is False: + error_msgs.append(f"See logs {stderr_log_path}") + if s3_error is not None: + error_msgs.append(s3_error) + worker_output["error_message"] = "\n".join(error_msgs) return CompressionTaskStatus.FAILED, worker_output @@ -291,15 +338,28 @@ def compress( paths_to_compress_json: str, clp_metadata_db_connection_config, ): - clp_home_str = os.getenv("CLP_HOME") - data_dir_str = os.getenv("CLP_DATA_DIR") - archive_output_dir_str = os.getenv("CLP_ARCHIVE_OUTPUT_DIR") - logs_dir_str = os.getenv("CLP_LOGS_DIR") + clp_home = pathlib.Path(os.getenv("CLP_HOME")) # Set logging level + logs_dir = pathlib.Path(os.getenv("CLP_LOGS_DIR")) clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) set_logging_level(logger, clp_logging_level) + # Load configuration + try: + worker_config = WorkerConfig.parse_obj( + read_yaml_config_file(pathlib.Path(os.getenv("CLP_CONFIG_PATH"))) + ) + except Exception as ex: + error_msg = "Failed to load worker config" + logger.exception(error_msg) + return CompressionTaskResult( + task_id=task_id, + status=CompressionTaskStatus.FAILED, + duration=0, + error_message=error_msg, + ) + clp_io_config = ClpIoConfig.parse_raw(clp_io_config_json) paths_to_compress = PathsToCompress.parse_raw(paths_to_compress_json) @@ -308,11 +368,10 @@ def compress( start_time = datetime.datetime.now() logger.info(f"[job_id={job_id} task_id={task_id}] COMPRESSION STARTED.") compression_task_status, worker_output = run_clp( + worker_config, clp_io_config, - pathlib.Path(clp_home_str), - pathlib.Path(data_dir_str), - pathlib.Path(archive_output_dir_str), - pathlib.Path(logs_dir_str), + clp_home, + logs_dir, job_id, task_id, tag_ids, diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py index 423ebb757..58ae43450 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py +++ 
b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py @@ -5,14 +5,15 @@ from celery.app.task import Task from celery.utils.log import get_task_logger -from clp_py_utils.clp_config import Database, StorageEngine +from clp_py_utils.clp_config import Database, StorageEngine, StorageType, WorkerConfig from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( - report_command_creation_failure, + report_task_failure, run_query_task, ) +from job_orchestration.executor.utils import load_worker_config from job_orchestration.scheduler.job_config import ExtractIrJobConfig, ExtractJsonJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus @@ -21,15 +22,17 @@ def make_command( - storage_engine: str, clp_home: Path, - archives_dir: Path, + worker_config: WorkerConfig, archive_id: str, - stream_output_dir: Path, job_config: dict, results_cache_uri: str, - stream_collection_name: str, ) -> Optional[List[str]]: + storage_engine = worker_config.package.storage_engine + archives_dir = worker_config.archive_output.get_directory() + stream_output_dir = worker_config.stream_output_dir + stream_collection_name = worker_config.stream_collection_name + if StorageEngine.CLP == storage_engine: logger.info("Starting IR extraction") extract_ir_config = ExtractIrJobConfig.parse_obj(job_config) @@ -97,28 +100,38 @@ def extract_stream( task_status: QueryTaskStatus sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) + # Load configuration + clp_config_path = Path(os.getenv("CLP_CONFIG_PATH")) + worker_config = load_worker_config(clp_config_path, logger) + if worker_config is None: + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + + if worker_config.archive_output.storage.type == StorageType.S3: + 
logger.error(f"Stream extraction is not supported for the S3 storage type") + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + # Make task_command clp_home = Path(os.getenv("CLP_HOME")) - archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) - clp_storage_engine = os.getenv("CLP_STORAGE_ENGINE") - stream_output_dir = Path(os.getenv("CLP_STREAM_OUTPUT_DIR")) - stream_collection_name = os.getenv("CLP_STREAM_COLLECTION_NAME") task_command = make_command( - storage_engine=clp_storage_engine, clp_home=clp_home, - archives_dir=archive_directory, + worker_config=worker_config, archive_id=archive_id, - stream_output_dir=stream_output_dir, job_config=job_config, results_cache_uri=results_cache_uri, - stream_collection_name=stream_collection_name, ) if not task_command: - return report_command_creation_failure( + logger.error(f"Error creating {task_name} command") + return report_task_failure( sql_adapter=sql_adapter, - logger=logger, - task_name=task_name, task_id=task_id, start_time=start_time, ) diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index 598bfdcfc..7cf7b330f 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -5,14 +5,15 @@ from celery.app.task import Task from celery.utils.log import get_task_logger -from clp_py_utils.clp_config import Database, StorageEngine +from clp_py_utils.clp_config import Database, StorageEngine, StorageType, WorkerConfig from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( - report_command_creation_failure, + report_task_failure, run_query_task, ) +from 
job_orchestration.executor.utils import load_worker_config
 from job_orchestration.scheduler.job_config import SearchJobConfig
 from job_orchestration.scheduler.scheduler_data import QueryTaskStatus
@@ -21,14 +22,16 @@
 def make_command(
-    storage_engine: str,
     clp_home: Path,
-    archives_dir: Path,
+    worker_config: WorkerConfig,
     archive_id: str,
     search_config: SearchJobConfig,
     results_cache_uri: str,
     results_collection: str,
 ) -> Optional[List[str]]:
+    storage_engine = worker_config.package.storage_engine
+    archives_dir = worker_config.archive_output.get_directory()
+
     if StorageEngine.CLP == storage_engine:
         command = [str(clp_home / "bin" / "clo"), "s", str(archives_dir / archive_id)]
         if search_config.path_filter is not None:
@@ -116,26 +119,40 @@ def search(
     task_status: QueryTaskStatus
     sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params))
+    # Load configuration
+    clp_config_path = Path(os.getenv("CLP_CONFIG_PATH"))
+    worker_config = load_worker_config(clp_config_path, logger)
+    if worker_config is None:
+        return report_task_failure(
+            sql_adapter=sql_adapter,
+            task_id=task_id,
+            start_time=start_time,
+        )
+
+    if worker_config.archive_output.storage.type == StorageType.S3:
+        logger.error("Search is not supported for the S3 storage type")
+        return report_task_failure(
+            sql_adapter=sql_adapter,
+            task_id=task_id,
+            start_time=start_time,
+        )
+
     # Make task_command
     clp_home = Path(os.getenv("CLP_HOME"))
-    archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR"))
-    clp_storage_engine = os.getenv("CLP_STORAGE_ENGINE")
     search_config = SearchJobConfig.parse_obj(job_config)
     task_command = make_command(
-        storage_engine=clp_storage_engine,
         clp_home=clp_home,
-        archives_dir=archive_directory,
+        worker_config=worker_config,
         archive_id=archive_id,
         search_config=search_config,
         results_cache_uri=results_cache_uri,
         results_collection=job_id,
     )
     if not task_command:
-        return report_command_creation_failure(
+        logger.error(f"Error creating {task_name} 
command") + return report_task_failure( sql_adapter=sql_adapter, - logger=logger, - task_name=task_name, task_id=task_id, start_time=start_time, ) diff --git a/components/job-orchestration/job_orchestration/executor/query/utils.py b/components/job-orchestration/job_orchestration/executor/query/utils.py index 69d22398e..523abbe00 100644 --- a/components/job-orchestration/job_orchestration/executor/query/utils.py +++ b/components/job-orchestration/job_orchestration/executor/query/utils.py @@ -19,14 +19,11 @@ def get_task_log_file_path(clp_logs_dir: Path, job_id: str, task_id: int) -> Pat return worker_logs_dir / f"{task_id}-clo.log" -def report_command_creation_failure( +def report_task_failure( sql_adapter: SQL_Adapter, - logger: Logger, - task_name: str, task_id: int, start_time: datetime.datetime, ): - logger.error(f"Error creating {task_name} command") task_status = QueryTaskStatus.FAILED update_query_task_metadata( sql_adapter, diff --git a/components/job-orchestration/job_orchestration/executor/utils.py b/components/job-orchestration/job_orchestration/executor/utils.py new file mode 100644 index 000000000..47ea702ae --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/utils.py @@ -0,0 +1,23 @@ +from logging import Logger +from pathlib import Path +from typing import Optional + +from clp_py_utils.clp_config import WorkerConfig +from clp_py_utils.core import read_yaml_config_file + + +def load_worker_config( + config_path: Path, + logger: Logger, +) -> Optional[WorkerConfig]: + """ + Loads a WorkerConfig object from the specified configuration file. + :param config_path: Path to the configuration file. + :param logger: Logger instance for reporting errors if loading fails. + :return: The loaded WorkerConfig object on success, None otherwise. 
+ """ + try: + return WorkerConfig.parse_obj(read_yaml_config_file(config_path)) + except Exception: + logger.exception("Failed to load worker config") + return None diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index 62b7a27fc..bd793686b 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -53,13 +53,14 @@ def update_compression_task_metadata(db_cursor, task_id, kv): logger.error("Must specify at least one field to update") raise ValueError - field_set_expressions = [f'{k}="{v}"' for k, v in kv.items()] + field_set_expressions = [f"{k} = %s" for k in kv.keys()] query = f""" - UPDATE {COMPRESSION_TASKS_TABLE_NAME} - SET {", ".join(field_set_expressions)} - WHERE id={task_id} + UPDATE {COMPRESSION_TASKS_TABLE_NAME} + SET {", ".join(field_set_expressions)} + WHERE id = %s """ - db_cursor.execute(query) + values = list(kv.values()) + [task_id] + db_cursor.execute(query, values) def update_compression_job_metadata(db_cursor, job_id, kv): @@ -67,13 +68,14 @@ def update_compression_job_metadata(db_cursor, job_id, kv): logger.error("Must specify at least one field to update") raise ValueError - field_set_expressions = [f'{k}="{v}"' for k, v in kv.items()] + field_set_expressions = [f"{k} = %s" for k in kv.keys()] query = f""" - UPDATE {COMPRESSION_JOBS_TABLE_NAME} - SET {", ".join(field_set_expressions)} - WHERE id={job_id} + UPDATE {COMPRESSION_JOBS_TABLE_NAME} + SET {", ".join(field_set_expressions)} + WHERE id = %s """ - db_cursor.execute(query) + values = list(kv.values()) + [job_id] + db_cursor.execute(query, values) def search_and_schedule_new_tasks(db_conn, db_cursor, clp_metadata_db_connection_config): diff --git a/components/package-template/src/etc/clp-config.yml 
b/components/package-template/src/etc/clp-config.yml index f19b93463..22b03b889 100644 --- a/components/package-template/src/etc/clp-config.yml +++ b/components/package-template/src/etc/clp-config.yml @@ -66,7 +66,9 @@ # ## Where archives should be output to #archive_output: -# directory: "var/data/archives" +# storage: +# type: "fs" +# directory: "var/data/archives" # # # How much data CLP should try to compress into each archive # target_archive_size: 268435456 # 256 MB From e4c9dd3c08dac002a50fa670e911a6379c8c9976 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 22:54:19 -0500 Subject: [PATCH 56/65] fix(clp-package): Remove faulty error handling for parsing archive compression stats. (#640) --- .../executor/compress/fs_compression_task.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index a5dbc0e35..593c07bd7 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -243,16 +243,13 @@ def run_clp( # Handle job metadata update and s3 write if enabled s3_error = None while not last_line_decoded: - line = proc.stdout.readline() stats: Optional[Dict[str, Any]] = None - if "" == line: - # Skip empty lines that could be caused by potential errors in printing archive stats - continue - if line is not None: - stats = json.loads(line.decode("ascii")) - else: + line = proc.stdout.readline() + if not line: last_line_decoded = True + else: + stats = json.loads(line.decode("ascii")) if last_archive_stats is not None and ( None is stats or stats["id"] != last_archive_stats["id"] From 32dc98901129eb0cc514265d2d82acc76a260065 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 
19 Dec 2024 00:11:55 -0500 Subject: [PATCH 57/65] fix(core): Add missing `../` to fix relative header file includes. (#627) Co-authored-by: Bingran Hu --- components/core/src/clp/clo/CommandLineArguments.cpp | 2 +- components/core/src/clp/clp/FileDecompressor.hpp | 6 +++--- components/core/src/clp/clp/decompression.cpp | 2 +- components/core/src/clp/clp/utils.cpp | 4 ++-- components/core/src/clp/clp/utils.hpp | 4 ++-- components/core/src/clp/ir/EncodedTextAst.cpp | 2 +- components/core/src/clp/ir/LogEvent.hpp | 2 +- components/core/src/clp_s/CommandLineArguments.cpp | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/components/core/src/clp/clo/CommandLineArguments.cpp b/components/core/src/clp/clo/CommandLineArguments.cpp index 4e187f985..f0a7f7ecc 100644 --- a/components/core/src/clp/clo/CommandLineArguments.cpp +++ b/components/core/src/clp/clo/CommandLineArguments.cpp @@ -8,8 +8,8 @@ #include +#include "../../reducer/types.hpp" #include "../cli_utils.hpp" -#include "../reducer/types.hpp" #include "../spdlog_with_specializations.hpp" #include "../version.hpp" diff --git a/components/core/src/clp/clp/FileDecompressor.hpp b/components/core/src/clp/clp/FileDecompressor.hpp index b08a21eb4..17a8b8e43 100644 --- a/components/core/src/clp/clp/FileDecompressor.hpp +++ b/components/core/src/clp/clp/FileDecompressor.hpp @@ -6,17 +6,17 @@ #include #include +#include "../ErrorCode.hpp" #include "../FileWriter.hpp" #include "../ir/constants.hpp" #include "../ir/LogEventSerializer.hpp" +#include "../ir/types.hpp" #include "../spdlog_with_specializations.hpp" #include "../streaming_archive/MetadataDB.hpp" #include "../streaming_archive/reader/Archive.hpp" #include "../streaming_archive/reader/File.hpp" #include "../streaming_archive/reader/Message.hpp" -#include "ErrorCode.hpp" -#include "ir/types.hpp" -#include "Utils.hpp" +#include "../Utils.hpp" namespace clp::clp { /** diff --git a/components/core/src/clp/clp/decompression.cpp 
b/components/core/src/clp/clp/decompression.cpp index c42357334..ce7cbd5c7 100644 --- a/components/core/src/clp/clp/decompression.cpp +++ b/components/core/src/clp/clp/decompression.cpp @@ -7,12 +7,12 @@ #include "../FileWriter.hpp" #include "../GlobalMySQLMetadataDB.hpp" #include "../GlobalSQLiteMetadataDB.hpp" +#include "../ir/constants.hpp" #include "../spdlog_with_specializations.hpp" #include "../streaming_archive/reader/Archive.hpp" #include "../TraceableException.hpp" #include "../Utils.hpp" #include "FileDecompressor.hpp" -#include "ir/constants.hpp" #include "utils.hpp" using std::cerr; diff --git a/components/core/src/clp/clp/utils.cpp b/components/core/src/clp/clp/utils.cpp index 0f05d75ac..123f9a836 100644 --- a/components/core/src/clp/clp/utils.cpp +++ b/components/core/src/clp/clp/utils.cpp @@ -9,9 +9,9 @@ #include "../GlobalMySQLMetadataDB.hpp" #include "../GlobalSQLiteMetadataDB.hpp" #include "../spdlog_with_specializations.hpp" +#include "../streaming_archive/Constants.hpp" +#include "../TraceableException.hpp" #include "../Utils.hpp" -#include "streaming_archive/Constants.hpp" -#include "TraceableException.hpp" using std::string; using std::vector; diff --git a/components/core/src/clp/clp/utils.hpp b/components/core/src/clp/clp/utils.hpp index 0a6918445..47adc50f2 100644 --- a/components/core/src/clp/clp/utils.hpp +++ b/components/core/src/clp/clp/utils.hpp @@ -7,11 +7,11 @@ #include +#include "../ErrorCode.hpp" #include "../GlobalMetadataDB.hpp" #include "../GlobalMetadataDBConfig.hpp" -#include "ErrorCode.hpp" +#include "../TraceableException.hpp" #include "FileToCompress.hpp" -#include "TraceableException.hpp" namespace clp::clp { // Types diff --git a/components/core/src/clp/ir/EncodedTextAst.cpp b/components/core/src/clp/ir/EncodedTextAst.cpp index f0ee4d493..72a8f2729 100644 --- a/components/core/src/clp/ir/EncodedTextAst.cpp +++ b/components/core/src/clp/ir/EncodedTextAst.cpp @@ -5,7 +5,7 @@ #include #include "../ffi/encoding_methods.hpp" 
-#include "ffi/ir_stream/decoding_methods.hpp" +#include "../ffi/ir_stream/decoding_methods.hpp" using clp::ffi::decode_float_var; using clp::ffi::decode_integer_var; diff --git a/components/core/src/clp/ir/LogEvent.hpp b/components/core/src/clp/ir/LogEvent.hpp index 4a3ef7567..e2d4b310e 100644 --- a/components/core/src/clp/ir/LogEvent.hpp +++ b/components/core/src/clp/ir/LogEvent.hpp @@ -5,8 +5,8 @@ #include #include +#include "../time_types.hpp" #include "EncodedTextAst.hpp" -#include "time_types.hpp" #include "types.hpp" namespace clp::ir { diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index c7fb9487e..fc7427f11 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -6,9 +6,9 @@ #include #include "../clp/cli_utils.hpp" +#include "../clp/type_utils.hpp" #include "../reducer/types.hpp" #include "FileReader.hpp" -#include "type_utils.hpp" namespace po = boost::program_options; From 02f0b8f1597c810072bbcafb202b5b29bf48b68c Mon Sep 17 00:00:00 2001 From: davidlion Date: Fri, 20 Dec 2024 00:20:21 -0500 Subject: [PATCH 58/65] Refactor lzma stream and add some doc strings. 
--- .../streaming_compression/lzma/Compressor.cpp | 108 +++++++-------- .../streaming_compression/lzma/Compressor.hpp | 126 ++++++++++++------ .../core/tests/test-StreamingCompression.cpp | 8 +- 3 files changed, 140 insertions(+), 102 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 3e6bb0254..52febe232 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -12,21 +12,22 @@ #include "../../FileWriter.hpp" #include "../../TraceableException.hpp" #include "../../type_utils.hpp" -#include "Constants.hpp" namespace clp::streaming_compression::lzma { -auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { +auto Compressor::open(FileWriter& file_writer) -> void { if (nullptr != m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); } - if (compression_level < cMinCompressionLevel || compression_level > cMaxCompressionLevel) { - throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); - } - m_compression_level = compression_level; - m_lzma_ops.init_lzma_encoder(); - m_lzma_ops.detach_input_src(); - m_lzma_ops.attach_output_buffer(); + m_lzma_stream.detach_input(); + if (false + == m_lzma_stream.attach_output( + m_compressed_stream_block_buffer.data(), + m_compressed_stream_block_buffer.size() + )) + { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } m_compressed_stream_file_writer = &file_writer; m_uncompressed_stream_pos = 0; } @@ -36,16 +37,14 @@ auto Compressor::close() -> void { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - if (m_compression_stream.avail_in > 0) { + if (m_lzma_stream.avail_in() > 0) { SPDLOG_WARN("Trying to close LZMA compressor with unprocessed input data. 
Processing and " "flushing remaining data."); flush_lzma(LZMA_FULL_FLUSH); } flush_lzma(LZMA_FINISH); - // Deallocates LZMA stream's internal data structures - lzma_end(&m_compression_stream); - m_lzma_ops.detach_output_buffer(); + m_lzma_stream.end_and_detach_output(); m_compressed_stream_file_writer = nullptr; } @@ -62,10 +61,14 @@ auto Compressor::write(char const* data, size_t data_length) -> void { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - m_compression_stream.next_in = clp::size_checked_pointer_cast(data); - m_compression_stream.avail_in = data_length; + if (false + == m_lzma_stream + .attach_input(clp::size_checked_pointer_cast(data), data_length)) + { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } encode_lzma(); - m_lzma_ops.detach_input_src(); + m_lzma_stream.detach_input(); m_uncompressed_stream_pos += data_length; } @@ -85,18 +88,17 @@ auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { } auto Compressor::encode_lzma() -> void { - while (m_compression_stream.avail_in > 0) { - if (0 == m_compression_stream.avail_out) { + while (m_lzma_stream.avail_in() > 0) { + if (0 == m_lzma_stream.avail_out()) { flush_stream_output_block_buffer(); } - auto const rc = lzma_code(&m_compression_stream, LZMA_RUN); + auto const rc = m_lzma_stream.lzma_code(LZMA_RUN); switch (rc) { case LZMA_OK: break; case LZMA_BUF_ERROR: - SPDLOG_ERROR( - "LZMA compressor input stream is corrupt. No encoding progress can be made." - ); + SPDLOG_ERROR("LZMA compressor input stream is corrupt. 
No encoding " + "progress can be made."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: SPDLOG_ERROR( @@ -109,7 +111,7 @@ auto Compressor::encode_lzma() -> void { } auto Compressor::flush_lzma(lzma_action flush_action) -> void { - if (false == LzmaStreamOperations::is_flush_action(flush_action)) { + if (false == LzmaStream::is_flush_action(flush_action)) { SPDLOG_ERROR( "lzma_code() supplied with invalid flush action - {}.", static_cast(flush_action) @@ -119,24 +121,24 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { bool flushed{false}; while (false == flushed) { - if (0 == m_compression_stream.avail_out) { + if (0 == m_lzma_stream.avail_out()) { flush_stream_output_block_buffer(); } - auto const rc = lzma_code(&m_compression_stream, flush_action); + auto const rc = m_lzma_stream.lzma_code(flush_action); switch (rc) { case LZMA_OK: break; case LZMA_STREAM_END: - // NOTE: flush may not have completed if a multithreaded encoder is using action - // LZMA_FULL_BARRIER. For now, we skip this check. + // NOTE: flush may not have completed if a multithreaded encoder is using + // action LZMA_FULL_BARRIER. For now, we skip this check. flushed = true; break; case LZMA_BUF_ERROR: - // NOTE: this can happen if we are using LZMA_FULL_FLUSH or LZMA_FULL_BARRIER. These - // two actions keeps encoding input data alongside flushing buffered encoded data. - SPDLOG_ERROR( - "LZMA compressor input stream is corrupt. No encoding progress can be made." - ); + // NOTE: this can happen if we are using LZMA_FULL_FLUSH or + // LZMA_FULL_BARRIER. These two actions keeps encoding input data + // alongside flushing buffered encoded data. + SPDLOG_ERROR("LZMA compressor input stream is corrupt. 
No encoding " + "progress can be made."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: SPDLOG_ERROR( @@ -150,50 +152,36 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { } auto Compressor::flush_stream_output_block_buffer() -> void { - if (cCompressedStreamBlockBufferSize == m_compression_stream.avail_out) { + if (cCompressedStreamBlockBufferSize == m_lzma_stream.avail_out()) { return; } m_compressed_stream_file_writer->write( clp::size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - m_compression_stream.avail_out + cCompressedStreamBlockBufferSize - m_lzma_stream.avail_out() ); - m_lzma_ops.attach_output_buffer(); -} - -auto Compressor::LzmaStreamOperations::is_flush_action(lzma_action action) -> bool { - return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action - || LZMA_FINISH == action; -} - -auto Compressor::LzmaStreamOperations::attach_output_buffer() -> void { - m_p->m_compression_stream.next_out = m_p->m_compressed_stream_block_buffer.data(); - m_p->m_compression_stream.avail_out = m_p->m_compressed_stream_block_buffer.size(); -} - -auto Compressor::LzmaStreamOperations::detach_input_src() -> void { - m_p->m_compression_stream.next_in = nullptr; - m_p->m_compression_stream.avail_in = 0; -} - -auto Compressor::LzmaStreamOperations::detach_output_buffer() -> void { - m_p->m_compression_stream.next_out = nullptr; - m_p->m_compression_stream.avail_out = 0; + if (false + == m_lzma_stream.attach_output( + m_compressed_stream_block_buffer.data(), + m_compressed_stream_block_buffer.size() + )) + { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } } -auto Compressor::LzmaStreamOperations::init_lzma_encoder(lzma_check check) -> void { +Compressor::LzmaStream::LzmaStream(int compression_level, size_t dict_size, lzma_check check) { lzma_options_lzma options; - if (0 != lzma_lzma_preset(&options, 
m_p->m_compression_level)) { + if (0 != lzma_lzma_preset(&options, compression_level)) { SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - options.dict_size = m_p->m_dict_size; + options.dict_size = dict_size; std::array filters{{ {.id = LZMA_FILTER_LZMA2, .options = &options}, {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, }}; - m_p->m_compression_stream = LZMA_STREAM_INIT; - auto const rc = lzma_stream_encoder(&m_p->m_compression_stream, filters.data(), check); + auto const rc = lzma_stream_encoder(&m_stream, filters.data(), check); if (LZMA_OK == rc) { return; } diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 3e7af18ff..49a3e079a 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -11,9 +11,11 @@ #include "../../FileWriter.hpp" #include "../../TraceableException.hpp" #include "../Compressor.hpp" -#include "Constants.hpp" namespace clp::streaming_compression::lzma { +/** + * Implements a LZMA compressor that compresses byte input data to a file. 
+ */ class Compressor : public ::clp::streaming_compression::Compressor { public: // Types @@ -30,7 +32,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { }; // Constructor - Compressor() = default; + Compressor(int compression_level, size_t dict_size, lzma_check check) + : m_lzma_stream{compression_level, dict_size, check} {} // Destructor ~Compressor() override = default; @@ -43,14 +46,6 @@ class Compressor : public ::clp::streaming_compression::Compressor { Compressor(Compressor&&) noexcept = default; auto operator=(Compressor&&) noexcept -> Compressor& = default; - /** - * Initializes the compression stream with the given compression level - * - * @param file_writer - * @param compression_level - */ - auto open(FileWriter& file_writer, int compression_level) -> void; - // Methods implementing the WriterInterface /** * Writes the given data to the compressor @@ -74,61 +69,114 @@ class Compressor : public ::clp::streaming_compression::Compressor { */ auto try_get_pos(size_t& pos) const -> ErrorCode override; + // Methods implementing the Compressor interface /** * Closes the compressor */ auto close() -> void override; - // Methods implementing the Compressor interface /** - * Initializes the compression stream with the default compression level + * Open the compression stream for encoding to the file_writer. + * * @param file_writer */ - auto open(FileWriter& file_writer) -> void override { - this->open(file_writer, cDefaultCompressionLevel); - } + auto open(FileWriter& file_writer) -> void override; private: - class LzmaStreamOperations { + /** + * Wrapper class around lzma_stream providing easier usage. + */ + class LzmaStream { public: - // Constructor - LzmaStreamOperations(Compressor* parent) : m_p(parent) {} + /** + * Initializes an LZMA compression encoder and its streams. 
+ * + * @param compression_level Compression preset level in the range [0-9] where the higher + * numbers use increasingly more memory for greater compression ratios. + * @param dict_size Max amount of recently processed uncompressed bytes to keep in the + * memory. + * @param check Type of check to verify the integrity of the uncompressed data. + * LZMA_CHECK_CRC64 is the default in the xz command line tool. If the .xz file needs to be + * decompressed with XZ-Embedded, use LZMA_CHECK_CRC32 instead. + * + * @throw `OperationFailed` `ErrorCode_BadParam` if the LZMA options are invalid or the + * encoder fails to initialize. + */ + LzmaStream(int compression_level, size_t dict_size, lzma_check check); // Destructor - ~LzmaStreamOperations() = default; + ~LzmaStream() = default; // Delete copy constructor and assignment operator - LzmaStreamOperations(LzmaStreamOperations const&) = delete; - auto operator=(LzmaStreamOperations const&) -> LzmaStreamOperations& = delete; + LzmaStream(LzmaStream const&) = delete; + auto operator=(LzmaStream const&) -> LzmaStream& = delete; // Default move constructor and assignment operator - LzmaStreamOperations(LzmaStreamOperations&&) noexcept = default; - auto operator=(LzmaStreamOperations&&) noexcept -> LzmaStreamOperations& = default; + LzmaStream(LzmaStream&&) noexcept = default; + auto operator=(LzmaStream&&) noexcept -> LzmaStream& = default; - [[nodiscard]] static auto is_flush_action(lzma_action action) -> bool; + /** + * Attaches a pre-allocated block buffer to the encoder's input stream. + * + * @return false if the data buffer is null or empty. + * @return true on success. 
+ */ + [[nodiscard]] auto attach_input(uint8_t const* data_ptr, size_t data_length) -> bool { + if (nullptr == data_ptr || 0 == data_length) { + return false; + } + m_stream.next_in = data_ptr; + m_stream.avail_in = data_length; + return true; + } /** - * Attaches a pre-allocated block buffer to the encoder's output stream + * Attaches a pre-allocated block buffer to the encoder's output stream. * - * Subsequent calls to this function resets the output buffer to its initial state. + * @return false if the data buffer is null or empty. + * @return true on success. */ - auto attach_output_buffer() -> void; + [[nodiscard]] auto attach_output(uint8_t* data_ptr, size_t data_length) -> bool { + if (nullptr == data_ptr || 0 == data_length) { + return false; + } + m_stream.next_out = data_ptr; + m_stream.avail_out = data_length; + return true; + } - auto detach_input_src() -> void; + [[nodiscard]] auto avail_in() const -> size_t { return m_stream.avail_in; } - auto detach_output_buffer() -> void; + [[nodiscard]] auto avail_out() const -> size_t { return m_stream.avail_out; } /** - * Initializes an LZMA compression encoder and its streams - * - * @param check Type of integrity check calculated from the uncompressed data. - * LZMA_CHECK_CRC64 is the default in the xz command line tool. If the .xz file needs to be - * decompressed with XZ-Embedded, use LZMA_CHECK_CRC32 instead. + * Unset the internal fields of the encoder's input stream. + */ + auto detach_input() -> void { + m_stream.next_in = nullptr; + m_stream.avail_in = 0; + } + + /** + * End the LZMA stream and unset the internal fields of the encoder's output stream. 
*/ - auto init_lzma_encoder(lzma_check check = LZMA_CHECK_CRC64) -> void; + auto end_and_detach_output() -> void { + lzma_end(&m_stream); + m_stream.next_out = nullptr; + m_stream.avail_out = 0; + } + + [[nodiscard]] static auto is_flush_action(lzma_action action) -> bool { + return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action + || LZMA_FULL_BARRIER == action || LZMA_FINISH == action; + } + + [[nodiscard]] auto lzma_code(lzma_action action) -> lzma_ret { + return ::lzma_code(&m_stream, action); + } private: - Compressor* m_p; + lzma_stream m_stream = LZMA_STREAM_INIT; }; static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB @@ -170,12 +218,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { FileWriter* m_compressed_stream_file_writer{nullptr}; // Compressed stream variables - LzmaStreamOperations m_lzma_ops{this}; Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; - int m_compression_level{cDefaultCompressionLevel}; - lzma_stream m_compression_stream = LZMA_STREAM_INIT; - // Specifies how many bytes of the recently processed uncompressed data to keep in the memory - size_t m_dict_size{cDefaultDictionarySize}; + LzmaStream m_lzma_stream; size_t m_uncompressed_stream_pos{0}; }; } // namespace clp::streaming_compression::lzma diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 4076eb88f..8fc7f4286 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include "../src/clp/Array.hpp" @@ -18,6 +19,7 @@ #include "../src/clp/streaming_compression/Compressor.hpp" #include "../src/clp/streaming_compression/Decompressor.hpp" #include "../src/clp/streaming_compression/lzma/Compressor.hpp" +#include "../src/clp/streaming_compression/lzma/Constants.hpp" #include 
"../src/clp/streaming_compression/passthrough/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Decompressor.hpp" #include "../src/clp/streaming_compression/zstd/Compressor.hpp" @@ -130,7 +132,11 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { } SECTION("LZMA compression") { - compressor = std::make_unique(); + compressor = std::make_unique( + clp::streaming_compression::lzma::cDefaultCompressionLevel, + clp::streaming_compression::lzma::cDefaultDictionarySize, + LZMA_CHECK_CRC64 + ); compress(std::move(compressor), uncompressed_buffer.data()); } From fcfc73ae3337f412afe8605a7b56c432f074f9b0 Mon Sep 17 00:00:00 2001 From: davidlion Date: Fri, 20 Dec 2024 00:35:00 -0500 Subject: [PATCH 59/65] Fix accidental comment reflow. --- .../src/clp/streaming_compression/lzma/Compressor.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 52febe232..67e03f871 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -129,14 +129,13 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { case LZMA_OK: break; case LZMA_STREAM_END: - // NOTE: flush may not have completed if a multithreaded encoder is using - // action LZMA_FULL_BARRIER. For now, we skip this check. + // NOTE: flush may not have completed if a multithreaded encoder is using action + // LZMA_FULL_BARRIER. For now, we skip this check. flushed = true; break; case LZMA_BUF_ERROR: - // NOTE: this can happen if we are using LZMA_FULL_FLUSH or - // LZMA_FULL_BARRIER. These two actions keeps encoding input data - // alongside flushing buffered encoded data. + // NOTE: this can happen if we are using LZMA_FULL_FLUSH or LZMA_FULL_BARRIER. 
These + // two actions keeps encoding input data alongside flushing buffered encoded data. SPDLOG_ERROR("LZMA compressor input stream is corrupt. No encoding " "progress can be made."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); From d139bd593e6585b66a3350645a8ec1c85b079f0a Mon Sep 17 00:00:00 2001 From: davidlion Date: Fri, 20 Dec 2024 13:01:27 -0500 Subject: [PATCH 60/65] Apply suggestions from code review Co-authored-by: Bingran Hu --- .../clp/streaming_compression/lzma/Compressor.cpp | 15 +++------------ .../clp/streaming_compression/lzma/Compressor.hpp | 3 ++- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 67e03f871..a6c5e197a 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -26,7 +26,7 @@ auto Compressor::open(FileWriter& file_writer) -> void { m_compressed_stream_block_buffer.size() )) { - throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } m_compressed_stream_file_writer = &file_writer; m_uncompressed_stream_pos = 0; @@ -52,20 +52,11 @@ auto Compressor::write(char const* data, size_t data_length) -> void { if (nullptr == m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - - if (0 == data_length) { - return; - } - - if (nullptr == data) { - throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); - } - if (false == m_lzma_stream .attach_input(clp::size_checked_pointer_cast(data), data_length)) { - throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } encode_lzma(); m_lzma_stream.detach_input(); @@ -164,7 +155,7 @@ auto 
Compressor::flush_stream_output_block_buffer() -> void { m_compressed_stream_block_buffer.size() )) { - throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } } diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 49a3e079a..5d35eb28e 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -31,7 +31,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { } }; - // Constructor + // Constructors + Compressor() : Compressor{cDefaultCompressionLevel, cDefaultDictionarySize, LZMA_CHECK_CRC64} {} Compressor(int compression_level, size_t dict_size, lzma_check check) : m_lzma_stream{compression_level, dict_size, check} {} From 68c4c3677446ab9bdf3c4c7c10be8f6834839441 Mon Sep 17 00:00:00 2001 From: davidlion Date: Fri, 20 Dec 2024 13:08:48 -0500 Subject: [PATCH 61/65] Add missing fixes for PR suggestion. 
--- .../core/src/clp/streaming_compression/lzma/Compressor.hpp | 2 ++ components/core/tests/test-StreamingCompression.cpp | 7 +------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 5d35eb28e..f46a7a58d 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -11,6 +11,7 @@ #include "../../FileWriter.hpp" #include "../../TraceableException.hpp" #include "../Compressor.hpp" +#include "Constants.hpp" namespace clp::streaming_compression::lzma { /** @@ -33,6 +34,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { // Constructors Compressor() : Compressor{cDefaultCompressionLevel, cDefaultDictionarySize, LZMA_CHECK_CRC64} {} + Compressor(int compression_level, size_t dict_size, lzma_check check) : m_lzma_stream{compression_level, dict_size, check} {} diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 8fc7f4286..5ae5532a0 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -19,7 +19,6 @@ #include "../src/clp/streaming_compression/Compressor.hpp" #include "../src/clp/streaming_compression/Decompressor.hpp" #include "../src/clp/streaming_compression/lzma/Compressor.hpp" -#include "../src/clp/streaming_compression/lzma/Constants.hpp" #include "../src/clp/streaming_compression/passthrough/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Decompressor.hpp" #include "../src/clp/streaming_compression/zstd/Compressor.hpp" @@ -132,11 +131,7 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { } SECTION("LZMA compression") { - compressor = std::make_unique( - clp::streaming_compression::lzma::cDefaultCompressionLevel, - 
clp::streaming_compression::lzma::cDefaultDictionarySize, - LZMA_CHECK_CRC64 - ); + compressor = std::make_unique(); compress(std::move(compressor), uncompressed_buffer.data()); } From dcb843e60f1f20a2d80c3964f98e50876de8cbd8 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 20 Dec 2024 17:35:01 -0500 Subject: [PATCH 62/65] Address review concern --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 4 +--- .../core/src/clp/streaming_compression/lzma/Compressor.hpp | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index a6c5e197a..8061807da 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -38,9 +38,7 @@ auto Compressor::close() -> void { } if (m_lzma_stream.avail_in() > 0) { - SPDLOG_WARN("Trying to close LZMA compressor with unprocessed input data. Processing and " - "flushing remaining data."); - flush_lzma(LZMA_FULL_FLUSH); + throw OperationFailed(ErrorCode_Corrupt, __FILENAME__, __LINE__); } flush_lzma(LZMA_FINISH); diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index f46a7a58d..de665eaf6 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -121,11 +121,11 @@ class Compressor : public ::clp::streaming_compression::Compressor { /** * Attaches a pre-allocated block buffer to the encoder's input stream. * - * @return false if the data buffer is null or empty. + * @return false if the data buffer is null. * @return true on success. 
*/ [[nodiscard]] auto attach_input(uint8_t const* data_ptr, size_t data_length) -> bool { - if (nullptr == data_ptr || 0 == data_length) { + if (nullptr == data_ptr) { return false; } m_stream.next_in = data_ptr; From b20162f4d721ac43b1c5df3b590078e49816f2ea Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 20 Dec 2024 18:19:41 -0500 Subject: [PATCH 63/65] Address review concern --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 4 ++-- components/core/tests/test-StreamingCompression.cpp | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 8061807da..a9fa0d5a0 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -26,7 +26,7 @@ auto Compressor::open(FileWriter& file_writer) -> void { m_compressed_stream_block_buffer.size() )) { - throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_NoMem, __FILENAME__, __LINE__); } m_compressed_stream_file_writer = &file_writer; m_uncompressed_stream_pos = 0; @@ -153,7 +153,7 @@ auto Compressor::flush_stream_output_block_buffer() -> void { m_compressed_stream_block_buffer.size() )) { - throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_NoMem, __FILENAME__, __LINE__); } } diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 5ae5532a0..c3e981562 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -35,13 +35,20 @@ using std::string_view; namespace { constexpr string_view cCompressedFilePath{"test_streaming_compressed_file.bin"}; constexpr size_t cBufferSize{128L * 1024 * 1024}; // 128MB +// Interleave no-ops to ensure the 
integrity of the compressor states at all times. constexpr auto cCompressionChunkSizes = std::to_array( {cBufferSize / 100, + 0, cBufferSize / 50, + 0, cBufferSize / 25, + 0, cBufferSize / 10, + 0, cBufferSize / 5, + 0, cBufferSize / 2, + 0, cBufferSize} ); From df41b227e94fe7de2f9978ff3772ae97a8681443 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 20 Dec 2024 18:37:52 -0500 Subject: [PATCH 64/65] nit fix --- components/core/tests/test-StreamingCompression.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index c3e981562..22582cff2 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -9,7 +9,6 @@ #include #include -#include #include #include "../src/clp/Array.hpp" @@ -35,20 +34,14 @@ using std::string_view; namespace { constexpr string_view cCompressedFilePath{"test_streaming_compressed_file.bin"}; constexpr size_t cBufferSize{128L * 1024 * 1024}; // 128MB -// Interleave no-ops to ensure the integrity of the compressor states at all times. 
constexpr auto cCompressionChunkSizes = std::to_array( - {cBufferSize / 100, - 0, + {0, // no-op + cBufferSize / 100, cBufferSize / 50, - 0, cBufferSize / 25, - 0, cBufferSize / 10, - 0, cBufferSize / 5, - 0, cBufferSize / 2, - 0, cBufferSize} ); From 524fe1d1cbd30d4b9839dfe58428b2590c1a102c Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Sun, 22 Dec 2024 17:21:00 -0500 Subject: [PATCH 65/65] Change all instances of programming-error-induced error codes to ErrorCode_Failure --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 6 +++--- components/core/tests/test-StreamingCompression.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index a9fa0d5a0..34c1a0e2b 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -26,7 +26,7 @@ auto Compressor::open(FileWriter& file_writer) -> void { m_compressed_stream_block_buffer.size() )) { - throw OperationFailed(ErrorCode_NoMem, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } m_compressed_stream_file_writer = &file_writer; m_uncompressed_stream_pos = 0; @@ -38,7 +38,7 @@ auto Compressor::close() -> void { } if (m_lzma_stream.avail_in() > 0) { - throw OperationFailed(ErrorCode_Corrupt, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } flush_lzma(LZMA_FINISH); @@ -153,7 +153,7 @@ auto Compressor::flush_stream_output_block_buffer() -> void { m_compressed_stream_block_buffer.size() )) { - throw OperationFailed(ErrorCode_NoMem, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } } diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 22582cff2..9f0df9306 100644 --- 
a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -35,7 +35,7 @@ namespace { constexpr string_view cCompressedFilePath{"test_streaming_compressed_file.bin"}; constexpr size_t cBufferSize{128L * 1024 * 1024}; // 128MB constexpr auto cCompressionChunkSizes = std::to_array( - {0, // no-op + {0, cBufferSize / 100, cBufferSize / 50, cBufferSize / 25,