From dceb5648e8a2df0ecc65cbe81a07f538f5538359 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 25 Nov 2024 09:53:02 -0500 Subject: [PATCH 01/65] Add lzma download and port lzma scripts --- components/core/.clang-format | 2 +- components/core/CMakeLists.txt | 24 +- .../clp/streaming_compression/Constants.hpp | 1 + .../streaming_compression/lzma/Compressor.cpp | 303 +++++++++++++++ .../streaming_compression/lzma/Compressor.hpp | 133 +++++++ .../streaming_compression/lzma/Constants.hpp | 15 + .../lzma/Decompressor.cpp | 362 ++++++++++++++++++ .../lzma/Decompressor.hpp | 162 ++++++++ .../core/tests/test-StreamingCompression.cpp | 1 + .../core/tools/scripts/lib_install/liblzma.sh | 66 ++++ .../install-packages-from-source.sh | 1 + .../ubuntu-focal/install-prebuilt-packages.sh | 1 + .../install-packages-from-source.sh | 1 + .../ubuntu-jammy/install-prebuilt-packages.sh | 1 + 14 files changed, 1071 insertions(+), 2 deletions(-) create mode 100644 components/core/src/clp/streaming_compression/lzma/Compressor.cpp create mode 100644 components/core/src/clp/streaming_compression/lzma/Compressor.hpp create mode 100644 components/core/src/clp/streaming_compression/lzma/Constants.hpp create mode 100644 components/core/src/clp/streaming_compression/lzma/Decompressor.cpp create mode 100644 components/core/src/clp/streaming_compression/lzma/Decompressor.hpp create mode 100755 components/core/tools/scripts/lib_install/liblzma.sh diff --git a/components/core/.clang-format b/components/core/.clang-format index ff65adbae..4d0d3a87c 100644 --- a/components/core/.clang-format +++ b/components/core/.clang-format @@ -4,7 +4,7 @@ IncludeCategories: # NOTE: A header is grouped by first matching regex # Library headers. Update when adding new libraries. # NOTE: clang-format retains leading white-space on a line in violation of the YAML spec. 
- - Regex: "<(absl|antlr4|archive|boost|bsoncxx|catch2|curl|date|fmt|json|log_surgeon|mongocxx\ + - Regex: "<(absl|antlr4|archive|boost|bsoncxx|catch2|curl|date|fmt|json|log_surgeon|lzma|mongocxx\ |msgpack|mysql|openssl|outcome|regex_utils|simdjson|spdlog|sqlite3|string_utils|yaml-cpp|zstd)" Priority: 3 # C system headers diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index e5c9b06c8..92bb6af19 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -11,13 +11,16 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # Set general compressor set(GENERAL_COMPRESSOR "zstd" CACHE STRING "The general-purpose compressor used as the 2nd-stage compressor") -set_property(CACHE GENERAL_COMPRESSOR PROPERTY STRINGS passthrough zstd) +set_property(CACHE GENERAL_COMPRESSOR PROPERTY STRINGS passthrough zstd lzma) if ("${GENERAL_COMPRESSOR}" STREQUAL "passthrough") add_definitions(-DUSE_PASSTHROUGH_COMPRESSION=1) message(STATUS "Using passthrough compression") elseif ("${GENERAL_COMPRESSOR}" STREQUAL "zstd") add_definitions(-DUSE_ZSTD_COMPRESSION=1) message(STATUS "Using Zstandard compression") +elseif ("${GENERAL_COMPRESSOR}" STREQUAL "lzma") + add_definitions(-DUSE_LZMA_COMPRESSION=1) + message(STATUS "Using Lempel–Ziv–Markov chain Algorithm compression") else() message(SEND_ERROR "GENERAL_COMPRESSOR=${GENERAL_COMPRESSOR} is unimplemented.") endif() @@ -224,6 +227,19 @@ else() message(FATAL_ERROR "Could not find ${CLP_LIBS_STRING} libraries for ZStd") endif() +# Find and setup LZMA Library +# Notice that we don't have support to switch between static and shared libraries. +# TODO: add a script in ./cmake/Modules to resolve .a vs. 
.so +find_package(LibLZMA REQUIRED) +if(LIBLZMA_FOUND) + message(STATUS "Found LIBLZMA_FOUND ${LIBLZMA_VERSION_STRING}") + message(STATUS "Lzma library location: ${LIBLZMA_LIBRARIES}") +else() + message(FATAL_ERROR "Could not find ${CLP_LIBS_STRING} libraries for LIBLZMA_FOUND") +endif() +include_directories(${LIBLZMA_INCLUDE_DIRS}) +message("LZMA Include Dir: ${LIBLZMA_INCLUDE_DIRS}") + # sqlite dependencies set(sqlite_DYNAMIC_LIBS "dl;m;pthread") include(cmake/Modules/FindLibraryDependencies.cmake) @@ -462,6 +478,11 @@ set(SOURCE_FILES_unitTest src/clp/streaming_compression/Compressor.hpp src/clp/streaming_compression/Constants.hpp src/clp/streaming_compression/Decompressor.hpp + src/clp/streaming_compression/lzma/Compressor.cpp + src/clp/streaming_compression/lzma/Compressor.hpp + src/clp/streaming_compression/lzma/Decompressor.cpp + src/clp/streaming_compression/lzma/Decompressor.hpp + src/clp/streaming_compression/lzma/Constants.hpp src/clp/streaming_compression/passthrough/Compressor.cpp src/clp/streaming_compression/passthrough/Compressor.hpp src/clp/streaming_compression/passthrough/Decompressor.cpp @@ -549,6 +570,7 @@ target_link_libraries(unitTest clp::regex_utils clp::string_utils yaml-cpp::yaml-cpp + ${LIBLZMA_LIBRARIES} ZStd::ZStd ) target_compile_features(unitTest diff --git a/components/core/src/clp/streaming_compression/Constants.hpp b/components/core/src/clp/streaming_compression/Constants.hpp index 4649c2e98..080f3a20b 100644 --- a/components/core/src/clp/streaming_compression/Constants.hpp +++ b/components/core/src/clp/streaming_compression/Constants.hpp @@ -7,6 +7,7 @@ namespace clp::streaming_compression { enum class CompressorType : uint8_t { ZSTD = 0x10, + LZMA = 0x20, Passthrough = 0xFF, }; } // namespace clp::streaming_compression diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp new file mode 100644 index 000000000..f10ec915b --- /dev/null +++ 
b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -0,0 +1,303 @@ +#include "Compressor.hpp" + +// spdlog +#include + +// Project headers +#include "../../Defs.h" + +// File-scope constants +static constexpr size_t cCompressedStreamBlockBufferSize = 4096; // 4KiB + +namespace streaming_compression::lzma { +Compressor::LzmaOption Compressor::m_option; + +Compressor::Compressor() + : ::streaming_compression::Compressor(CompressorType::LZMA), + m_compression_stream_contains_data(false), + m_compressed_stream_file_writer(nullptr), + m_compression_stream(nullptr) { + m_compressed_stream_block_buffer = std::make_unique(cCompressedStreamBlockBufferSize); + m_compression_stream = new lzma_stream; + memset(m_compression_stream, 0, sizeof(lzma_stream)); +} + +Compressor::~Compressor() { + if (nullptr != m_compression_stream) { + delete m_compression_stream; + } +} + +void Compressor::init_lzma_encoder(lzma_stream* strm) { + lzma_options_lzma options; + if (lzma_lzma_preset(&options, m_option.get_compression_level())) { + SPDLOG_ERROR("Failed to initialize LZMA options."); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + } + options.dict_size = m_option.get_dict_size(); + lzma_filter filters[2]{ + {LZMA_FILTER_LZMA2, &options}, + {LZMA_VLI_UNKNOWN, nullptr}, + }; + + // Initialize the encoder using a preset. Set the integrity to check + // to CRC64, which is the default in the xz command line tool. If + // the .xz file needs to be decompressed with XZ Embedded, use + // LZMA_CHECK_CRC32 instead. + lzma_ret ret = lzma_stream_encoder(strm, filters, LZMA_CHECK_CRC64); + + // Return successfully if the initialization went fine. + if (ret == LZMA_OK) { + return; + } + + // Something went wrong. The possible errors are documented in + // lzma/container.h (src/liblzma/api/lzma/container.h in the source + // package or e.g. /usr/include/lzma/container.h depending on the + // install prefix). 
+ char const* msg; + switch (ret) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_OPTIONS_ERROR: + msg = "Specified preset is not supported"; + break; + + case LZMA_UNSUPPORTED_CHECK: + msg = "Specified integrity check is not supported"; + break; + + default: + // This is most likely LZMA_PROG_ERROR indicating a bug in + // this program or in liblzma. It is inconvenient to have a + // separate error message for errors that should be impossible + // to occur, but knowing the error code is important for + // debugging. That's why it is good to print the error code + // at least when there is no good error message to show. + msg = "Unknown error, possibly a bug"; + break; + } + + SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, int(ret)); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); +} + +void Compressor::open(FileWriter& file_writer, int compression_level) { + if (nullptr != m_compressed_stream_file_writer) { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } + + if (false == (0 <= compression_level && compression_level <= 9)) { + throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); + } + if (compression_level != m_option.get_compression_level()) { + m_option.set_compression_level(compression_level); + } + + init_lzma_encoder(m_compression_stream); + // Setup compressed stream parameters + m_compression_stream->next_in = nullptr; + m_compression_stream->avail_in = 0; + m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + + m_compressed_stream_file_writer = &file_writer; + + m_uncompressed_stream_pos = 0; +} + +void Compressor::close() { + if (nullptr == m_compressed_stream_file_writer) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + flush_and_close_compression_stream(); + m_compressed_stream_file_writer = nullptr; +} + +void 
Compressor::write(char const* data, size_t data_length) { + if (nullptr == m_compressed_stream_file_writer) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + if (0 == data_length) { + // Nothing needs to be done because we do not need to compress anything + return; + } + if (nullptr == data) { + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + } + lzma_action action = LZMA_RUN; + m_compression_stream->next_in = reinterpret_cast(const_cast(data)); + m_compression_stream->avail_in = data_length; + + // Compress all data + bool hit_input_eof = false; + while (!hit_input_eof) { + lzma_ret return_value = lzma_code(m_compression_stream, action); + switch (return_value) { + case LZMA_OK: + case LZMA_BUF_ERROR: + break; + case LZMA_STREAM_END: + hit_input_eof = true; + break; + default: + SPDLOG_ERROR("lzma() returned an unexpected value - {}.", int(return_value)); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + + if (0 == m_compression_stream->avail_in) { + // No more data to compress + break; + } + + // Write output buffer to file if it's full + if (0 == m_compression_stream->avail_out) { + m_compressed_stream_file_writer->write( + reinterpret_cast(m_compressed_stream_block_buffer.get()), + cCompressedStreamBlockBufferSize + ); + m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + } + } + + // Write any compressed data + if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { + m_compressed_stream_file_writer->write( + reinterpret_cast(m_compressed_stream_block_buffer.get()), + cCompressedStreamBlockBufferSize - m_compression_stream->avail_out + ); + m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + } + + m_compression_stream->next_in = nullptr; + + m_compression_stream_contains_data = true; + 
m_uncompressed_stream_pos += data_length; +} + +void Compressor::flush() { + if (false == m_compression_stream_contains_data) { + return; + } + // Z_NO_FLUSH - deflate decides how much data to accumulate before producing output + // Z_SYNC_FLUSH - All pending output flushed to output buf and output aligned to byte + // boundary (completes current block and follows it with empty block that is 3 bits plus + // filler to next byte, followed by 4 bytes Z_PARTIAL_FLUSH - Same as Z_SYNC_FLUSH but + // output not aligned to byte boundary (completes current block and follows it with empty + // fixed codes block that is 10 bits long) Z_BLOCK - Same as Z_SYNC_FLUSH but output not + // aligned on a byte boundary and up to 7 bits of current block held to be written + // Z_FULL_FLUSH - Same as Z_SYNC_FLUSH but compression state reset so that decompression can + // restart from this point if the previous compressed data has been damaged Z_FINISH - + // Pending output flushed and deflate returns Z_STREAM_END if there was enough output space, + // or Z_OK or Z_BUF_ERROR if it needs to be called again with more space + // + + bool flush_complete = false; + while (true) { + lzma_ret return_value = lzma_code(m_compression_stream, LZMA_SYNC_FLUSH); + switch (return_value) { + case LZMA_STREAM_END: + flush_complete = true; + break; + case LZMA_OK: + case LZMA_BUF_ERROR: + break; + default: + SPDLOG_ERROR("lzma() returned an unexpected value - {}.", int(return_value)); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + if (flush_complete) { + break; + } + + // Write output buffer to file if it's full + if (0 == m_compression_stream->avail_out) { + m_compressed_stream_file_writer->write( + reinterpret_cast(m_compressed_stream_block_buffer.get()), + cCompressedStreamBlockBufferSize + ); + m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + } + } + + // Write any compressed 
data + if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { + m_compressed_stream_file_writer->write( + reinterpret_cast(m_compressed_stream_block_buffer.get()), + cCompressedStreamBlockBufferSize - m_compression_stream->avail_out + ); + m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + } + + m_compression_stream_contains_data = false; +} + +ErrorCode Compressor::try_get_pos(size_t& pos) const { + if (nullptr == m_compressed_stream_file_writer) { + return ErrorCode_NotInit; + } + + pos = m_uncompressed_stream_pos; + return ErrorCode_Success; +} + +void Compressor::flush_and_close_compression_stream() { + if (nullptr == m_compressed_stream_file_writer) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + bool flush_complete = false; + while (true) { + lzma_ret return_value = lzma_code(m_compression_stream, LZMA_FINISH); + switch (return_value) { + case LZMA_OK: + case LZMA_BUF_ERROR: + break; + case LZMA_STREAM_END: + flush_complete = true; + break; + default: + // SPDLOG_ERROR("deflate() returned an unexpected value - + // {}.", return_value); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + if (flush_complete) { + break; + } + + // Write output buffer to file if it's full + if (0 == m_compression_stream->avail_out) { + m_compressed_stream_file_writer->write( + reinterpret_cast(m_compressed_stream_block_buffer.get()), + cCompressedStreamBlockBufferSize + ); + m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + } + } + + // Write any compressed data + if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { + m_compressed_stream_file_writer->write( + reinterpret_cast(m_compressed_stream_block_buffer.get()), + cCompressedStreamBlockBufferSize - m_compression_stream->avail_out + ); + 
m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + } + + m_compression_stream_contains_data = false; + + lzma_end(m_compression_stream); + m_compression_stream->avail_out = 0; + m_compression_stream->next_out = nullptr; +} +} // namespace streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp new file mode 100644 index 000000000..d31c7687e --- /dev/null +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -0,0 +1,133 @@ +#ifndef STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP +#define STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP + +// C++ standard libraries +#include +#include + +// ZLIB library +#include +#include + +// Project headers +#include "../../FileWriter.hpp" +#include "../../TraceableException.hpp" +#include "../Compressor.hpp" +#include "Constants.hpp" + +namespace streaming_compression::lzma { +class Compressor : public ::streaming_compression::Compressor { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + + // Methods + char const* what() const noexcept override { + return "streaming_compression::gzip::Compressor operation failed"; + } + }; + + class LzmaOption { + public: + LzmaOption() + : m_compression_level{cDefaultCompressionLevel}, + m_dict_size{cDefaultDictionarySize} {} + + auto set_compression_level(int compression_level) -> void { + if (0 > compression_level) { + m_compression_level = 0; + } else if (9 < compression_level) { + m_compression_level = 9; + } else { + m_compression_level = compression_level; + } + } + + auto set_dict_size(uint32_t dict_size) -> void { m_dict_size = dict_size; } + + [[nodiscard]] auto 
get_compression_level() const -> int { return m_compression_level; } + + [[nodiscard]] auto get_dict_size() const -> uint32_t { return m_dict_size; } + + private: + int m_compression_level; + uint32_t m_dict_size; + }; + + // Constructor + Compressor(); + + // Destructor + ~Compressor(); + + // Explicitly disable copy and move constructor/assignment + Compressor(Compressor const&) = delete; + Compressor& operator=(Compressor const&) = delete; + + // Methods implementing the WriterInterface + /** + * Writes the given data to the compressor + * @param data + * @param data_length + */ + void write(char const* data, size_t data_length) override; + /** + * Writes any internally buffered data to file and ends the current frame + */ + void flush() override; + + /** + * Tries to get the current position of the write head + * @param pos Position of the write head + * @return ErrorCode_NotInit if the compressor is not open + * @return ErrorCode_Success on success + */ + ErrorCode try_get_pos(size_t& pos) const override; + + // Methods implementing the Compressor interface + /** + * Initialize streaming compressor + * @param file_writer + * @param compression_level + */ + void open(FileWriter& file_writer, int compression_level) override; + + /** + * Closes the compressor + */ + void close() override; + + // Methods + static auto set_compression_level(int compression_level) -> void { + m_option.set_compression_level(compression_level); + } + + static auto set_dict_size(uint32_t dict_size) -> void { m_option.set_dict_size(dict_size); } + +private: + /** + * Flushes the stream and closes it + */ + void flush_and_close_compression_stream(); + + static void init_lzma_encoder(lzma_stream* strm); + static LzmaOption m_option; + + // Variables + FileWriter* m_compressed_stream_file_writer; + + // Compressed stream variables + lzma_stream* m_compression_stream; + bool m_compression_stream_contains_data; + + std::unique_ptr m_compressed_stream_block_buffer; + + size_t 
m_uncompressed_stream_pos; +}; +} // namespace streaming_compression::lzma + +#endif // STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP diff --git a/components/core/src/clp/streaming_compression/lzma/Constants.hpp b/components/core/src/clp/streaming_compression/lzma/Constants.hpp new file mode 100644 index 000000000..959c09f47 --- /dev/null +++ b/components/core/src/clp/streaming_compression/lzma/Constants.hpp @@ -0,0 +1,15 @@ +#ifndef STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP +#define STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP + +#include + +// C++ libraries +#include +#include + +namespace streaming_compression::lzma { +constexpr int cDefaultCompressionLevel{3}; +constexpr uint32_t cDefaultDictionarySize{LZMA_DICT_SIZE_DEFAULT}; +} // namespace streaming_compression::lzma + +#endif // STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP diff --git a/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp b/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp new file mode 100644 index 000000000..a2ed4d466 --- /dev/null +++ b/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp @@ -0,0 +1,362 @@ +#include "Decompressor.hpp" + +// C++ Standard Libraries +#include + +// Boost libraries +#include + +// spdlog +#include + +// Project headers +#include "../../Defs.h" + +namespace streaming_compression::lzma { +Decompressor::Decompressor() + : ::streaming_compression::Decompressor(CompressorType::LZMA), + m_input_type(InputType::NotInitialized), + m_decompression_stream(nullptr), + m_file_reader(nullptr), + m_file_reader_initial_pos(0), + m_file_read_buffer_length(0), + m_file_read_buffer_capacity(0), + m_decompressed_stream_pos(0), + m_unused_decompressed_stream_block_size(0) { + // Create block to hold unused decompressed data + m_unused_decompressed_stream_block_buffer + = std::make_unique(m_unused_decompressed_stream_block_size); + m_decompression_stream = new lzma_stream; + memset(m_decompression_stream, 0, sizeof(lzma_stream)); +} + 
+Decompressor::~Decompressor() { + delete m_decompression_stream; +} + +void Decompressor::exact_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { + auto errorcode = try_read(buf, num_bytes_to_read, num_bytes_read); + if (num_bytes_read != num_bytes_to_read) { + SPDLOG_ERROR("FAILED TO READ EXACTLY {} bytes", num_bytes_to_read); + throw; + } + if (errorcode != ErrorCode_Success) { + SPDLOG_ERROR("FAILED TO READ EXACTLY {} bytes", num_bytes_to_read); + throw; + } +} + +ErrorCode Decompressor::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { + if (InputType::NotInitialized == m_input_type) { + return ErrorCode_NotInit; + } + if (nullptr == buf) { + return ErrorCode_BadParam; + } + if (0 == num_bytes_to_read) { + return ErrorCode_Success; + } + + num_bytes_read = 0; + + m_decompression_stream->next_out = reinterpret_cast(buf); + m_decompression_stream->avail_out = num_bytes_to_read; + while (true) { + // Check if there's data that can be decompressed + if (0 == m_decompression_stream->avail_in) { + if (InputType::File != m_input_type) { + // if we hit here, there must be something wrong + // we have consumed all data buffer but for some reason it still requires more. 
+ return ErrorCode_EndOfFile; + } else { + auto error_code = m_file_reader->try_read( + m_file_read_buffer.get(), + m_file_read_buffer_capacity, + m_file_read_buffer_length + ); + m_decompression_stream->avail_in = m_file_read_buffer_length; + m_decompression_stream->next_in + = reinterpret_cast(m_file_read_buffer.get()); + if (ErrorCode_Success != error_code) { + if (ErrorCode_EndOfFile == error_code) { + num_bytes_read = num_bytes_to_read - m_decompression_stream->avail_out; + m_decompressed_stream_pos += num_bytes_read; + return ErrorCode_EndOfFile; + } + } + } + } + + lzma_ret return_value = lzma_code(m_decompression_stream, LZMA_RUN); + switch (return_value) { + case LZMA_OK: + case LZMA_BUF_ERROR: + if (0 == m_decompression_stream->avail_out) { + m_decompression_stream->next_out = nullptr; + num_bytes_read = num_bytes_to_read; + m_decompressed_stream_pos += num_bytes_read; + return ErrorCode_Success; + } + // by breaking here, enter the next iteration of decompressing + break; + case LZMA_STREAM_END: + if (0 == m_decompression_stream->avail_out) { + m_decompression_stream->next_out = nullptr; + num_bytes_read = num_bytes_to_read; + m_decompressed_stream_pos += num_bytes_read; + return ErrorCode_Success; + } + SPDLOG_ERROR("streaming_compression::lzma::Decompressor wants to read more but " + "reached end of file"); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + case LZMA_MEM_ERROR: + SPDLOG_ERROR("streaming_compression::lzma::Decompressor inflate() ran out of memory" + ); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + default: + SPDLOG_ERROR("inflate() returned an unexpected value - {}.", int(return_value)); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + } +} + +ErrorCode Decompressor::try_seek_from_begin(size_t pos) { + if (InputType::NotInitialized == m_input_type) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } + + // Check if we've already decompressed passed 
the desired position + if (m_decompressed_stream_pos > pos) { + // ZStd has no way for us to seek back to the desired position, so just reset the stream + // to the beginning + reset_stream(); + } + + // We need to fast-forward the decompression stream to decompressed_stream_pos + ErrorCode error; + while (m_decompressed_stream_pos < pos) { + size_t num_bytes_to_decompress = std::min( + m_unused_decompressed_stream_block_size, + pos - m_decompressed_stream_pos + ); + error = try_read_exact_length( + m_unused_decompressed_stream_block_buffer.get(), + num_bytes_to_decompress + ); + if (ErrorCode_Success != error) { + return error; + } + } + + return ErrorCode_Success; +} + +ErrorCode Decompressor::try_get_pos(size_t& pos) { + if (InputType::NotInitialized == m_input_type) { + return ErrorCode_NotInit; + } + + pos = m_decompressed_stream_pos; + return ErrorCode_Success; +} + +void Decompressor::close() { + if (InputType::NotInitialized == m_input_type) { + return; + } + lzma_end(m_decompression_stream); + m_decompression_stream->avail_out = 0; + m_decompression_stream->next_out = nullptr; + if (InputType::MemoryMappedCompressedFile == m_input_type) { + if (m_memory_mapped_compressed_file.is_open()) { + // An existing file is memory mapped by the decompressor + m_memory_mapped_compressed_file.close(); + } + } else if (InputType::File == m_input_type) { + m_file_read_buffer.reset(); + m_file_read_buffer_capacity = 0; + m_file_read_buffer_length = 0; + m_file_reader = nullptr; + } + m_input_type = InputType::NotInitialized; +} + +void Decompressor::init_decoder(lzma_stream* strm) { + // Initialize a .xz decoder. The decoder supports a memory usage limit + // and a set of flags. + // + // The memory usage of the decompressor depends on the settings used + // to compress a .xz file. 
It can vary from less than a megabyte to + // a few gigabytes, but in practice (at least for now) it rarely + // exceeds 65 MiB because that's how much memory is required to + // decompress files created with "xz -9". Settings requiring more + // memory take extra effort to use and don't (at least for now) + // provide significantly better compression in most cases. + // + // Memory usage limit is useful if it is important that the + // decompressor won't consume gigabytes of memory. The need + // for limiting depends on the application. In this example, + // no memory usage limiting is used. This is done by setting + // the limit to UINT64_MAX. + // + // The .xz format allows concatenating compressed files as is: + // + // echo foo | xz > foobar.xz + // echo bar | xz >> foobar.xz + // + // When decompressing normal standalone .xz files, LZMA_CONCATENATED + // should always be used to support decompression of concatenated + // .xz files. If LZMA_CONCATENATED isn't used, the decoder will stop + // after the first .xz stream. This can be useful when .xz data has + // been embedded inside another file format. + // + // Flags other than LZMA_CONCATENATED are supported too, and can + // be combined with bitwise-or. See lzma/container.h + // (src/liblzma/api/lzma/container.h in the source package or e.g. + // /usr/include/lzma/container.h depending on the install prefix) + // for details. + lzma_ret ret = lzma_stream_decoder(strm, UINT64_MAX, LZMA_CONCATENATED); + + // Return successfully if the initialization went fine. + if (ret == LZMA_OK) { + return; + } + + // Something went wrong. The possible errors are documented in + // lzma/container.h (src/liblzma/api/lzma/container.h in the source + // package or e.g. /usr/include/lzma/container.h depending on the + // install prefix). + // + // Note that LZMA_MEMLIMIT_ERROR is never possible here. 
If you + // specify a very tiny limit, the error will be delayed until + // the first headers have been parsed by a call to lzma_code(). + char const* msg; + switch (ret) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_OPTIONS_ERROR: + msg = "Unsupported decompressor flags"; + break; + + default: + // This is most likely LZMA_PROG_ERROR indicating a bug in + // this program or in liblzma. It is inconvenient to have a + // separate error message for errors that should be impossible + // to occur, but knowing the error code is important for + // debugging. That's why it is good to print the error code + // at least when there is no good error message to show. + msg = "Unknown error, possibly a bug"; + break; + } + + SPDLOG_ERROR("Error initializing the decoder: {} (error code {})", msg, int(ret)); +} + +void Decompressor::open(char const* compressed_data_buf, size_t compressed_data_buf_size) { + if (InputType::NotInitialized != m_input_type) { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } + m_input_type = InputType::CompressedDataBuf; + + // Configure input stream + reset_stream(); + m_decompression_stream->next_in + = reinterpret_cast(const_cast(compressed_data_buf)); + m_decompression_stream->avail_in = compressed_data_buf_size; + m_decompression_stream->next_out = nullptr; + m_decompression_stream->avail_out = 0; +} + +ErrorCode Decompressor::open(std::string const& compressed_file_path) { + if (InputType::NotInitialized != m_input_type) { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } + m_input_type = InputType::MemoryMappedCompressedFile; + + // Create memory mapping for compressed_file_path, use boost read only memory mapped file + boost::system::error_code boost_error_code; + size_t compressed_file_size + = boost::filesystem::file_size(compressed_file_path, boost_error_code); + if (boost_error_code) { + SPDLOG_ERROR( + "streaming_compression::zstd::Decompressor: Unable to 
obtain file size for " + "'{}' - {}.", + compressed_file_path.c_str(), + boost_error_code.message().c_str() + ); + return ErrorCode_Failure; + } + + boost::iostreams::mapped_file_params memory_map_params; + memory_map_params.path = compressed_file_path; + memory_map_params.flags = boost::iostreams::mapped_file::readonly; + memory_map_params.length = compressed_file_size; + memory_map_params.hint = m_memory_mapped_compressed_file.data( + ); // Try to map it to the same memory location as previous memory mapped file + m_memory_mapped_compressed_file.open(memory_map_params); + if (!m_memory_mapped_compressed_file.is_open()) { + SPDLOG_ERROR( + "streaming_compression::lzma::Decompressor: Unable to memory map the " + "compressed file with path: {}", + compressed_file_path.c_str() + ); + return ErrorCode_Failure; + } + + // Configure input stream + reset_stream(); + m_decompression_stream->next_in + = reinterpret_cast(const_cast(m_memory_mapped_compressed_file.data())); + m_decompression_stream->avail_in = compressed_file_size; + m_decompression_stream->next_out = nullptr; + m_decompression_stream->avail_out = 0; + + return ErrorCode_Success; +} + +void Decompressor::open(FileReader& file_reader, size_t file_read_buffer_capacity) { + if (InputType::NotInitialized != m_input_type) { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } + m_input_type = InputType::File; + + m_file_reader = &file_reader; + m_file_reader_initial_pos = m_file_reader->get_pos(); + + m_file_read_buffer_capacity = file_read_buffer_capacity; + m_file_read_buffer = std::make_unique(m_file_read_buffer_capacity); + m_file_read_buffer_length = 0; + + // Configure input stream + reset_stream(); + m_decompression_stream->next_in = reinterpret_cast(m_file_read_buffer.get()); + m_decompression_stream->avail_in = m_file_read_buffer_length; + m_decompression_stream->next_out = nullptr; + m_decompression_stream->avail_out = 0; +} + +ErrorCode Decompressor::get_decompressed_stream_region( 
+ size_t decompressed_stream_pos, + char* extraction_buf, + size_t extraction_len +) { + auto error_code = try_seek_from_begin(decompressed_stream_pos); + if (ErrorCode_Success != error_code) { + return error_code; + } + + error_code = try_read_exact_length(extraction_buf, extraction_len); + return error_code; +} + +void Decompressor::reset_stream() { + if (InputType::File == m_input_type) { + m_file_reader->seek_from_begin(m_file_reader_initial_pos); + m_file_read_buffer_length = 0; + } + m_decompressed_stream_pos = 0; + init_decoder(m_decompression_stream); +} +} // namespace streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp b/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp new file mode 100644 index 000000000..996663e44 --- /dev/null +++ b/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp @@ -0,0 +1,162 @@ +#ifndef STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP +#define STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP + +// C++ standard libraries +#include +#include + +// ZLIB library +#include +#include +// Boost libraries +#include + +// Project headers +#include "../../FileReader.hpp" +#include "../../TraceableException.hpp" +#include "../Decompressor.hpp" + +namespace streaming_compression::lzma { +class Decompressor : public ::streaming_compression::Decompressor { +public: + // Types + class OperationFailed : public TraceableException { + public: + // Constructors + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + + // Methods + char const* what() const noexcept override { + return "streaming_compression::lzma::Decompressor operation failed"; + } + }; + + // Constructor + Decompressor(); + + // Destructor + ~Decompressor(); + + // Explicitly disable copy and move constructor/assignment + Decompressor(Decompressor const&) = delete; + Decompressor& operator=(Decompressor 
const&) = delete; + + // Methods implementing the ReaderInterface + /** + * Tries to read up to a given number of bytes from the decompressor + * @param buf + * @param num_bytes_to_read The number of bytes to try and read + * @param num_bytes_read The actual number of bytes read + * @return Same as FileReader::try_read if the decompressor is attached to a file + * @return ErrorCode_NotInit if the decompressor is not open + * @return ErrorCode_BadParam if buf is invalid + * @return ErrorCode_EndOfFile on EOF + * @return ErrorCode_Failure on decompression failure + * @return ErrorCode_Success on success + */ + ErrorCode try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) override; + + /** + */ + void exact_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read); + + /** + * Tries to seek from the beginning to the given position + * @param pos + * @return ErrorCode_NotInit if the decompressor is not open + * @return Same as ReaderInterface::try_read_exact_length + * @return ErrorCode_Success on success + */ + ErrorCode try_seek_from_begin(size_t pos) override; + /** + * Tries to get the current position of the read head + * @param pos Position of the read head in the file + * @return ErrorCode_NotInit if the decompressor is not open + * @return ErrorCode_Success on success + */ + ErrorCode try_get_pos(size_t& pos) override; + + // Methods implementing the Decompressor interface + void close() override; + /** + * Decompresses and copies the range of uncompressed data described by + * decompressed_stream_pos and extraction_len into extraction_buf + * @param decompressed_stream_pos + * @param extraction_buf + * @param extraction_len + * @return Same as streaming_compression::zstd::Decompressor::try_seek_from_begin + * @return Same as ReaderInterface::try_read_exact_length + */ + ErrorCode get_decompressed_stream_region( + size_t decompressed_stream_pos, + char* extraction_buf, + size_t extraction_len + ) override; + + // Methods + /*** + * 
Initialize streaming decompressor to decompress from the specified compressed data buffer + * @param compressed_data_buf + * @param compressed_data_buf_size + */ + void open(char const* compressed_data_buf, size_t compressed_data_buf_size) override; + + /*** + * Initialize streaming decompressor to decompress from a compressed file specified by the + * given path + * @param compressed_file_path + * @param decompressed_stream_block_size + * @return ErrorCode_Failure if the provided path cannot be memory mapped + * @return ErrorCode_Success on success + */ + ErrorCode open(std::string const& compressed_file_path); + + /** + * Initializes the decompressor to decompress from an open file + * @param file_reader + * @param file_read_buffer_capacity The maximum amount of data to read from a file at a time + */ + void open(FileReader& file_reader, size_t file_read_buffer_capacity) override; + +private: + // Enum class + enum class InputType { + NotInitialized, // Note: do nothing but generate an error to prevent this required + // parameter is not initialized properly + CompressedDataBuf, + MemoryMappedCompressedFile, + File + }; + + // Methods + /** + * Reset streaming decompression state so it will start decompressing from the beginning of + * the stream afterwards + */ + void reset_stream(); + + void init_decoder(lzma_stream* strm); + + // Variables + InputType m_input_type; + + // Compressed stream variables + lzma_stream* m_decompression_stream{nullptr}; + + boost::iostreams::mapped_file_source m_memory_mapped_compressed_file; + FileReader* m_file_reader; + size_t m_file_reader_initial_pos; + std::unique_ptr m_file_read_buffer; + size_t m_file_read_buffer_length; + size_t m_file_read_buffer_capacity; + + size_t m_decompressed_stream_pos; + size_t m_unused_decompressed_stream_block_size; + std::unique_ptr m_unused_decompressed_stream_block_buffer; + + char const* m_compressed_stream_block; + size_t m_compressed_stream_block_size; +}; +} // namespace 
streaming_compression::lzma +#endif // STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 0fbae9e3a..d632510fc 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -15,6 +15,7 @@ #include "../src/clp/ReadOnlyMemoryMappedFile.hpp" #include "../src/clp/streaming_compression/Compressor.hpp" #include "../src/clp/streaming_compression/Decompressor.hpp" +#include "../src/clp/streaming_compression/lzma/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Decompressor.hpp" #include "../src/clp/streaming_compression/zstd/Compressor.hpp" diff --git a/components/core/tools/scripts/lib_install/liblzma.sh b/components/core/tools/scripts/lib_install/liblzma.sh new file mode 100755 index 000000000..1145b2646 --- /dev/null +++ b/components/core/tools/scripts/lib_install/liblzma.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Dependencies: +# - curl +# - make +# - gcc +# NOTE: Dependencies should be installed outside the script to allow the script to be largely distro-agnostic + +# Exit on any error +set -e + +# Error on undefined variable +set -u + +cUsage="Usage: ${BASH_SOURCE[0]} [ <.deb output directory>]" +if [ "$#" -lt 1 ] ; then + echo $cUsage + exit +fi +version=$1 + +package_name=liblzma +temp_dir=/tmp/${package_name}-installation +deb_output_dir=${temp_dir} +if [[ "$#" -gt 1 ]] ; then + deb_output_dir="$(readlink -f "$2")" + if [ ! -d ${deb_output_dir} ] ; then + echo "${deb_output_dir} does not exist or is not a directory" + exit + fi +fi + +# Note: we won't check if the package already exists + +echo "Checking for elevated privileges..." +privileged_command_prefix="" +if [ ${EUID:-$(id -u)} -ne 0 ] ; then + sudo echo "Script can elevate privileges." 
+ privileged_command_prefix="${privileged_command_prefix} sudo" +fi + +# Get number of cpu cores +num_cpus=$(grep -c ^processor /proc/cpuinfo) + +# Download +mkdir -p $temp_dir +cd $temp_dir +extracted_dir=${temp_dir}/xz-${version} +if [ ! -e ${extracted_dir} ] ; then + tar_filename=xz-${version}.tar.gz + if [ ! -e ${tar_filename} ] ; then + curl -fsSL https://github.com/tukaani-project/xz/releases/download/v${version}/${tar_filename} -o ${tar_filename} + fi + tar -xf ${tar_filename} +fi + +# Build +cd ${extracted_dir} +mkdir build +cd build +cmake -DCMAKE_POSITION_INDEPENDENT_CODE=TRUE ../ +make -j${num_cpus} +make install liblzma + +# Clean up +rm -rf $temp_dir diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh index 1e21314cc..10a2b0482 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh @@ -14,6 +14,7 @@ lib_install_scripts_dir=$script_dir/.. 
"$lib_install_scripts_dir"/fmtlib.sh 8.0.1 "$lib_install_scripts_dir"/libarchive.sh 3.5.1 +"$lib_install_scripts_dir"/liblzma.sh 5.4.6 "$lib_install_scripts_dir"/lz4.sh 1.8.2 "$lib_install_scripts_dir"/mongocxx.sh 3.10.2 "$lib_install_scripts_dir"/msgpack.sh 7.0.0 diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh index 706674764..f1e2ee4ff 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh @@ -20,6 +20,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ libcurl4 \ libcurl4-openssl-dev \ libmariadb-dev \ + liblzma-dev \ libssl-dev \ make \ openjdk-11-jdk \ diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh index 7799c9ba5..97aaf7093 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh @@ -11,6 +11,7 @@ lib_install_scripts_dir=$script_dir/.. 
"$lib_install_scripts_dir"/fmtlib.sh 8.0.1 "$lib_install_scripts_dir"/libarchive.sh 3.5.1 +"$lib_install_scripts_dir"/liblzma.sh 5.4.6 "$lib_install_scripts_dir"/lz4.sh 1.8.2 "$lib_install_scripts_dir"/mongocxx.sh 3.10.2 "$lib_install_scripts_dir"/msgpack.sh 7.0.0 diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh index 92d965b9b..4911a6a98 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh @@ -20,6 +20,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ libcurl4 \ libcurl4-openssl-dev \ libmariadb-dev \ + liblzma-dev \ libssl-dev \ openjdk-11-jdk \ pkg-config \ From d5af274f119ad8bb2d5b3bbc9e1e97bca282be7a Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 25 Nov 2024 12:49:13 -0500 Subject: [PATCH 02/65] Make unit test pass --- .../streaming_compression/lzma/Compressor.cpp | 72 +++++++--------- .../streaming_compression/lzma/Compressor.hpp | 84 +++++++++++-------- .../streaming_compression/lzma/Constants.hpp | 12 +-- .../lzma/Decompressor.cpp | 6 +- .../lzma/Decompressor.hpp | 15 ++-- .../core/tests/test-StreamingCompression.cpp | 6 ++ 6 files changed, 103 insertions(+), 92 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index f10ec915b..7bb13e5d3 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -1,34 +1,22 @@ #include "Compressor.hpp" -// spdlog #include +// Compression libraries +#include +#include + // Project headers #include "../../Defs.h" -// File-scope constants -static constexpr size_t cCompressedStreamBlockBufferSize = 4096; // 4KiB - -namespace 
streaming_compression::lzma { +namespace clp::streaming_compression::lzma { Compressor::LzmaOption Compressor::m_option; -Compressor::Compressor() - : ::streaming_compression::Compressor(CompressorType::LZMA), - m_compression_stream_contains_data(false), - m_compressed_stream_file_writer(nullptr), - m_compression_stream(nullptr) { - m_compressed_stream_block_buffer = std::make_unique(cCompressedStreamBlockBufferSize); - m_compression_stream = new lzma_stream; - memset(m_compression_stream, 0, sizeof(lzma_stream)); -} - -Compressor::~Compressor() { - if (nullptr != m_compression_stream) { - delete m_compression_stream; - } +Compressor::Compressor() { + memset(m_compression_stream.get(), 0, sizeof(LzmaStream)); } -void Compressor::init_lzma_encoder(lzma_stream* strm) { +void Compressor::init_lzma_encoder(LzmaStream* strm) { lzma_options_lzma options; if (lzma_lzma_preset(&options, m_option.get_compression_level())) { SPDLOG_ERROR("Failed to initialize LZMA options."); @@ -44,10 +32,10 @@ void Compressor::init_lzma_encoder(lzma_stream* strm) { // to CRC64, which is the default in the xz command line tool. If // the .xz file needs to be decompressed with XZ Embedded, use // LZMA_CHECK_CRC32 instead. - lzma_ret ret = lzma_stream_encoder(strm, filters, LZMA_CHECK_CRC64); + auto const ret = lzma_stream_encoder(strm, filters, LZMA_CHECK_CRC64); // Return successfully if the initialization went fine. 
- if (ret == LZMA_OK) { + if (LZMA_OK == ret) { return; } @@ -96,12 +84,12 @@ void Compressor::open(FileWriter& file_writer, int compression_level) { m_option.set_compression_level(compression_level); } - init_lzma_encoder(m_compression_stream); + init_lzma_encoder(m_compression_stream.get()); // Setup compressed stream parameters m_compression_stream->next_in = nullptr; m_compression_stream->avail_in = 0; - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); + m_compression_stream->avail_out = m_compressed_stream_block_buffer.size(); m_compressed_stream_file_writer = &file_writer; @@ -136,7 +124,7 @@ void Compressor::write(char const* data, size_t data_length) { // Compress all data bool hit_input_eof = false; while (!hit_input_eof) { - lzma_ret return_value = lzma_code(m_compression_stream, action); + auto const return_value = lzma_code(m_compression_stream.get(), action); switch (return_value) { case LZMA_OK: case LZMA_BUF_ERROR: @@ -157,10 +145,10 @@ void Compressor::write(char const* data, size_t data_length) { // Write output buffer to file if it's full if (0 == m_compression_stream->avail_out) { m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.get()), + reinterpret_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } } @@ -168,10 +156,10 @@ void Compressor::write(char const* data, size_t data_length) { // Write any compressed data if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.get()), + 
reinterpret_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream->avail_out ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } @@ -200,7 +188,7 @@ void Compressor::flush() { bool flush_complete = false; while (true) { - lzma_ret return_value = lzma_code(m_compression_stream, LZMA_SYNC_FLUSH); + auto const return_value = lzma_code(m_compression_stream.get(), LZMA_SYNC_FLUSH); switch (return_value) { case LZMA_STREAM_END: flush_complete = true; @@ -219,10 +207,10 @@ void Compressor::flush() { // Write output buffer to file if it's full if (0 == m_compression_stream->avail_out) { m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.get()), + reinterpret_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } } @@ -230,10 +218,10 @@ void Compressor::flush() { // Write any compressed data if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.get()), + reinterpret_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream->avail_out ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } @@ -256,7 +244,7 @@ void Compressor::flush_and_close_compression_stream() { bool flush_complete = false; while (true) { - lzma_ret return_value = 
lzma_code(m_compression_stream, LZMA_FINISH); + lzma_ret return_value = lzma_code(m_compression_stream.get(), LZMA_FINISH); switch (return_value) { case LZMA_OK: case LZMA_BUF_ERROR: @@ -276,10 +264,10 @@ void Compressor::flush_and_close_compression_stream() { // Write output buffer to file if it's full if (0 == m_compression_stream->avail_out) { m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.get()), + reinterpret_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } } @@ -287,17 +275,17 @@ void Compressor::flush_and_close_compression_stream() { // Write any compressed data if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.get()), + reinterpret_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream->avail_out ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.get(); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } m_compression_stream_contains_data = false; - lzma_end(m_compression_stream); + lzma_end(m_compression_stream.get()); m_compression_stream->avail_out = 0; m_compression_stream->next_out = nullptr; } -} // namespace streaming_compression::lzma +} // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index d31c7687e..53f82b139 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ 
b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -1,22 +1,22 @@ -#ifndef STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP -#define STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP +#ifndef CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP +#define CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP -// C++ standard libraries +#include +#include #include -#include -// ZLIB library #include -#include +#include -// Project headers +#include "../../Array.hpp" +#include "../../ErrorCode.hpp" #include "../../FileWriter.hpp" #include "../../TraceableException.hpp" #include "../Compressor.hpp" #include "Constants.hpp" -namespace streaming_compression::lzma { -class Compressor : public ::streaming_compression::Compressor { +namespace clp::streaming_compression::lzma { +class Compressor : public ::clp::streaming_compression::Compressor { public: // Types class OperationFailed : public TraceableException { @@ -26,8 +26,8 @@ class Compressor : public ::streaming_compression::Compressor { : TraceableException(error_code, filename, line_number) {} // Methods - char const* what() const noexcept override { - return "streaming_compression::gzip::Compressor operation failed"; + [[nodiscard]] auto what() const noexcept -> char const* override { + return "streaming_compression::lzma::Compressor operation failed"; } }; @@ -38,10 +38,10 @@ class Compressor : public ::streaming_compression::Compressor { m_dict_size{cDefaultDictionarySize} {} auto set_compression_level(int compression_level) -> void { - if (0 > compression_level) { - m_compression_level = 0; - } else if (9 < compression_level) { - m_compression_level = 9; + if (compression_level < cMinCompressionLevel) { + m_compression_level = cMinCompressionLevel; + } else if (compression_level > cMaxCompressionLevel) { + m_compression_level = cMaxCompressionLevel; } else { m_compression_level = compression_level; } @@ -62,11 +62,15 @@ class Compressor : public ::streaming_compression::Compressor { Compressor(); // Destructor - 
~Compressor(); + ~Compressor() override = default; - // Explicitly disable copy and move constructor/assignment + // Delete copy constructor and assignment operator Compressor(Compressor const&) = delete; - Compressor& operator=(Compressor const&) = delete; + auto operator=(Compressor const&) -> Compressor& = delete; + + // Default move constructor and assignment operator + Compressor(Compressor&&) noexcept = default; + auto operator=(Compressor&&) noexcept -> Compressor& = default; // Methods implementing the WriterInterface /** @@ -74,11 +78,12 @@ class Compressor : public ::streaming_compression::Compressor { * @param data * @param data_length */ - void write(char const* data, size_t data_length) override; + auto write(char const* data, size_t data_length) -> void override; + /** * Writes any internally buffered data to file and ends the current frame */ - void flush() override; + auto flush() -> void override; /** * Tries to get the current position of the write head @@ -86,20 +91,28 @@ class Compressor : public ::streaming_compression::Compressor { * @return ErrorCode_NotInit if the compressor is not open * @return ErrorCode_Success on success */ - ErrorCode try_get_pos(size_t& pos) const override; + auto try_get_pos(size_t& pos) const -> ErrorCode override; + + /** + * Closes the compressor + */ + auto close() -> void override; // Methods implementing the Compressor interface /** - * Initialize streaming compressor + * Initializes the compression stream with the default compression level * @param file_writer - * @param compression_level */ - void open(FileWriter& file_writer, int compression_level) override; + auto open(FileWriter& file_writer) -> void override { + this->open(file_writer, cDefaultCompressionLevel); + } /** - * Closes the compressor + * Initializes the compression stream with the given compression level + * @param file_writer + * @param compression_level */ - void close() override; + auto open(FileWriter& file_writer, int compression_level) -> 
void; // Methods static auto set_compression_level(int compression_level) -> void { @@ -109,25 +122,28 @@ class Compressor : public ::streaming_compression::Compressor { static auto set_dict_size(uint32_t dict_size) -> void { m_option.set_dict_size(dict_size); } private: + using LzmaStream = lzma_stream; + /** * Flushes the stream and closes it */ void flush_and_close_compression_stream(); - static void init_lzma_encoder(lzma_stream* strm); + static void init_lzma_encoder(LzmaStream* strm); static LzmaOption m_option; + static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB // Variables - FileWriter* m_compressed_stream_file_writer; + FileWriter* m_compressed_stream_file_writer{nullptr}; // Compressed stream variables - lzma_stream* m_compression_stream; - bool m_compression_stream_contains_data; + std::unique_ptr m_compression_stream{std::make_unique()}; + bool m_compression_stream_contains_data{false}; - std::unique_ptr m_compressed_stream_block_buffer; + Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; - size_t m_uncompressed_stream_pos; + size_t m_uncompressed_stream_pos{0}; }; -} // namespace streaming_compression::lzma +} // namespace clp::streaming_compression::lzma -#endif // STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP +#endif // CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP diff --git a/components/core/src/clp/streaming_compression/lzma/Constants.hpp b/components/core/src/clp/streaming_compression/lzma/Constants.hpp index 959c09f47..4e261187a 100644 --- a/components/core/src/clp/streaming_compression/lzma/Constants.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Constants.hpp @@ -1,15 +1,15 @@ #ifndef STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP #define STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP -#include - -// C++ libraries -#include #include -namespace streaming_compression::lzma { +#include + +namespace clp::streaming_compression::lzma { constexpr int cDefaultCompressionLevel{3}; +constexpr int 
cMinCompressionLevel{0}; +constexpr int cMaxCompressionLevel{9}; constexpr uint32_t cDefaultDictionarySize{LZMA_DICT_SIZE_DEFAULT}; -} // namespace streaming_compression::lzma +} // namespace clp::streaming_compression::lzma #endif // STREAMING_COMPRESSION_LZMA_CONSTANTS_HPP diff --git a/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp b/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp index a2ed4d466..b6a10b418 100644 --- a/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp @@ -12,9 +12,9 @@ // Project headers #include "../../Defs.h" -namespace streaming_compression::lzma { +namespace clp::streaming_compression::lzma { Decompressor::Decompressor() - : ::streaming_compression::Decompressor(CompressorType::LZMA), + : ::clp::streaming_compression::Decompressor(CompressorType::LZMA), m_input_type(InputType::NotInitialized), m_decompression_stream(nullptr), m_file_reader(nullptr), @@ -359,4 +359,4 @@ void Decompressor::reset_stream() { m_decompressed_stream_pos = 0; init_decoder(m_decompression_stream); } -} // namespace streaming_compression::lzma +} // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp b/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp index 996663e44..5e90f5942 100644 --- a/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp @@ -1,13 +1,14 @@ -#ifndef STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP -#define STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP +#ifndef CLP_STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP +#define CLP_STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP // C++ standard libraries #include #include // ZLIB library -#include #include + +#include // Boost libraries #include @@ -16,8 +17,8 @@ #include "../../TraceableException.hpp" #include 
"../Decompressor.hpp" -namespace streaming_compression::lzma { -class Decompressor : public ::streaming_compression::Decompressor { +namespace clp::streaming_compression::lzma { +class Decompressor : public ::clp::streaming_compression::Decompressor { public: // Types class OperationFailed : public TraceableException { @@ -158,5 +159,5 @@ class Decompressor : public ::streaming_compression::Decompressor { char const* m_compressed_stream_block; size_t m_compressed_stream_block_size; }; -} // namespace streaming_compression::lzma -#endif // STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP +} // namespace clp::streaming_compression::lzma +#endif // CLP_STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index d632510fc..d58d4c1ce 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -16,6 +16,7 @@ #include "../src/clp/streaming_compression/Compressor.hpp" #include "../src/clp/streaming_compression/Decompressor.hpp" #include "../src/clp/streaming_compression/lzma/Compressor.hpp" +#include "../src/clp/streaming_compression/lzma/Decompressor.hpp" #include "../src/clp/streaming_compression/passthrough/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Decompressor.hpp" #include "../src/clp/streaming_compression/zstd/Compressor.hpp" @@ -56,6 +57,11 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { decompressor = std::make_unique(); } + SECTION("LZMA compression") { + compressor = std::make_unique(); + decompressor = std::make_unique(); + } + // Initialize buffers Array uncompressed_buffer{cBufferSize}; for (size_t i{0}; i < cBufferSize; ++i) { From b94ca2695d4ebaf7c79a8ac3e31b94eae1e52e16 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 26 Nov 2024 03:21:38 -0500 Subject: [PATCH 03/65] Refactor lzma compressor to group common functionalities into helplers --- 
.../streaming_compression/lzma/Compressor.cpp | 210 ++++++------------ .../streaming_compression/lzma/Compressor.hpp | 56 ++--- 2 files changed, 84 insertions(+), 182 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 7bb13e5d3..74a59ebca 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -1,38 +1,40 @@ #include "Compressor.hpp" -#include +#include +#include +#include +#include -// Compression libraries #include -#include +#include -// Project headers -#include "../../Defs.h" +#include "../../ErrorCode.hpp" +#include "../../FileWriter.hpp" +#include "../../TraceableException.hpp" +#include "../../type_utils.hpp" +#include "Constants.hpp" namespace clp::streaming_compression::lzma { -Compressor::LzmaOption Compressor::m_option; - -Compressor::Compressor() { - memset(m_compression_stream.get(), 0, sizeof(LzmaStream)); -} +using clp::size_checked_pointer_cast; -void Compressor::init_lzma_encoder(LzmaStream* strm) { - lzma_options_lzma options; - if (lzma_lzma_preset(&options, m_option.get_compression_level())) { +auto Compressor::init_lzma_encoder(LzmaStream* strm, int compression_level, size_t dict_size) + -> void { + LzmaOptionsLzma options; + if (0 != lzma_lzma_preset(&options, compression_level)) { SPDLOG_ERROR("Failed to initialize LZMA options."); throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - options.dict_size = m_option.get_dict_size(); - lzma_filter filters[2]{ - {LZMA_FILTER_LZMA2, &options}, - {LZMA_VLI_UNKNOWN, nullptr}, - }; + options.dict_size = dict_size; + std::array filters{{ + {.id = LZMA_FILTER_LZMA2, .options = &options}, + {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, + }}; // Initialize the encoder using a preset. Set the integrity to check // to CRC64, which is the default in the xz command line tool. 
If // the .xz file needs to be decompressed with XZ Embedded, use // LZMA_CHECK_CRC32 instead. - auto const ret = lzma_stream_encoder(strm, filters, LZMA_CHECK_CRC64); + auto const ret{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; // Return successfully if the initialization went fine. if (LZMA_OK == ret) { @@ -43,7 +45,7 @@ void Compressor::init_lzma_encoder(LzmaStream* strm) { // lzma/container.h (src/liblzma/api/lzma/container.h in the source // package or e.g. /usr/include/lzma/container.h depending on the // install prefix). - char const* msg; + char const* msg{nullptr}; switch (ret) { case LZMA_MEM_ERROR: msg = "Memory allocation failed"; @@ -68,23 +70,21 @@ void Compressor::init_lzma_encoder(LzmaStream* strm) { break; } - SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, int(ret)); + SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, static_cast(ret)); throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } -void Compressor::open(FileWriter& file_writer, int compression_level) { +auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { if (nullptr != m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); } - if (false == (0 <= compression_level && compression_level <= 9)) { + if (compression_level < cMinCompressionLevel || compression_level > cMaxCompressionLevel) { throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); } - if (compression_level != m_option.get_compression_level()) { - m_option.set_compression_level(compression_level); - } - init_lzma_encoder(m_compression_stream.get()); + memset(m_compression_stream.get(), 0, sizeof(LzmaStream)); + init_lzma_encoder(m_compression_stream.get(), compression_level, m_dict_size); // Setup compressed stream parameters m_compression_stream->next_in = nullptr; m_compression_stream->avail_in = 0; @@ -96,7 +96,7 @@ void Compressor::open(FileWriter& file_writer, 
int compression_level) { m_uncompressed_stream_pos = 0; } -void Compressor::close() { +auto Compressor::close() -> void { if (nullptr == m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } @@ -105,7 +105,7 @@ void Compressor::close() { m_compressed_stream_file_writer = nullptr; } -void Compressor::write(char const* data, size_t data_length) { +auto Compressor::write(char const* data, size_t data_length) -> void { if (nullptr == m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } @@ -114,54 +114,15 @@ void Compressor::write(char const* data, size_t data_length) { // Nothing needs to be done because we do not need to compress anything return; } + if (nullptr == data) { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - lzma_action action = LZMA_RUN; - m_compression_stream->next_in = reinterpret_cast(const_cast(data)); - m_compression_stream->avail_in = data_length; - - // Compress all data - bool hit_input_eof = false; - while (!hit_input_eof) { - auto const return_value = lzma_code(m_compression_stream.get(), action); - switch (return_value) { - case LZMA_OK: - case LZMA_BUF_ERROR: - break; - case LZMA_STREAM_END: - hit_input_eof = true; - break; - default: - SPDLOG_ERROR("lzma() returned an unexpected value - {}.", int(return_value)); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - if (0 == m_compression_stream->avail_in) { - // No more data to compress - break; - } + m_compression_stream->next_in = size_checked_pointer_cast(data); + m_compression_stream->avail_in = data_length; - // Write output buffer to file if it's full - if (0 == m_compression_stream->avail_out) { - m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = 
cCompressedStreamBlockBufferSize; - } - } - - // Write any compressed data - if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { - m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - m_compression_stream->avail_out - ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; - } + run_lzma(LZMA_RUN); m_compression_stream->next_in = nullptr; @@ -169,7 +130,7 @@ void Compressor::write(char const* data, size_t data_length) { m_uncompressed_stream_pos += data_length; } -void Compressor::flush() { +auto Compressor::flush() -> void { if (false == m_compression_stream_contains_data) { return; } @@ -184,51 +145,11 @@ void Compressor::flush() { // restart from this point if the previous compressed data has been damaged Z_FINISH - // Pending output flushed and deflate returns Z_STREAM_END if there was enough output space, // or Z_OK or Z_BUF_ERROR if it needs to be called again with more space - // - - bool flush_complete = false; - while (true) { - auto const return_value = lzma_code(m_compression_stream.get(), LZMA_SYNC_FLUSH); - switch (return_value) { - case LZMA_STREAM_END: - flush_complete = true; - break; - case LZMA_OK: - case LZMA_BUF_ERROR: - break; - default: - SPDLOG_ERROR("lzma() returned an unexpected value - {}.", int(return_value)); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - if (flush_complete) { - break; - } - - // Write output buffer to file if it's full - if (0 == m_compression_stream->avail_out) { - m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; - } - } - - // Write any compressed data - if 
(m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { - m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - m_compression_stream->avail_out - ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; - } - + run_lzma(LZMA_SYNC_FLUSH); m_compression_stream_contains_data = false; } -ErrorCode Compressor::try_get_pos(size_t& pos) const { +auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { if (nullptr == m_compressed_stream_file_writer) { return ErrorCode_NotInit; } @@ -237,55 +158,64 @@ ErrorCode Compressor::try_get_pos(size_t& pos) const { return ErrorCode_Success; } -void Compressor::flush_and_close_compression_stream() { +auto Compressor::flush_and_close_compression_stream() -> void { if (nullptr == m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - bool flush_complete = false; + run_lzma(LZMA_FINISH); + + m_compression_stream_contains_data = false; + + lzma_end(m_compression_stream.get()); + m_compression_stream->avail_out = 0; + m_compression_stream->next_out = nullptr; +} + +auto Compressor::run_lzma(LzmaAction action) -> void { + // Compress all data + bool hit_input_eof{false}; while (true) { - lzma_ret return_value = lzma_code(m_compression_stream.get(), LZMA_FINISH); - switch (return_value) { + auto const rc = lzma_code(m_compression_stream.get(), action); + switch (rc) { case LZMA_OK: case LZMA_BUF_ERROR: break; case LZMA_STREAM_END: - flush_complete = true; + hit_input_eof = true; break; default: - // SPDLOG_ERROR("deflate() returned an unexpected value - - // {}.", return_value); + SPDLOG_ERROR("lzma() returned an unexpected value - {}.", static_cast(rc)); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - if (flush_complete) { + + if (LZMA_RUN == action && 0 == 
m_compression_stream->avail_in) { + // No more data to compress + break; + } + + if (hit_input_eof) { break; } // Write output buffer to file if it's full if (0 == m_compression_stream->avail_out) { - m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + write_data(); } } // Write any compressed data if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { - m_compressed_stream_file_writer->write( - reinterpret_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - m_compression_stream->avail_out - ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + write_data(); } +} - m_compression_stream_contains_data = false; - - lzma_end(m_compression_stream.get()); - m_compression_stream->avail_out = 0; - m_compression_stream->next_out = nullptr; +auto Compressor::write_data() -> void { + m_compressed_stream_file_writer->write( + size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), + cCompressedStreamBlockBufferSize - m_compression_stream->avail_out + ); + m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); + m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; } } // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 53f82b139..f6c6b4963 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -1,12 +1,12 @@ #ifndef CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP #define CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP 
-#include +#include + #include #include #include -#include #include "../../Array.hpp" #include "../../ErrorCode.hpp" @@ -31,35 +31,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { } }; - class LzmaOption { - public: - LzmaOption() - : m_compression_level{cDefaultCompressionLevel}, - m_dict_size{cDefaultDictionarySize} {} - - auto set_compression_level(int compression_level) -> void { - if (compression_level < cMinCompressionLevel) { - m_compression_level = cMinCompressionLevel; - } else if (compression_level > cMaxCompressionLevel) { - m_compression_level = cMaxCompressionLevel; - } else { - m_compression_level = compression_level; - } - } - - auto set_dict_size(uint32_t dict_size) -> void { m_dict_size = dict_size; } - - [[nodiscard]] auto get_compression_level() const -> int { return m_compression_level; } - - [[nodiscard]] auto get_dict_size() const -> uint32_t { return m_dict_size; } - - private: - int m_compression_level; - uint32_t m_dict_size; - }; - // Constructor - Compressor(); + Compressor() = default; // Destructor ~Compressor() override = default; @@ -114,24 +87,22 @@ class Compressor : public ::clp::streaming_compression::Compressor { */ auto open(FileWriter& file_writer, int compression_level) -> void; - // Methods - static auto set_compression_level(int compression_level) -> void { - m_option.set_compression_level(compression_level); - } - - static auto set_dict_size(uint32_t dict_size) -> void { m_option.set_dict_size(dict_size); } - private: + using LzmaAction = lzma_action; + using LzmaFilter = lzma_filter; + using LzmaOptionsLzma = lzma_options_lzma; using LzmaStream = lzma_stream; + static auto + init_lzma_encoder(LzmaStream* strm, int compression_level, size_t dict_size) -> void; + static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB + /** * Flushes the stream and closes it */ - void flush_and_close_compression_stream(); - - static void init_lzma_encoder(LzmaStream* strm); - static LzmaOption 
m_option; - static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB + auto flush_and_close_compression_stream() -> void; + auto write_data() -> void; + auto run_lzma(lzma_action action) -> void; // Variables FileWriter* m_compressed_stream_file_writer{nullptr}; @@ -139,6 +110,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { // Compressed stream variables std::unique_ptr m_compression_stream{std::make_unique()}; bool m_compression_stream_contains_data{false}; + size_t m_dict_size{cDefaultDictionarySize}; Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; From 707c41219a5e2ad91ccbf01b91df973e9856ef6d Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 27 Nov 2024 01:14:54 -0500 Subject: [PATCH 04/65] Improve comments --- .../streaming_compression/lzma/Compressor.cpp | 35 ++++++++----------- .../streaming_compression/lzma/Compressor.hpp | 25 +++++++++++-- 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 74a59ebca..6f6b5b4cf 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -21,7 +21,7 @@ auto Compressor::init_lzma_encoder(LzmaStream* strm, int compression_level, size -> void { LzmaOptionsLzma options; if (0 != lzma_lzma_preset(&options, compression_level)) { - SPDLOG_ERROR("Failed to initialize LZMA options."); + SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } options.dict_size = dict_size; @@ -122,7 +122,9 @@ auto Compressor::write(char const* data, size_t data_length) -> void { m_compression_stream->next_in = size_checked_pointer_cast(data); m_compression_stream->avail_in = data_length; - run_lzma(LZMA_RUN); + // Normal compression encoding workflow. 
Continue until the input buffer is + // exhausted. + compress(LZMA_RUN); m_compression_stream->next_in = nullptr; @@ -134,18 +136,9 @@ auto Compressor::flush() -> void { if (false == m_compression_stream_contains_data) { return; } - // Z_NO_FLUSH - deflate decides how much data to accumulate before producing output - // Z_SYNC_FLUSH - All pending output flushed to output buf and output aligned to byte - // boundary (completes current block and follows it with empty block that is 3 bits plus - // filler to next byte, followed by 4 bytes Z_PARTIAL_FLUSH - Same as Z_SYNC_FLUSH but - // output not aligned to byte boundary (completes current block and follows it with empty - // fixed codes block that is 10 bits long) Z_BLOCK - Same as Z_SYNC_FLUSH but output not - // aligned on a byte boundary and up to 7 bits of current block held to be written - // Z_FULL_FLUSH - Same as Z_SYNC_FLUSH but compression state reset so that decompression can - // restart from this point if the previous compressed data has been damaged Z_FINISH - - // Pending output flushed and deflate returns Z_STREAM_END if there was enough output space, - // or Z_OK or Z_BUF_ERROR if it needs to be called again with more space - run_lzma(LZMA_SYNC_FLUSH); + + // Forces all the buffered data to be available at output + compress(LZMA_SYNC_FLUSH); m_compression_stream_contains_data = false; } @@ -163,7 +156,8 @@ auto Compressor::flush_and_close_compression_stream() -> void { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - run_lzma(LZMA_FINISH); + // Same as flush but all the input data must have been given to the encoder + compress(LZMA_FINISH); m_compression_stream_contains_data = false; @@ -172,8 +166,7 @@ auto Compressor::flush_and_close_compression_stream() -> void { m_compression_stream->next_out = nullptr; } -auto Compressor::run_lzma(LzmaAction action) -> void { - // Compress all data +auto Compressor::compress(LzmaAction action) -> void { bool hit_input_eof{false}; while (true) 
{ auto const rc = lzma_code(m_compression_stream.get(), action); @@ -200,17 +193,17 @@ auto Compressor::run_lzma(LzmaAction action) -> void { // Write output buffer to file if it's full if (0 == m_compression_stream->avail_out) { - write_data(); + pipe_data(); } } - // Write any compressed data + // Write remaining compressed data if (m_compression_stream->avail_out < cCompressedStreamBlockBufferSize) { - write_data(); + pipe_data(); } } -auto Compressor::write_data() -> void { +auto Compressor::pipe_data() -> void { m_compressed_stream_file_writer->write( size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream->avail_out diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index f6c6b4963..03f32a186 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -93,6 +93,13 @@ class Compressor : public ::clp::streaming_compression::Compressor { using LzmaOptionsLzma = lzma_options_lzma; using LzmaStream = lzma_stream; + /** + * Initialize the Lzma compression stream + * @param strm A pre-allocated `lzma_stream` object + * @param compression_level + * @param dict_size Dictionary size that indicates how many bytes of the + * recently processed uncompressed data is kept in memory + */ static auto init_lzma_encoder(LzmaStream* strm, int compression_level, size_t dict_size) -> void; static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB @@ -101,8 +108,22 @@ class Compressor : public ::clp::streaming_compression::Compressor { * Flushes the stream and closes it */ auto flush_and_close_compression_stream() -> void; - auto write_data() -> void; - auto run_lzma(lzma_action action) -> void; + + /** + * Repeatedly invoke lzma_code() compression workflow until LZMA_STREAM_END + * is reached. 
+ * The workflow action needs to be kept the same throughout this process. + * See also: https://github.com/frida/xz/blob/main/src/liblzma/api/lzma/base.h#L246 + * + * @param action + */ + auto compress(lzma_action action) -> void; + + /** + * Pipes the current compressed data in the lzma buffer to the output file + * and reset the compression buffer to receive new data. + */ + auto pipe_data() -> void; // Variables FileWriter* m_compressed_stream_file_writer{nullptr}; From 6d1ab8fa907632a9af6001f9075404fc09708633 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 27 Nov 2024 11:08:09 -0500 Subject: [PATCH 05/65] Fix reference link --- .../core/src/clp/streaming_compression/lzma/Compressor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 03f32a186..80052e50c 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -113,7 +113,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { * Repeatedly invoke lzma_code() compression workflow until LZMA_STREAM_END * is reached. * The workflow action needs to be kept the same throughout this process. 
- * See also: https://github.com/frida/xz/blob/main/src/liblzma/api/lzma/base.h#L246 + * See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 * * @param action */ From 89b57074a7851d66310bb32b3031da566a3902f4 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 27 Nov 2024 11:43:56 -0500 Subject: [PATCH 06/65] Add install for CentOS --- .../lib_install/centos-stream-9/install-prebuilt-packages.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh index e90f54733..eede5e004 100755 --- a/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh @@ -16,4 +16,5 @@ dnf install -y \ libzstd-devel \ make \ mariadb-connector-c-devel \ - openssl-devel + openssl-devel \ + xz-devel From c646cea6325763dbebc23c790d87445a7c0c8ecd Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 27 Nov 2024 14:04:31 -0500 Subject: [PATCH 07/65] Apply coderabbit suggestions --- components/core/CMakeLists.txt | 6 +-- .../streaming_compression/lzma/Compressor.cpp | 44 +++++++++---------- .../streaming_compression/lzma/Compressor.hpp | 6 +-- .../core/tools/scripts/lib_install/liblzma.sh | 24 +++++----- 4 files changed, 39 insertions(+), 41 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 92bb6af19..56156c131 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -232,13 +232,13 @@ endif() # TODO: add a script in ./cmake/Modules to resolve .a vs. 
.so find_package(LibLZMA REQUIRED) if(LIBLZMA_FOUND) - message(STATUS "Found LIBLZMA_FOUND ${LIBLZMA_VERSION_STRING}") + message(STATUS "Found Lzma ${LIBLZMA_VERSION_STRING}") message(STATUS "Lzma library location: ${LIBLZMA_LIBRARIES}") else() - message(FATAL_ERROR "Could not find ${CLP_LIBS_STRING} libraries for LIBLZMA_FOUND") + message(FATAL_ERROR "Could not find ${CLP_LIBS_STRING} libraries for Lzma") endif() include_directories(${LIBLZMA_INCLUDE_DIRS}) -message("LZMA Include Dir: ${LIBLZMA_INCLUDE_DIRS}") +message("Lzma Include Dir: ${LIBLZMA_INCLUDE_DIRS}") # sqlite dependencies set(sqlite_DYNAMIC_LIBS "dl;m;pthread") diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 6f6b5b4cf..c7b46cd6c 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -83,13 +83,13 @@ auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); } - memset(m_compression_stream.get(), 0, sizeof(LzmaStream)); - init_lzma_encoder(m_compression_stream.get(), compression_level, m_dict_size); + m_compression_stream = LZMA_STREAM_INIT; + init_lzma_encoder(&m_compression_stream, compression_level, m_dict_size); // Setup compressed stream parameters - m_compression_stream->next_in = nullptr; - m_compression_stream->avail_in = 0; - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = m_compressed_stream_block_buffer.size(); + m_compression_stream.next_in = nullptr; + m_compression_stream.avail_in = 0; + m_compression_stream.next_out = m_compressed_stream_block_buffer.data(); + m_compression_stream.avail_out = m_compressed_stream_block_buffer.size(); m_compressed_stream_file_writer = &file_writer; @@ -119,14 +119,14 @@ auto Compressor::write(char const* 
data, size_t data_length) -> void { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - m_compression_stream->next_in = size_checked_pointer_cast(data); - m_compression_stream->avail_in = data_length; + m_compression_stream.next_in = size_checked_pointer_cast(data); + m_compression_stream.avail_in = data_length; // Normal compression encoding workflow. Continue until the input buffer is // exhausted. compress(LZMA_RUN); - m_compression_stream->next_in = nullptr; + m_compression_stream.next_in = nullptr; m_compression_stream_contains_data = true; m_uncompressed_stream_pos += data_length; @@ -161,44 +161,44 @@ auto Compressor::flush_and_close_compression_stream() -> void { m_compression_stream_contains_data = false; - lzma_end(m_compression_stream.get()); - m_compression_stream->avail_out = 0; - m_compression_stream->next_out = nullptr; + lzma_end(&m_compression_stream); + m_compression_stream.avail_out = 0; + m_compression_stream.next_out = nullptr; } auto Compressor::compress(LzmaAction action) -> void { - bool hit_input_eof{false}; + bool hit_stream_end{false}; while (true) { - auto const rc = lzma_code(m_compression_stream.get(), action); + auto const rc = lzma_code(&m_compression_stream, action); switch (rc) { case LZMA_OK: case LZMA_BUF_ERROR: break; case LZMA_STREAM_END: - hit_input_eof = true; + hit_stream_end = true; break; default: SPDLOG_ERROR("lzma() returned an unexpected value - {}.", static_cast(rc)); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - if (LZMA_RUN == action && 0 == m_compression_stream->avail_in) { + if (LZMA_RUN == action && 0 == m_compression_stream.avail_in) { // No more data to compress break; } - if (hit_input_eof) { + if (hit_stream_end) { break; } // Write output buffer to file if it's full - if (0 == m_compression_stream->avail_out) { + if (0 == m_compression_stream.avail_out) { pipe_data(); } } // Write remaining compressed data - if (m_compression_stream->avail_out < 
cCompressedStreamBlockBufferSize) { + if (m_compression_stream.avail_out < cCompressedStreamBlockBufferSize) { pipe_data(); } } @@ -206,9 +206,9 @@ auto Compressor::compress(LzmaAction action) -> void { auto Compressor::pipe_data() -> void { m_compressed_stream_file_writer->write( size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - m_compression_stream->avail_out + cCompressedStreamBlockBufferSize - m_compression_stream.avail_out ); - m_compression_stream->next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream->avail_out = cCompressedStreamBlockBufferSize; + m_compression_stream.next_out = m_compressed_stream_block_buffer.data(); + m_compression_stream.avail_out = cCompressedStreamBlockBufferSize; } } // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 80052e50c..d10810e88 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -1,8 +1,6 @@ #ifndef CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP #define CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP -#include - #include #include @@ -129,11 +127,11 @@ class Compressor : public ::clp::streaming_compression::Compressor { FileWriter* m_compressed_stream_file_writer{nullptr}; // Compressed stream variables - std::unique_ptr m_compression_stream{std::make_unique()}; + LzmaStream m_compression_stream; bool m_compression_stream_contains_data{false}; size_t m_dict_size{cDefaultDictionarySize}; - Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; + Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; size_t m_uncompressed_stream_pos{0}; }; diff --git a/components/core/tools/scripts/lib_install/liblzma.sh b/components/core/tools/scripts/lib_install/liblzma.sh index 
1145b2646..28766eced 100755 --- a/components/core/tools/scripts/lib_install/liblzma.sh +++ b/components/core/tools/scripts/lib_install/liblzma.sh @@ -1,16 +1,23 @@ #!/bin/bash +# Exit on any error +set -e + +# Error on undefined variable +set -u + # Dependencies: # - curl # - make # - gcc # NOTE: Dependencies should be installed outside the script to allow the script to be largely distro-agnostic -# Exit on any error -set -e - -# Error on undefined variable -set -u +for cmd in curl make gcc; do + if ! $cmd --version >/dev/null 2>&1; then + echo "Error: Required dependency '$cmd' not found" + exit 1 + fi +done cUsage="Usage: ${BASH_SOURCE[0]} [ <.deb output directory>]" if [ "$#" -lt 1 ] ; then @@ -32,13 +39,6 @@ fi # Note: we won't check if the package already exists -echo "Checking for elevated privileges..." -privileged_command_prefix="" -if [ ${EUID:-$(id -u)} -ne 0 ] ; then - sudo echo "Script can elevate privileges." - privileged_command_prefix="${privileged_command_prefix} sudo" -fi - # Get number of cpu cores num_cpus=$(grep -c ^processor /proc/cpuinfo) From c91e5fb90752c0d89190b88ce45cafeab4e163a6 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 27 Nov 2024 14:21:10 -0500 Subject: [PATCH 08/65] Remove decompressor related files --- components/core/CMakeLists.txt | 2 - .../lzma/Decompressor.cpp | 362 ------------------ .../lzma/Decompressor.hpp | 163 -------- .../core/tests/test-StreamingCompression.cpp | 7 +- 4 files changed, 5 insertions(+), 529 deletions(-) delete mode 100644 components/core/src/clp/streaming_compression/lzma/Decompressor.cpp delete mode 100644 components/core/src/clp/streaming_compression/lzma/Decompressor.hpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 56156c131..312c6e2ef 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -480,8 +480,6 @@ set(SOURCE_FILES_unitTest src/clp/streaming_compression/Decompressor.hpp src/clp/streaming_compression/lzma/Compressor.cpp 
src/clp/streaming_compression/lzma/Compressor.hpp - src/clp/streaming_compression/lzma/Decompressor.cpp - src/clp/streaming_compression/lzma/Decompressor.hpp src/clp/streaming_compression/lzma/Constants.hpp src/clp/streaming_compression/passthrough/Compressor.cpp src/clp/streaming_compression/passthrough/Compressor.hpp diff --git a/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp b/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp deleted file mode 100644 index b6a10b418..000000000 --- a/components/core/src/clp/streaming_compression/lzma/Decompressor.cpp +++ /dev/null @@ -1,362 +0,0 @@ -#include "Decompressor.hpp" - -// C++ Standard Libraries -#include - -// Boost libraries -#include - -// spdlog -#include - -// Project headers -#include "../../Defs.h" - -namespace clp::streaming_compression::lzma { -Decompressor::Decompressor() - : ::clp::streaming_compression::Decompressor(CompressorType::LZMA), - m_input_type(InputType::NotInitialized), - m_decompression_stream(nullptr), - m_file_reader(nullptr), - m_file_reader_initial_pos(0), - m_file_read_buffer_length(0), - m_file_read_buffer_capacity(0), - m_decompressed_stream_pos(0), - m_unused_decompressed_stream_block_size(0) { - // Create block to hold unused decompressed data - m_unused_decompressed_stream_block_buffer - = std::make_unique(m_unused_decompressed_stream_block_size); - m_decompression_stream = new lzma_stream; - memset(m_decompression_stream, 0, sizeof(lzma_stream)); -} - -Decompressor::~Decompressor() { - delete m_decompression_stream; -} - -void Decompressor::exact_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { - auto errorcode = try_read(buf, num_bytes_to_read, num_bytes_read); - if (num_bytes_read != num_bytes_to_read) { - SPDLOG_ERROR("FAILED TO READ EXACTLY {} bytes", num_bytes_to_read); - throw; - } - if (errorcode != ErrorCode_Success) { - SPDLOG_ERROR("FAILED TO READ EXACTLY {} bytes", num_bytes_to_read); - throw; - } -} - -ErrorCode 
Decompressor::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) { - if (InputType::NotInitialized == m_input_type) { - return ErrorCode_NotInit; - } - if (nullptr == buf) { - return ErrorCode_BadParam; - } - if (0 == num_bytes_to_read) { - return ErrorCode_Success; - } - - num_bytes_read = 0; - - m_decompression_stream->next_out = reinterpret_cast(buf); - m_decompression_stream->avail_out = num_bytes_to_read; - while (true) { - // Check if there's data that can be decompressed - if (0 == m_decompression_stream->avail_in) { - if (InputType::File != m_input_type) { - // if we hit here, there must be something wrong - // we have consumed all data buffer but for some reason it still requires more. - return ErrorCode_EndOfFile; - } else { - auto error_code = m_file_reader->try_read( - m_file_read_buffer.get(), - m_file_read_buffer_capacity, - m_file_read_buffer_length - ); - m_decompression_stream->avail_in = m_file_read_buffer_length; - m_decompression_stream->next_in - = reinterpret_cast(m_file_read_buffer.get()); - if (ErrorCode_Success != error_code) { - if (ErrorCode_EndOfFile == error_code) { - num_bytes_read = num_bytes_to_read - m_decompression_stream->avail_out; - m_decompressed_stream_pos += num_bytes_read; - return ErrorCode_EndOfFile; - } - } - } - } - - lzma_ret return_value = lzma_code(m_decompression_stream, LZMA_RUN); - switch (return_value) { - case LZMA_OK: - case LZMA_BUF_ERROR: - if (0 == m_decompression_stream->avail_out) { - m_decompression_stream->next_out = nullptr; - num_bytes_read = num_bytes_to_read; - m_decompressed_stream_pos += num_bytes_read; - return ErrorCode_Success; - } - // by breaking here, enter the next iteration of decompressing - break; - case LZMA_STREAM_END: - if (0 == m_decompression_stream->avail_out) { - m_decompression_stream->next_out = nullptr; - num_bytes_read = num_bytes_to_read; - m_decompressed_stream_pos += num_bytes_read; - return ErrorCode_Success; - } - 
SPDLOG_ERROR("streaming_compression::lzma::Decompressor wants to read more but " - "reached end of file"); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - case LZMA_MEM_ERROR: - SPDLOG_ERROR("streaming_compression::lzma::Decompressor inflate() ran out of memory" - ); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - default: - SPDLOG_ERROR("inflate() returned an unexpected value - {}.", int(return_value)); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - } -} - -ErrorCode Decompressor::try_seek_from_begin(size_t pos) { - if (InputType::NotInitialized == m_input_type) { - throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); - } - - // Check if we've already decompressed passed the desired position - if (m_decompressed_stream_pos > pos) { - // ZStd has no way for us to seek back to the desired position, so just reset the stream - // to the beginning - reset_stream(); - } - - // We need to fast-forward the decompression stream to decompressed_stream_pos - ErrorCode error; - while (m_decompressed_stream_pos < pos) { - size_t num_bytes_to_decompress = std::min( - m_unused_decompressed_stream_block_size, - pos - m_decompressed_stream_pos - ); - error = try_read_exact_length( - m_unused_decompressed_stream_block_buffer.get(), - num_bytes_to_decompress - ); - if (ErrorCode_Success != error) { - return error; - } - } - - return ErrorCode_Success; -} - -ErrorCode Decompressor::try_get_pos(size_t& pos) { - if (InputType::NotInitialized == m_input_type) { - return ErrorCode_NotInit; - } - - pos = m_decompressed_stream_pos; - return ErrorCode_Success; -} - -void Decompressor::close() { - if (InputType::NotInitialized == m_input_type) { - return; - } - lzma_end(m_decompression_stream); - m_decompression_stream->avail_out = 0; - m_decompression_stream->next_out = nullptr; - if (InputType::MemoryMappedCompressedFile == m_input_type) { - if (m_memory_mapped_compressed_file.is_open()) { - // An existing 
file is memory mapped by the decompressor - m_memory_mapped_compressed_file.close(); - } - } else if (InputType::File == m_input_type) { - m_file_read_buffer.reset(); - m_file_read_buffer_capacity = 0; - m_file_read_buffer_length = 0; - m_file_reader = nullptr; - } - m_input_type = InputType::NotInitialized; -} - -void Decompressor::init_decoder(lzma_stream* strm) { - // Initialize a .xz decoder. The decoder supports a memory usage limit - // and a set of flags. - // - // The memory usage of the decompressor depends on the settings used - // to compress a .xz file. It can vary from less than a megabyte to - // a few gigabytes, but in practice (at least for now) it rarely - // exceeds 65 MiB because that's how much memory is required to - // decompress files created with "xz -9". Settings requiring more - // memory take extra effort to use and don't (at least for now) - // provide significantly better compression in most cases. - // - // Memory usage limit is useful if it is important that the - // decompressor won't consume gigabytes of memory. The need - // for limiting depends on the application. In this example, - // no memory usage limiting is used. This is done by setting - // the limit to UINT64_MAX. - // - // The .xz format allows concatenating compressed files as is: - // - // echo foo | xz > foobar.xz - // echo bar | xz >> foobar.xz - // - // When decompressing normal standalone .xz files, LZMA_CONCATENATED - // should always be used to support decompression of concatenated - // .xz files. If LZMA_CONCATENATED isn't used, the decoder will stop - // after the first .xz stream. This can be useful when .xz data has - // been embedded inside another file format. - // - // Flags other than LZMA_CONCATENATED are supported too, and can - // be combined with bitwise-or. See lzma/container.h - // (src/liblzma/api/lzma/container.h in the source package or e.g. - // /usr/include/lzma/container.h depending on the install prefix) - // for details. 
- lzma_ret ret = lzma_stream_decoder(strm, UINT64_MAX, LZMA_CONCATENATED); - - // Return successfully if the initialization went fine. - if (ret == LZMA_OK) { - return; - } - - // Something went wrong. The possible errors are documented in - // lzma/container.h (src/liblzma/api/lzma/container.h in the source - // package or e.g. /usr/include/lzma/container.h depending on the - // install prefix). - // - // Note that LZMA_MEMLIMIT_ERROR is never possible here. If you - // specify a very tiny limit, the error will be delayed until - // the first headers have been parsed by a call to lzma_code(). - char const* msg; - switch (ret) { - case LZMA_MEM_ERROR: - msg = "Memory allocation failed"; - break; - - case LZMA_OPTIONS_ERROR: - msg = "Unsupported decompressor flags"; - break; - - default: - // This is most likely LZMA_PROG_ERROR indicating a bug in - // this program or in liblzma. It is inconvenient to have a - // separate error message for errors that should be impossible - // to occur, but knowing the error code is important for - // debugging. That's why it is good to print the error code - // at least when there is no good error message to show. 
- msg = "Unknown error, possibly a bug"; - break; - } - - SPDLOG_ERROR("Error initializing the decoder: {} (error code {})", msg, int(ret)); -} - -void Decompressor::open(char const* compressed_data_buf, size_t compressed_data_buf_size) { - if (InputType::NotInitialized != m_input_type) { - throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); - } - m_input_type = InputType::CompressedDataBuf; - - // Configure input stream - reset_stream(); - m_decompression_stream->next_in - = reinterpret_cast(const_cast(compressed_data_buf)); - m_decompression_stream->avail_in = compressed_data_buf_size; - m_decompression_stream->next_out = nullptr; - m_decompression_stream->avail_out = 0; -} - -ErrorCode Decompressor::open(std::string const& compressed_file_path) { - if (InputType::NotInitialized != m_input_type) { - throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); - } - m_input_type = InputType::MemoryMappedCompressedFile; - - // Create memory mapping for compressed_file_path, use boost read only memory mapped file - boost::system::error_code boost_error_code; - size_t compressed_file_size - = boost::filesystem::file_size(compressed_file_path, boost_error_code); - if (boost_error_code) { - SPDLOG_ERROR( - "streaming_compression::zstd::Decompressor: Unable to obtain file size for " - "'{}' - {}.", - compressed_file_path.c_str(), - boost_error_code.message().c_str() - ); - return ErrorCode_Failure; - } - - boost::iostreams::mapped_file_params memory_map_params; - memory_map_params.path = compressed_file_path; - memory_map_params.flags = boost::iostreams::mapped_file::readonly; - memory_map_params.length = compressed_file_size; - memory_map_params.hint = m_memory_mapped_compressed_file.data( - ); // Try to map it to the same memory location as previous memory mapped file - m_memory_mapped_compressed_file.open(memory_map_params); - if (!m_memory_mapped_compressed_file.is_open()) { - SPDLOG_ERROR( - "streaming_compression::lzma::Decompressor: Unable to 
memory map the " - "compressed file with path: {}", - compressed_file_path.c_str() - ); - return ErrorCode_Failure; - } - - // Configure input stream - reset_stream(); - m_decompression_stream->next_in - = reinterpret_cast(const_cast(m_memory_mapped_compressed_file.data())); - m_decompression_stream->avail_in = compressed_file_size; - m_decompression_stream->next_out = nullptr; - m_decompression_stream->avail_out = 0; - - return ErrorCode_Success; -} - -void Decompressor::open(FileReader& file_reader, size_t file_read_buffer_capacity) { - if (InputType::NotInitialized != m_input_type) { - throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); - } - m_input_type = InputType::File; - - m_file_reader = &file_reader; - m_file_reader_initial_pos = m_file_reader->get_pos(); - - m_file_read_buffer_capacity = file_read_buffer_capacity; - m_file_read_buffer = std::make_unique(m_file_read_buffer_capacity); - m_file_read_buffer_length = 0; - - // Configure input stream - reset_stream(); - m_decompression_stream->next_in = reinterpret_cast(m_file_read_buffer.get()); - m_decompression_stream->avail_in = m_file_read_buffer_length; - m_decompression_stream->next_out = nullptr; - m_decompression_stream->avail_out = 0; -} - -ErrorCode Decompressor::get_decompressed_stream_region( - size_t decompressed_stream_pos, - char* extraction_buf, - size_t extraction_len -) { - auto error_code = try_seek_from_begin(decompressed_stream_pos); - if (ErrorCode_Success != error_code) { - return error_code; - } - - error_code = try_read_exact_length(extraction_buf, extraction_len); - return error_code; -} - -void Decompressor::reset_stream() { - if (InputType::File == m_input_type) { - m_file_reader->seek_from_begin(m_file_reader_initial_pos); - m_file_read_buffer_length = 0; - } - m_decompressed_stream_pos = 0; - init_decoder(m_decompression_stream); -} -} // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp 
b/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp deleted file mode 100644 index 5e90f5942..000000000 --- a/components/core/src/clp/streaming_compression/lzma/Decompressor.hpp +++ /dev/null @@ -1,163 +0,0 @@ -#ifndef CLP_STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP -#define CLP_STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP - -// C++ standard libraries -#include -#include - -// ZLIB library -#include - -#include -// Boost libraries -#include - -// Project headers -#include "../../FileReader.hpp" -#include "../../TraceableException.hpp" -#include "../Decompressor.hpp" - -namespace clp::streaming_compression::lzma { -class Decompressor : public ::clp::streaming_compression::Decompressor { -public: - // Types - class OperationFailed : public TraceableException { - public: - // Constructors - OperationFailed(ErrorCode error_code, char const* const filename, int line_number) - : TraceableException(error_code, filename, line_number) {} - - // Methods - char const* what() const noexcept override { - return "streaming_compression::lzma::Decompressor operation failed"; - } - }; - - // Constructor - Decompressor(); - - // Destructor - ~Decompressor(); - - // Explicitly disable copy and move constructor/assignment - Decompressor(Decompressor const&) = delete; - Decompressor& operator=(Decompressor const&) = delete; - - // Methods implementing the ReaderInterface - /** - * Tries to read up to a given number of bytes from the decompressor - * @param buf - * @param num_bytes_to_read The number of bytes to try and read - * @param num_bytes_read The actual number of bytes read - * @return Same as FileReader::try_read if the decompressor is attached to a file - * @return ErrorCode_NotInit if the decompressor is not open - * @return ErrorCode_BadParam if buf is invalid - * @return ErrorCode_EndOfFile on EOF - * @return ErrorCode_Failure on decompression failure - * @return ErrorCode_Success on success - */ - ErrorCode try_read(char* buf, size_t num_bytes_to_read, 
size_t& num_bytes_read) override; - - /** - */ - void exact_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read); - - /** - * Tries to seek from the beginning to the given position - * @param pos - * @return ErrorCode_NotInit if the decompressor is not open - * @return Same as ReaderInterface::try_read_exact_length - * @return ErrorCode_Success on success - */ - ErrorCode try_seek_from_begin(size_t pos) override; - /** - * Tries to get the current position of the read head - * @param pos Position of the read head in the file - * @return ErrorCode_NotInit if the decompressor is not open - * @return ErrorCode_Success on success - */ - ErrorCode try_get_pos(size_t& pos) override; - - // Methods implementing the Decompressor interface - void close() override; - /** - * Decompresses and copies the range of uncompressed data described by - * decompressed_stream_pos and extraction_len into extraction_buf - * @param decompressed_stream_pos - * @param extraction_buf - * @param extraction_len - * @return Same as streaming_compression::zstd::Decompressor::try_seek_from_begin - * @return Same as ReaderInterface::try_read_exact_length - */ - ErrorCode get_decompressed_stream_region( - size_t decompressed_stream_pos, - char* extraction_buf, - size_t extraction_len - ) override; - - // Methods - /*** - * Initialize streaming decompressor to decompress from the specified compressed data buffer - * @param compressed_data_buf - * @param compressed_data_buf_size - */ - void open(char const* compressed_data_buf, size_t compressed_data_buf_size) override; - - /*** - * Initialize streaming decompressor to decompress from a compressed file specified by the - * given path - * @param compressed_file_path - * @param decompressed_stream_block_size - * @return ErrorCode_Failure if the provided path cannot be memory mapped - * @return ErrorCode_Success on success - */ - ErrorCode open(std::string const& compressed_file_path); - - /** - * Initializes the decompressor to decompress 
from an open file - * @param file_reader - * @param file_read_buffer_capacity The maximum amount of data to read from a file at a time - */ - void open(FileReader& file_reader, size_t file_read_buffer_capacity) override; - -private: - // Enum class - enum class InputType { - NotInitialized, // Note: do nothing but generate an error to prevent this required - // parameter is not initialized properly - CompressedDataBuf, - MemoryMappedCompressedFile, - File - }; - - // Methods - /** - * Reset streaming decompression state so it will start decompressing from the beginning of - * the stream afterwards - */ - void reset_stream(); - - void init_decoder(lzma_stream* strm); - - // Variables - InputType m_input_type; - - // Compressed stream variables - lzma_stream* m_decompression_stream{nullptr}; - - boost::iostreams::mapped_file_source m_memory_mapped_compressed_file; - FileReader* m_file_reader; - size_t m_file_reader_initial_pos; - std::unique_ptr m_file_read_buffer; - size_t m_file_read_buffer_length; - size_t m_file_read_buffer_capacity; - - size_t m_decompressed_stream_pos; - size_t m_unused_decompressed_stream_block_size; - std::unique_ptr m_unused_decompressed_stream_block_buffer; - - char const* m_compressed_stream_block; - size_t m_compressed_stream_block_size; -}; -} // namespace clp::streaming_compression::lzma -#endif // CLP_STREAMING_COMPRESSION_LZMA_DECOMPRESSOR_HPP diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index d58d4c1ce..6dac8ba52 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -16,7 +16,6 @@ #include "../src/clp/streaming_compression/Compressor.hpp" #include "../src/clp/streaming_compression/Decompressor.hpp" #include "../src/clp/streaming_compression/lzma/Compressor.hpp" -#include "../src/clp/streaming_compression/lzma/Decompressor.hpp" #include 
"../src/clp/streaming_compression/passthrough/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Decompressor.hpp" #include "../src/clp/streaming_compression/zstd/Compressor.hpp" @@ -59,7 +58,6 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { SECTION("LZMA compression") { compressor = std::make_unique(); - decompressor = std::make_unique(); } // Initialize buffers @@ -81,6 +79,11 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { file_writer.close(); // Decompress and compare + if (nullptr == decompressor) { + boost::filesystem::remove(compressed_file_path); + return; + } + clp::ReadOnlyMemoryMappedFile const memory_mapped_compressed_file{compressed_file_path}; auto const compressed_file_view{memory_mapped_compressed_file.get_view()}; decompressor->open(compressed_file_view.data(), compressed_file_view.size()); From 26b06638740d15c5657de301138d46977da25203 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Sat, 30 Nov 2024 02:11:46 -0500 Subject: [PATCH 09/65] Address review concerns --- .../streaming_compression/lzma/Compressor.cpp | 99 +++++++++---------- .../streaming_compression/lzma/Compressor.hpp | 18 +--- 2 files changed, 50 insertions(+), 67 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index c7b46cd6c..6092207d6 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -15,17 +15,15 @@ #include "Constants.hpp" namespace clp::streaming_compression::lzma { -using clp::size_checked_pointer_cast; - -auto Compressor::init_lzma_encoder(LzmaStream* strm, int compression_level, size_t dict_size) +auto Compressor::init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void { - LzmaOptionsLzma options; + lzma_options_lzma options; if (0 != lzma_lzma_preset(&options, compression_level)) { 
SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } options.dict_size = dict_size; - std::array filters{{ + std::array filters{{ {.id = LZMA_FILTER_LZMA2, .options = &options}, {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, }}; @@ -34,10 +32,10 @@ auto Compressor::init_lzma_encoder(LzmaStream* strm, int compression_level, size // to CRC64, which is the default in the xz command line tool. If // the .xz file needs to be decompressed with XZ Embedded, use // LZMA_CHECK_CRC32 instead. - auto const ret{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; + auto const rc{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; // Return successfully if the initialization went fine. - if (LZMA_OK == ret) { + if (LZMA_OK == rc) { return; } @@ -46,7 +44,7 @@ auto Compressor::init_lzma_encoder(LzmaStream* strm, int compression_level, size // package or e.g. /usr/include/lzma/container.h depending on the // install prefix). char const* msg{nullptr}; - switch (ret) { + switch (rc) { case LZMA_MEM_ERROR: msg = "Memory allocation failed"; break; @@ -60,17 +58,12 @@ auto Compressor::init_lzma_encoder(LzmaStream* strm, int compression_level, size break; default: - // This is most likely LZMA_PROG_ERROR indicating a bug in - // this program or in liblzma. It is inconvenient to have a - // separate error message for errors that should be impossible - // to occur, but knowing the error code is important for - // debugging. That's why it is good to print the error code - // at least when there is no good error message to show. 
- msg = "Unknown error, possibly a bug"; + // This is most likely LZMA_PROG_ERROR indicating a bug in liblzma + msg = "Unknown error"; break; } - SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, static_cast(ret)); + SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, static_cast(rc)); throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } @@ -85,9 +78,12 @@ auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { m_compression_stream = LZMA_STREAM_INIT; init_lzma_encoder(&m_compression_stream, compression_level, m_dict_size); - // Setup compressed stream parameters + + // No input upon initialization m_compression_stream.next_in = nullptr; m_compression_stream.avail_in = 0; + + // Attach output buffer to LZMA stream m_compression_stream.next_out = m_compressed_stream_block_buffer.data(); m_compression_stream.avail_out = m_compressed_stream_block_buffer.size(); @@ -101,7 +97,13 @@ auto Compressor::close() -> void { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - flush_and_close_compression_stream(); + run_lzma(LZMA_FINISH); + lzma_end(&m_compression_stream); + + // Detach output buffer from LZMA stream + m_compression_stream.next_out = nullptr; + m_compression_stream.avail_out = 0; + m_compressed_stream_file_writer = nullptr; } @@ -119,27 +121,22 @@ auto Compressor::write(char const* data, size_t data_length) -> void { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - m_compression_stream.next_in = size_checked_pointer_cast(data); + // Attach input data to LZMA stream + m_compression_stream.next_in = clp::size_checked_pointer_cast(data); m_compression_stream.avail_in = data_length; - // Normal compression encoding workflow. Continue until the input buffer is - // exhausted. 
- compress(LZMA_RUN); - - m_compression_stream.next_in = nullptr; + run_lzma(LZMA_RUN); - m_compression_stream_contains_data = true; m_uncompressed_stream_pos += data_length; } auto Compressor::flush() -> void { - if (false == m_compression_stream_contains_data) { + if (m_compression_stream_is_flushed) { return; } // Forces all the buffered data to be available at output - compress(LZMA_SYNC_FLUSH); - m_compression_stream_contains_data = false; + run_lzma(LZMA_SYNC_FLUSH); } auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { @@ -151,43 +148,39 @@ auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { return ErrorCode_Success; } -auto Compressor::flush_and_close_compression_stream() -> void { - if (nullptr == m_compressed_stream_file_writer) { - throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); - } - - // Same as flush but all the input data must have been given to the encoder - compress(LZMA_FINISH); - - m_compression_stream_contains_data = false; - - lzma_end(&m_compression_stream); - m_compression_stream.avail_out = 0; - m_compression_stream.next_out = nullptr; -} - -auto Compressor::compress(LzmaAction action) -> void { - bool hit_stream_end{false}; +auto Compressor::run_lzma(lzma_action action) -> void { + m_compression_stream_is_flushed = false; + bool end_of_stream{false}; while (true) { + if (0 == m_compression_stream.avail_in) { // No more input data + if (LZMA_RUN == action) { + // All input data have been processed, so we can safely detach + // input data from LZMA stream. 
+ m_compression_stream.next_in = nullptr; + break; + } + } else { + if (LZMA_FINISH == action) { + SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + } + auto const rc = lzma_code(&m_compression_stream, action); switch (rc) { case LZMA_OK: case LZMA_BUF_ERROR: break; case LZMA_STREAM_END: - hit_stream_end = true; + end_of_stream = true; break; default: SPDLOG_ERROR("lzma() returned an unexpected value - {}.", static_cast(rc)); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - if (LZMA_RUN == action && 0 == m_compression_stream.avail_in) { - // No more data to compress - break; - } - - if (hit_stream_end) { + if (end_of_stream) { + m_compression_stream_is_flushed = true; break; } @@ -205,7 +198,7 @@ auto Compressor::compress(LzmaAction action) -> void { auto Compressor::pipe_data() -> void { m_compressed_stream_file_writer->write( - size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), + clp::size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream.avail_out ); m_compression_stream.next_out = m_compressed_stream_block_buffer.data(); diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index d10810e88..5b1adb404 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -86,11 +86,6 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto open(FileWriter& file_writer, int compression_level) -> void; private: - using LzmaAction = lzma_action; - using LzmaFilter = lzma_filter; - using LzmaOptionsLzma = lzma_options_lzma; - using LzmaStream = lzma_stream; - /** * Initialize the Lzma compression stream * @param strm A pre-allocated `lzma_stream` object @@ -99,14 +94,9 
@@ class Compressor : public ::clp::streaming_compression::Compressor { * recently processed uncompressed data is kept in memory */ static auto - init_lzma_encoder(LzmaStream* strm, int compression_level, size_t dict_size) -> void; + init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void; static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB - /** - * Flushes the stream and closes it - */ - auto flush_and_close_compression_stream() -> void; - /** * Repeatedly invoke lzma_code() compression workflow until LZMA_STREAM_END * is reached. @@ -115,7 +105,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { * * @param action */ - auto compress(lzma_action action) -> void; + auto run_lzma(lzma_action action) -> void; /** * Pipes the current compressed data in the lzma buffer to the output file @@ -127,8 +117,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { FileWriter* m_compressed_stream_file_writer{nullptr}; // Compressed stream variables - LzmaStream m_compression_stream; - bool m_compression_stream_contains_data{false}; + lzma_stream m_compression_stream; + bool m_compression_stream_is_flushed{true}; size_t m_dict_size{cDefaultDictionarySize}; Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; From 740bc1c1216f999c881dffd49564eeabcf1d4bbd Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 10:20:12 -0500 Subject: [PATCH 10/65] Address review concern --- .../streaming_compression/lzma/Compressor.cpp | 26 +++++++++++++------ .../streaming_compression/lzma/Compressor.hpp | 15 +++-------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 6092207d6..a1d5dfaa2 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ 
b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -14,13 +14,21 @@ #include "../../type_utils.hpp" #include "Constants.hpp" -namespace clp::streaming_compression::lzma { -auto Compressor::init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) - -> void { +namespace { +using clp::streaming_compression::lzma::Compressor; + +/** + * Initialize the Lzma compression stream + * @param strm A pre-allocated `lzma_stream` object + * @param compression_level + * @param dict_size Dictionary size that indicates how many bytes of the + * recently processed uncompressed data is kept in memory + */ +auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void { lzma_options_lzma options; if (0 != lzma_lzma_preset(&options, compression_level)) { SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); - throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + throw Compressor::OperationFailed(clp::ErrorCode_BadParam, __FILENAME__, __LINE__); } options.dict_size = dict_size; std::array filters{{ @@ -64,9 +72,11 @@ auto Compressor::init_lzma_encoder(lzma_stream* strm, int compression_level, siz } SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, static_cast(rc)); - throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + throw Compressor::OperationFailed(clp::ErrorCode_BadParam, __FILENAME__, __LINE__); } +} // namespace +namespace clp::streaming_compression::lzma { auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { if (nullptr != m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); @@ -186,17 +196,17 @@ auto Compressor::run_lzma(lzma_action action) -> void { // Write output buffer to file if it's full if (0 == m_compression_stream.avail_out) { - pipe_data(); + flush_stream_output_block_buffer(); } } // Write remaining compressed data if (m_compression_stream.avail_out < 
cCompressedStreamBlockBufferSize) { - pipe_data(); + flush_stream_output_block_buffer(); } } -auto Compressor::pipe_data() -> void { +auto Compressor::flush_stream_output_block_buffer() -> void { m_compressed_stream_file_writer->write( clp::size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream.avail_out diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 5b1adb404..4afdce36a 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -86,15 +86,6 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto open(FileWriter& file_writer, int compression_level) -> void; private: - /** - * Initialize the Lzma compression stream - * @param strm A pre-allocated `lzma_stream` object - * @param compression_level - * @param dict_size Dictionary size that indicates how many bytes of the - * recently processed uncompressed data is kept in memory - */ - static auto - init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void; static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB /** @@ -108,10 +99,10 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto run_lzma(lzma_action action) -> void; /** - * Pipes the current compressed data in the lzma buffer to the output file - * and reset the compression buffer to receive new data. + * Flushes the current compressed data in the lzma output buffer to the + * output file handler. Reset the compression buffer to receive new data. 
*/ - auto pipe_data() -> void; + auto flush_stream_output_block_buffer() -> void; // Variables FileWriter* m_compressed_stream_file_writer{nullptr}; From e2be8833595b3281cdeaaccfcd1255849ce33b29 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 10:22:18 -0500 Subject: [PATCH 11/65] Simplify else-if --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index a1d5dfaa2..c40ca7652 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -169,8 +169,7 @@ auto Compressor::run_lzma(lzma_action action) -> void { m_compression_stream.next_in = nullptr; break; } - } else { - if (LZMA_FINISH == action) { + } else if (LZMA_FINISH == action) { SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } From 905367d6e4e08174fb30b7da67d00e5455ad14de Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 10:23:19 -0500 Subject: [PATCH 12/65] Fix else-if --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index c40ca7652..610c7cc17 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -170,9 +170,8 @@ auto Compressor::run_lzma(lzma_action action) -> void { break; } } else if (LZMA_FINISH == action) { - SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } + 
SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } auto const rc = lzma_code(&m_compression_stream, action); From 8ae88b2a86f880a02133ac9ee3cb3a1ed5921a9d Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 10:44:30 -0500 Subject: [PATCH 13/65] Add lzma (xz) dep to MacOS --- components/core/tools/scripts/lib_install/macos/install-all.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/components/core/tools/scripts/lib_install/macos/install-all.sh b/components/core/tools/scripts/lib_install/macos/install-all.sh index 97e41903d..cb24dd054 100755 --- a/components/core/tools/scripts/lib_install/macos/install-all.sh +++ b/components/core/tools/scripts/lib_install/macos/install-all.sh @@ -21,6 +21,7 @@ brew install \ mongo-cxx-driver \ msgpack-cxx \ spdlog \ + xz \ zstd # Install pkg-config if it isn't already installed From 0d0c20eaf35271572b068fe03887695d0f62f69d Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 12:24:45 -0500 Subject: [PATCH 14/65] Refactor helper run_lzma() --- .../streaming_compression/lzma/Compressor.cpp | 79 +++++++++---------- .../streaming_compression/lzma/Compressor.hpp | 2 + 2 files changed, 38 insertions(+), 43 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 610c7cc17..11260c6e9 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -107,7 +107,14 @@ auto Compressor::close() -> void { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - run_lzma(LZMA_FINISH); + if (m_compression_stream.avail_in > 0) { + SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + + while (false == 
m_compression_stream_is_flushed) { + run_lzma(LZMA_FINISH); + } lzma_end(&m_compression_stream); // Detach output buffer from LZMA stream @@ -134,19 +141,22 @@ auto Compressor::write(char const* data, size_t data_length) -> void { // Attach input data to LZMA stream m_compression_stream.next_in = clp::size_checked_pointer_cast(data); m_compression_stream.avail_in = data_length; + m_compression_stream_is_flushed = false; - run_lzma(LZMA_RUN); + while (m_compression_stream.avail_in > 0) { + run_lzma(LZMA_RUN); + } + + // All input data have been encoded so detach input data + m_compression_stream.next_in = nullptr; m_uncompressed_stream_pos += data_length; } auto Compressor::flush() -> void { - if (m_compression_stream_is_flushed) { - return; + while (false == m_compression_stream_is_flushed) { + run_lzma(LZMA_SYNC_FLUSH); } - - // Forces all the buffered data to be available at output - run_lzma(LZMA_SYNC_FLUSH); } auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { @@ -159,52 +169,35 @@ auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { } auto Compressor::run_lzma(lzma_action action) -> void { - m_compression_stream_is_flushed = false; - bool end_of_stream{false}; - while (true) { - if (0 == m_compression_stream.avail_in) { // No more input data - if (LZMA_RUN == action) { - // All input data have been processed, so we can safely detach - // input data from LZMA stream. 
- m_compression_stream.next_in = nullptr; - break; - } - } else if (LZMA_FINISH == action) { - SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - - auto const rc = lzma_code(&m_compression_stream, action); - switch (rc) { - case LZMA_OK: - case LZMA_BUF_ERROR: - break; - case LZMA_STREAM_END: - end_of_stream = true; - break; - default: - SPDLOG_ERROR("lzma() returned an unexpected value - {}.", static_cast(rc)); + auto const rc = lzma_code(&m_compression_stream, action); + switch (rc) { + case LZMA_OK: + break; + case LZMA_BUF_ERROR: // No encoding progress can be made + if (m_compression_stream.avail_in > 0) { + SPDLOG_ERROR("LZMA compressor input stream is corrupt."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - - if (end_of_stream) { + } + break; + case LZMA_STREAM_END: m_compression_stream_is_flushed = true; break; - } - - // Write output buffer to file if it's full - if (0 == m_compression_stream.avail_out) { - flush_stream_output_block_buffer(); - } + default: + SPDLOG_ERROR("lzma() returned an unexpected value - {}.", static_cast(rc)); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - // Write remaining compressed data - if (m_compression_stream.avail_out < cCompressedStreamBlockBufferSize) { + // Write output buffer to file if it's full or flushed + if (0 == m_compression_stream.avail_out || m_compression_stream_is_flushed) { flush_stream_output_block_buffer(); } } auto Compressor::flush_stream_output_block_buffer() -> void { + if (cCompressedStreamBlockBufferSize == m_compression_stream.avail_out) { + // Nothing to flush + return; + } m_compressed_stream_file_writer->write( clp::size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream.avail_out diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp 
b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 4afdce36a..1953001f2 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -53,6 +53,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { /** * Writes any internally buffered data to file and ends the current frame + * + * Forces all the encoded data buffered by LZMA to be available at output */ auto flush() -> void override; From 559485d18c64eb32e7a72169ff303b3baae07d52 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 12:36:07 -0500 Subject: [PATCH 15/65] Update function doc --- .../core/src/clp/streaming_compression/lzma/Compressor.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 1953001f2..3eb062223 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -91,9 +91,10 @@ class Compressor : public ::clp::streaming_compression::Compressor { static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB /** - * Repeatedly invoke lzma_code() compression workflow until LZMA_STREAM_END - * is reached. - * The workflow action needs to be kept the same throughout this process. + * Invoke lzma_code() encoding workflow for one time with the given action. + * + * Once flushing starts, the workflow action needs to stay the same until + * flushing is complete (aka LZMA_STREAM_END is reached). 
* See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 * * @param action From 7c69c6919f6fd41a83d0f5e5865bb565014e9723 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 12:48:02 -0500 Subject: [PATCH 16/65] Clarify unit test early termination --- components/core/tests/test-StreamingCompression.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 6dac8ba52..a47012ca3 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -4,8 +4,10 @@ #include #include #include +#include #include +#include #include #include @@ -78,12 +80,16 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { compressor->close(); file_writer.close(); - // Decompress and compare - if (nullptr == decompressor) { + if (boost::dynamic_pointer_cast( + std::move(compressor) + )) + { + // TODO: remove this LZMA testing early termination boost::filesystem::remove(compressed_file_path); return; } + // Decompress and compare clp::ReadOnlyMemoryMappedFile const memory_mapped_compressed_file{compressed_file_path}; auto const compressed_file_view{memory_mapped_compressed_file.get_view()}; decompressor->open(compressed_file_view.data(), compressed_file_view.size()); From a6d68b8f66fa2e9978cad8486d2c6b9220b78c10 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Dec 2024 12:52:29 -0500 Subject: [PATCH 17/65] Update components/core/tests/test-StreamingCompression.cpp Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- components/core/tests/test-StreamingCompression.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index a47012ca3..a7f2ee78c 100644 --- 
a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -58,7 +58,9 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { decompressor = std::make_unique(); } - SECTION("LZMA compression") { + SECTION("LZMA compression (compression-only test)") { + // Note: Decompressor initialization is intentionally omitted as this is a + // compression-only test. See early termination logic below. compressor = std::make_unique(); } From 1519c21c7d88d4860c00e243abae6ca8443d5fa1 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 3 Dec 2024 02:34:23 -0500 Subject: [PATCH 18/65] Split LZMA_RUN from flush actions --- .../streaming_compression/lzma/Compressor.cpp | 86 ++++++++++++++----- .../streaming_compression/lzma/Compressor.hpp | 18 ++-- 2 files changed, 76 insertions(+), 28 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 11260c6e9..e6e95e7c8 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -17,6 +17,11 @@ namespace { using clp::streaming_compression::lzma::Compressor; +auto is_flush_action(lzma_action action) { + return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action + || LZMA_FINISH == action; +} + /** * Initialize the Lzma compression stream * @param strm A pre-allocated `lzma_stream` object @@ -42,7 +47,6 @@ auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_siz // LZMA_CHECK_CRC32 instead. auto const rc{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; - // Return successfully if the initialization went fine. 
if (LZMA_OK == rc) { return; } @@ -112,9 +116,7 @@ auto Compressor::close() -> void { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - while (false == m_compression_stream_is_flushed) { - run_lzma(LZMA_FINISH); - } + flush_lzma(LZMA_FINISH); lzma_end(&m_compression_stream); // Detach output buffer from LZMA stream @@ -141,10 +143,9 @@ auto Compressor::write(char const* data, size_t data_length) -> void { // Attach input data to LZMA stream m_compression_stream.next_in = clp::size_checked_pointer_cast(data); m_compression_stream.avail_in = data_length; - m_compression_stream_is_flushed = false; while (m_compression_stream.avail_in > 0) { - run_lzma(LZMA_RUN); + encode_lzma_once(); } // All input data have been encoded so detach input data @@ -154,9 +155,7 @@ auto Compressor::write(char const* data, size_t data_length) -> void { } auto Compressor::flush() -> void { - while (false == m_compression_stream_is_flushed) { - run_lzma(LZMA_SYNC_FLUSH); - } + flush_lzma(LZMA_SYNC_FLUSH); } auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { @@ -168,29 +167,70 @@ auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { return ErrorCode_Success; } -auto Compressor::run_lzma(lzma_action action) -> void { - auto const rc = lzma_code(&m_compression_stream, action); +auto Compressor::encode_lzma_once() -> void { + if (0 == m_compression_stream.avail_in) { + return; + } + + if (0 == m_compression_stream.avail_out) { + flush_stream_output_block_buffer(); + } + + auto const rc = lzma_code(&m_compression_stream, LZMA_RUN); switch (rc) { case LZMA_OK: break; case LZMA_BUF_ERROR: // No encoding progress can be made - if (m_compression_stream.avail_in > 0) { - SPDLOG_ERROR("LZMA compressor input stream is corrupt."); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } - break; - case LZMA_STREAM_END: - m_compression_stream_is_flushed = true; - break; + SPDLOG_ERROR("LZMA compressor input stream is corrupt."); + throw 
OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: - SPDLOG_ERROR("lzma() returned an unexpected value - {}.", static_cast(rc)); + SPDLOG_ERROR("lzma_code() returned an unexpected value - {}.", static_cast(rc)); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } +} - // Write output buffer to file if it's full or flushed - if (0 == m_compression_stream.avail_out || m_compression_stream_is_flushed) { - flush_stream_output_block_buffer(); +auto Compressor::flush_lzma(lzma_action flush_action) -> void { + if (false == is_flush_action(flush_action)) { + SPDLOG_ERROR( + "lzma_code() supplied with invalid flush action - {}.", + static_cast(flush_action) + ); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + } + + bool flushed{false}; + while (false == flushed) { + auto const rc = lzma_code(&m_compression_stream, flush_action); + switch (rc) { + case LZMA_OK: + break; + case LZMA_STREAM_END: + // NOTE: this might not be true when multithreaded encoder is + // used with LZMA_FULL_BARRIER. For now, we skip this check. + flushed = true; + break; + case LZMA_BUF_ERROR: // No encoding progress can be made + // NOTE: this can happen if we are using LZMA_FULL_FLUSH or + // LZMA_FULL_BARRIER. These two actions keeps encoding input + // data alongside flushing already encoded but buffered data. 
+ SPDLOG_ERROR("LZMA compressor input stream is corrupt."); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + default: + SPDLOG_ERROR( + "lzma_code() returned an unexpected value - {}.", + static_cast(rc) + ); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } + + // Write output buffer to file if it's full + if (0 == m_compression_stream.avail_out) { + flush_stream_output_block_buffer(); + } } + + // Write the last chunk of output + flush_stream_output_block_buffer(); } auto Compressor::flush_stream_output_block_buffer() -> void { diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 3eb062223..045345829 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -91,15 +91,24 @@ class Compressor : public ::clp::streaming_compression::Compressor { static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB /** - * Invoke lzma_code() encoding workflow for one time with the given action. + * Invoke lzma_code() encoding workflow once with LZMA_RUN + * + * The encoded data may be buffered and thus not immediately available at + * the output block. + */ + auto encode_lzma_once() -> void; + + /** + * Invoke lzma_code() repeatedly with the given flushing action until all + * encoded data is made available at the output block * * Once flushing starts, the workflow action needs to stay the same until - * flushing is complete (aka LZMA_STREAM_END is reached). + * flushing is signaled completed by LZMA (aka LZMA_STREAM_END is reached). 
* See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 * - * @param action + * @param flush_action */ - auto run_lzma(lzma_action action) -> void; + auto flush_lzma(lzma_action flush_action) -> void; /** * Flushes the current compressed data in the lzma output buffer to the @@ -112,7 +121,6 @@ class Compressor : public ::clp::streaming_compression::Compressor { // Compressed stream variables lzma_stream m_compression_stream; - bool m_compression_stream_is_flushed{true}; size_t m_dict_size{cDefaultDictionarySize}; Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; From 655bb46dcf853e41bf790444d550506c66ff6163 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 3 Dec 2024 03:26:10 -0500 Subject: [PATCH 19/65] Refactor unit test --- .../core/tests/test-StreamingCompression.cpp | 122 +++++++++--------- 1 file changed, 64 insertions(+), 58 deletions(-) diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index a47012ca3..a52a42ef7 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -4,10 +4,10 @@ #include #include #include +#include #include #include -#include #include #include @@ -28,69 +28,39 @@ using clp::ErrorCode_Success; using clp::FileWriter; using clp::streaming_compression::Compressor; using clp::streaming_compression::Decompressor; - -TEST_CASE("StreamingCompression", "[StreamingCompression]") { - // Initialize constants - constexpr size_t cBufferSize{128L * 1024 * 1024}; // 128MB - constexpr auto cCompressionChunkSizes = std::to_array( - {cBufferSize / 100, - cBufferSize / 50, - cBufferSize / 25, - cBufferSize / 10, - cBufferSize / 5, - cBufferSize / 2, - cBufferSize} - ); - constexpr size_t cAlphabetLength{26}; - std::string const compressed_file_path{"test_streaming_compressed_file.bin"}; - - // Initialize compression devices - std::unique_ptr compressor; - 
std::unique_ptr decompressor; - - SECTION("ZStd single phase compression") { - compressor = std::make_unique(); - decompressor = std::make_unique(); - } - - SECTION("Passthrough compression") { - compressor = std::make_unique(); - decompressor = std::make_unique(); - } - - SECTION("LZMA compression") { - compressor = std::make_unique(); - } - - // Initialize buffers - Array uncompressed_buffer{cBufferSize}; - for (size_t i{0}; i < cBufferSize; ++i) { - uncompressed_buffer.at(i) = static_cast(('a' + (i % cAlphabetLength))); - } - - Array decompressed_buffer{cBufferSize}; - - // Compress +using std::string; +using std::string_view; + +namespace { +constexpr string_view cCompressedFilePath{"test_streaming_compressed_file.bin"}; +constexpr size_t cBufferSize{128L * 1024 * 1024}; // 128MB +constexpr auto cCompressionChunkSizes = std::to_array( + {cBufferSize / 100, + cBufferSize / 50, + cBufferSize / 25, + cBufferSize / 10, + cBufferSize / 5, + cBufferSize / 2, + cBufferSize} +); + +auto compress(std::unique_ptr compressor, char const* const src) -> void { FileWriter file_writer; - file_writer.open(compressed_file_path, FileWriter::OpenMode::CREATE_FOR_WRITING); + file_writer.open(string(cCompressedFilePath), FileWriter::OpenMode::CREATE_FOR_WRITING); compressor->open(file_writer); for (auto const chunk_size : cCompressionChunkSizes) { - compressor->write(uncompressed_buffer.data(), chunk_size); + compressor->write(src, chunk_size); } compressor->close(); file_writer.close(); +} - if (boost::dynamic_pointer_cast( - std::move(compressor) - )) - { - // TODO: remove this LZMA testing early termination - boost::filesystem::remove(compressed_file_path); - return; - } - - // Decompress and compare - clp::ReadOnlyMemoryMappedFile const memory_mapped_compressed_file{compressed_file_path}; +auto decompress_and_compare( + std::unique_ptr decompressor, + Array const& uncompressed_buffer, + Array& decompressed_buffer +) -> void { + clp::ReadOnlyMemoryMappedFile const 
memory_mapped_compressed_file{string(cCompressedFilePath)}; auto const compressed_file_view{memory_mapped_compressed_file.get_view()}; decompressor->open(compressed_file_view.data(), compressed_file_view.size()); @@ -123,7 +93,43 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { ) == num_uncompressed_bytes) ); +} +} // namespace + +TEST_CASE("StreamingCompression", "[StreamingCompression]") { + // Initialize constants + constexpr size_t cAlphabetLength{26}; + + // Initialize compression devices + std::unique_ptr compressor; + std::unique_ptr decompressor; + + // Initialize buffers + Array decompressed_buffer{cBufferSize}; + Array uncompressed_buffer{cBufferSize}; + for (size_t i{0}; i < cBufferSize; ++i) { + uncompressed_buffer.at(i) = static_cast(('a' + (i % cAlphabetLength))); + } + + SECTION("ZStd single phase compression") { + compressor = std::make_unique(); + compress(std::move(compressor), uncompressed_buffer.data()); + decompressor = std::make_unique(); + decompress_and_compare(std::move(decompressor), uncompressed_buffer, decompressed_buffer); + } + + SECTION("Passthrough compression") { + compressor = std::make_unique(); + compress(std::move(compressor), uncompressed_buffer.data()); + decompressor = std::make_unique(); + decompress_and_compare(std::move(decompressor), uncompressed_buffer, decompressed_buffer); + } + + SECTION("LZMA compression") { + compressor = std::make_unique(); + compress(std::move(compressor), uncompressed_buffer.data()); + } // Cleanup - boost::filesystem::remove(compressed_file_path); + boost::filesystem::remove(string(cCompressedFilePath)); } From 4fb6c0147a054fdf7970c10ccf64d2435eb13bce Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 3 Dec 2024 03:29:06 -0500 Subject: [PATCH 20/65] Update components/core/src/clp/streaming_compression/lzma/Compressor.cpp Co-authored-by: haiqi96 <14502009+haiqi96@users.noreply.github.com> --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 11260c6e9..11bfdc5b5 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -21,8 +21,8 @@ using clp::streaming_compression::lzma::Compressor; * Initialize the Lzma compression stream * @param strm A pre-allocated `lzma_stream` object * @param compression_level - * @param dict_size Dictionary size that indicates how many bytes of the - * recently processed uncompressed data is kept in memory + * @param dict_size Dictionary size that specifies how many bytes of the + * recently processed uncompressed data to keep in the memory */ auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void { lzma_options_lzma options; From 2b85f01d7d6934a19df1203baa2beba94fc395f6 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 3 Dec 2024 03:35:42 -0500 Subject: [PATCH 21/65] Fix import --- components/core/tests/test-StreamingCompression.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 9d28a5ec3..2b2dfe85f 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -26,7 +26,6 @@ using clp::Array; using clp::ErrorCode_Success; using clp::FileWriter; using clp::streaming_compression::Compressor; -using clp::streaming_compression::Decompressor; using std::string; using std::string_view; From eda7d6c97a4da5884a5439adfc582e0dab1aabe8 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 4 Dec 2024 10:19:35 -0500 Subject: [PATCH 22/65] Apply suggestions from code review Co-authored-by: haiqi96 <14502009+haiqi96@users.noreply.github.com> --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 
4 ++-- .../core/src/clp/streaming_compression/lzma/Compressor.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 071ad77b8..f5c0fedd4 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -17,13 +17,13 @@ namespace { using clp::streaming_compression::lzma::Compressor; -auto is_flush_action(lzma_action action) { +auto is_flush_action(lzma_action action) -> bool { return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action || LZMA_FINISH == action; } /** - * Initialize the Lzma compression stream + * Initialize the LZMA compression stream * @param strm A pre-allocated `lzma_stream` object * @param compression_level * @param dict_size Dictionary size that specifies how many bytes of the diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 045345829..593c26835 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -100,7 +100,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { /** * Invoke lzma_code() repeatedly with the given flushing action until all - * encoded data is made available at the output block + * encoded data is made available at the output block buffer * * Once flushing starts, the workflow action needs to stay the same until * flushing is signaled completed by LZMA (aka LZMA_STREAM_END is reached). 
@@ -111,7 +111,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto flush_lzma(lzma_action flush_action) -> void; /** - * Flushes the current compressed data in the lzma output buffer to the + * Flushes the current compressed data in the LZMA output buffer to the * output file handler. Reset the compression buffer to receive new data. */ auto flush_stream_output_block_buffer() -> void; From 4164a9d43731dcae330d87338e19882bbc437e62 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 4 Dec 2024 11:01:41 -0500 Subject: [PATCH 23/65] Address review concern --- .../streaming_compression/lzma/Compressor.cpp | 59 +++++++++++-------- .../streaming_compression/lzma/Compressor.hpp | 18 +++--- .../core/tests/test-StreamingCompression.cpp | 2 + 3 files changed, 46 insertions(+), 33 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index f5c0fedd4..50a813ea4 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -144,12 +144,11 @@ auto Compressor::write(char const* data, size_t data_length) -> void { m_compression_stream.next_in = clp::size_checked_pointer_cast(data); m_compression_stream.avail_in = data_length; - while (m_compression_stream.avail_in > 0) { - encode_lzma_once(); - } + encode_lzma(); // All input data have been encoded so detach input data m_compression_stream.next_in = nullptr; + m_compression_stream.avail_in = 0; m_uncompressed_stream_pos += data_length; } @@ -167,26 +166,31 @@ auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { return ErrorCode_Success; } -auto Compressor::encode_lzma_once() -> void { - if (0 == m_compression_stream.avail_in) { - return; - } +auto Compressor::encode_lzma() -> void { + while (m_compression_stream.avail_in > 0) { + // Write output buffer to file if it's full + if (0 == 
m_compression_stream.avail_out) { + flush_stream_output_block_buffer(); + } - if (0 == m_compression_stream.avail_out) { - flush_stream_output_block_buffer(); + auto const rc = lzma_code(&m_compression_stream, LZMA_RUN); + switch (rc) { + case LZMA_OK: + break; + case LZMA_BUF_ERROR: // No encoding progress can be made + SPDLOG_ERROR("LZMA compressor input stream is corrupt."); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + default: + SPDLOG_ERROR( + "lzma_code() returned an unexpected value - {}.", + static_cast(rc) + ); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + } } - auto const rc = lzma_code(&m_compression_stream, LZMA_RUN); - switch (rc) { - case LZMA_OK: - break; - case LZMA_BUF_ERROR: // No encoding progress can be made - SPDLOG_ERROR("LZMA compressor input stream is corrupt."); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - default: - SPDLOG_ERROR("lzma_code() returned an unexpected value - {}.", static_cast(rc)); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); - } + // Write the last chunk of output + flush_stream_output_block_buffer(); } auto Compressor::flush_lzma(lzma_action flush_action) -> void { @@ -198,8 +202,18 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } + /** + * Once flushing starts, the workflow action needs to stay the same until + * flushing is signaled completed by LZMA (aka LZMA_STREAM_END is reached). 
+ * See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 + */ bool flushed{false}; while (false == flushed) { + // Write output buffer to file if it's full + if (0 == m_compression_stream.avail_out) { + flush_stream_output_block_buffer(); + } + auto const rc = lzma_code(&m_compression_stream, flush_action); switch (rc) { case LZMA_OK: @@ -222,11 +236,6 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { ); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } - - // Write output buffer to file if it's full - if (0 == m_compression_stream.avail_out) { - flush_stream_output_block_buffer(); - } } // Write the last chunk of output diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 593c26835..c8c12b9cb 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -91,22 +91,24 @@ class Compressor : public ::clp::streaming_compression::Compressor { static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB /** - * Invoke lzma_code() encoding workflow once with LZMA_RUN + * Invoke lzma_code() repeatedly with LZMA_RUN until the input is exhausted * - * The encoded data may be buffered and thus not immediately available at - * the output block. + * At the end of the workflow, the last bytes of encoded data may still be + * buffered and thus not immediately available at the output block buffer. + * + * Assumes input stream and output block buffer are both in valid states. 
+ * @throw `OperationFailed` if LZMA returns an unexpected error value */ - auto encode_lzma_once() -> void; + auto encode_lzma() -> void; /** * Invoke lzma_code() repeatedly with the given flushing action until all * encoded data is made available at the output block buffer * - * Once flushing starts, the workflow action needs to stay the same until - * flushing is signaled completed by LZMA (aka LZMA_STREAM_END is reached). - * See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 - * + * Assumes input stream and output block buffer are both in valid states. * @param flush_action + * @throw `OperationFailed` if the provided action is not an LZMA flush + * action, or if LZMA returns an unexpected error value */ auto flush_lzma(lzma_action flush_action) -> void; diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 2b2dfe85f..a52a42ef7 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -16,6 +16,7 @@ #include "../src/clp/FileWriter.hpp" #include "../src/clp/ReadOnlyMemoryMappedFile.hpp" #include "../src/clp/streaming_compression/Compressor.hpp" +#include "../src/clp/streaming_compression/Decompressor.hpp" #include "../src/clp/streaming_compression/lzma/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Decompressor.hpp" @@ -26,6 +27,7 @@ using clp::Array; using clp::ErrorCode_Success; using clp::FileWriter; using clp::streaming_compression::Compressor; +using clp::streaming_compression::Decompressor; using std::string; using std::string_view; From 8ab0653c8e555e3e1d62d9631c7077410d3f475b Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 4 Dec 2024 20:24:52 -0500 Subject: [PATCH 24/65] Add a comment --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 50a813ea4..65445061a 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -117,6 +117,8 @@ auto Compressor::close() -> void { } flush_lzma(LZMA_FINISH); + + // Deallocates LZMA stream's internal data structures lzma_end(&m_compression_stream); // Detach output buffer from LZMA stream From c436f214669b5895f64cf429be383cf48f3e0f6a Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 6 Dec 2024 00:44:25 -0500 Subject: [PATCH 25/65] Apply suggestions from code review Co-authored-by: haiqi96 <14502009+haiqi96@users.noreply.github.com> --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 5 ++++- .../core/src/clp/streaming_compression/lzma/Compressor.hpp | 6 +++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 65445061a..8d518249c 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -156,6 +156,9 @@ auto Compressor::write(char const* data, size_t data_length) -> void { } auto Compressor::flush() -> void { + if (nullptr == m_compressed_stream_file_writer) { + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + } flush_lzma(LZMA_SYNC_FLUSH); } @@ -228,7 +231,7 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { case LZMA_BUF_ERROR: // No encoding progress can be made // NOTE: this can happen if we are using LZMA_FULL_FLUSH or // LZMA_FULL_BARRIER. These two actions keeps encoding input - // data alongside flushing already encoded but buffered data. + // data alongside flushing buffered encoded data. 
SPDLOG_ERROR("LZMA compressor input stream is corrupt."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index c8c12b9cb..323464545 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -94,7 +94,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { * Invoke lzma_code() repeatedly with LZMA_RUN until the input is exhausted * * At the end of the workflow, the last bytes of encoded data may still be - * buffered and thus not immediately available at the output block buffer. + * buffered in the LZMA stream and thus not immediately available at the output block buffer. * * Assumes input stream and output block buffer are both in valid states. * @throw `OperationFailed` if LZMA returns an unexpected error value @@ -113,8 +113,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto flush_lzma(lzma_action flush_action) -> void; /** - * Flushes the current compressed data in the LZMA output buffer to the - * output file handler. Reset the compression buffer to receive new data. + * Flushes the current compressed data in the output block buffer to the + * output file handler. Reset the output block buffer to receive new data. 
*/ auto flush_stream_output_block_buffer() -> void; From 7bd34d256797514c0de5e19570f8bdf8d02cc6b1 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 6 Dec 2024 01:06:36 -0500 Subject: [PATCH 26/65] Update comment to 100-char length --- .../streaming_compression/lzma/Compressor.cpp | 30 ++++++++----------- .../streaming_compression/lzma/Compressor.hpp | 12 ++++---- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 8d518249c..6c4a29206 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -41,20 +41,18 @@ auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_siz {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, }}; - // Initialize the encoder using a preset. Set the integrity to check - // to CRC64, which is the default in the xz command line tool. If - // the .xz file needs to be decompressed with XZ Embedded, use - // LZMA_CHECK_CRC32 instead. + // Initialize the encoder using a preset. Set the integrity to check to CRC64, which is the + // default in the xz command line tool. If the .xz file needs to be decompressed with + // XZ-Embedded, use LZMA_CHECK_CRC32 instead. auto const rc{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; if (LZMA_OK == rc) { return; } - // Something went wrong. The possible errors are documented in - // lzma/container.h (src/liblzma/api/lzma/container.h in the source - // package or e.g. /usr/include/lzma/container.h depending on the - // install prefix). + // Something went wrong. The possible errors are documented in lzma/container.h + // (src/liblzma/api/lzma/container.h in the source package or e.g. /usr/include/lzma/container.h + // depending on the install prefix). 
char const* msg{nullptr}; switch (rc) { case LZMA_MEM_ERROR: @@ -193,9 +191,6 @@ auto Compressor::encode_lzma() -> void { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } } - - // Write the last chunk of output - flush_stream_output_block_buffer(); } auto Compressor::flush_lzma(lzma_action flush_action) -> void { @@ -208,8 +203,8 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { } /** - * Once flushing starts, the workflow action needs to stay the same until - * flushing is signaled completed by LZMA (aka LZMA_STREAM_END is reached). + * Once flushing starts, the workflow action needs to stay the same until flushing is signaled + * complete by LZMA (aka LZMA_STREAM_END is reached). * See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 */ bool flushed{false}; @@ -224,14 +219,13 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { case LZMA_OK: break; case LZMA_STREAM_END: - // NOTE: this might not be true when multithreaded encoder is - // used with LZMA_FULL_BARRIER. For now, we skip this check. + // NOTE: this might not be true when multithreaded encoder is used with + // LZMA_FULL_BARRIER. For now, we skip this check. flushed = true; break; case LZMA_BUF_ERROR: // No encoding progress can be made - // NOTE: this can happen if we are using LZMA_FULL_FLUSH or - // LZMA_FULL_BARRIER. These two actions keeps encoding input - // data alongside flushing buffered encoded data. + // NOTE: this can happen if we are using LZMA_FULL_FLUSH or LZMA_FULL_BARRIER. These + // two actions keeps encoding input data alongside flushing buffered encoded data. 
SPDLOG_ERROR("LZMA compressor input stream is corrupt."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 323464545..286819893 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -93,8 +93,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { /** * Invoke lzma_code() repeatedly with LZMA_RUN until the input is exhausted * - * At the end of the workflow, the last bytes of encoded data may still be - * buffered in the LZMA stream and thus not immediately available at the output block buffer. + * At the end of the workflow, the last bytes of encoded data may still be buffered in the LZMA + * stream and thus not immediately available at the output block buffer. * * Assumes input stream and output block buffer are both in valid states. * @throw `OperationFailed` if LZMA returns an unexpected error value @@ -102,8 +102,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto encode_lzma() -> void; /** - * Invoke lzma_code() repeatedly with the given flushing action until all - * encoded data is made available at the output block buffer + * Invoke lzma_code() repeatedly with the given flushing action until all encoded data is made + * available at the output block buffer * * Assumes input stream and output block buffer are both in valid states. * @param flush_action @@ -113,8 +113,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto flush_lzma(lzma_action flush_action) -> void; /** - * Flushes the current compressed data in the output block buffer to the - * output file handler. Reset the output block buffer to receive new data. + * Flushes the current compressed data in the output block buffer to the output file handler. 
+ * Reset the output block buffer to receive new data. */ auto flush_stream_output_block_buffer() -> void; From efd2b2759088c874c2d0a1191b8e4e1d1d16105f Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 10 Dec 2024 23:05:31 -0500 Subject: [PATCH 27/65] Fix according to coding style guidelines --- .../streaming_compression/lzma/Compressor.cpp | 16 ++++++++++------ .../streaming_compression/lzma/Compressor.hpp | 18 +++++++++--------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 6c4a29206..dc2ca222f 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -17,18 +17,22 @@ namespace { using clp::streaming_compression::lzma::Compressor; -auto is_flush_action(lzma_action action) -> bool { - return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action - || LZMA_FINISH == action; -} +auto is_flush_action(lzma_action action) -> bool; /** - * Initialize the LZMA compression stream + * Initializes the LZMA compression stream * @param strm A pre-allocated `lzma_stream` object * @param compression_level * @param dict_size Dictionary size that specifies how many bytes of the * recently processed uncompressed data to keep in the memory */ +auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void; + +auto is_flush_action(lzma_action action) -> bool { + return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action + || LZMA_FINISH == action; +} + auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void { lzma_options_lzma options; if (0 != lzma_lzma_preset(&options, compression_level)) { @@ -41,7 +45,7 @@ auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_siz {.id = 
LZMA_VLI_UNKNOWN, .options = nullptr}, }}; - // Initialize the encoder using a preset. Set the integrity to check to CRC64, which is the + // Initializes the encoder using a preset. Set the integrity to check to CRC64, which is the // default in the xz command line tool. If the .xz file needs to be decompressed with // XZ-Embedded, use LZMA_CHECK_CRC32 instead. auto const rc{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 286819893..b4255cc1c 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -43,6 +43,13 @@ class Compressor : public ::clp::streaming_compression::Compressor { Compressor(Compressor&&) noexcept = default; auto operator=(Compressor&&) noexcept -> Compressor& = default; + /** + * Initializes the compression stream with the given compression level + * @param file_writer + * @param compression_level + */ + auto open(FileWriter& file_writer, int compression_level) -> void; + // Methods implementing the WriterInterface /** * Writes the given data to the compressor @@ -80,18 +87,11 @@ class Compressor : public ::clp::streaming_compression::Compressor { this->open(file_writer, cDefaultCompressionLevel); } - /** - * Initializes the compression stream with the given compression level - * @param file_writer - * @param compression_level - */ - auto open(FileWriter& file_writer, int compression_level) -> void; - private: static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB /** - * Invoke lzma_code() repeatedly with LZMA_RUN until the input is exhausted + * Invokes lzma_code() repeatedly with LZMA_RUN until the input is exhausted * * At the end of the workflow, the last bytes of encoded data may still be buffered in the LZMA * stream and thus not immediately available at the 
output block buffer. @@ -102,7 +102,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { auto encode_lzma() -> void; /** - * Invoke lzma_code() repeatedly with the given flushing action until all encoded data is made + * Invokes lzma_code() repeatedly with the given flushing action until all encoded data is made * available at the output block buffer * * Assumes input stream and output block buffer are both in valid states. From c530f9287ecf51350220bfca501347f3f79b1d5b Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 11 Dec 2024 21:10:58 -0500 Subject: [PATCH 28/65] Apply suggestions from code review Co-authored-by: davidlion --- components/core/tools/scripts/lib_install/liblzma.sh | 2 +- .../lib_install/ubuntu-focal/install-prebuilt-packages.sh | 2 +- .../lib_install/ubuntu-jammy/install-prebuilt-packages.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/core/tools/scripts/lib_install/liblzma.sh b/components/core/tools/scripts/lib_install/liblzma.sh index 28766eced..a73ff79b9 100755 --- a/components/core/tools/scripts/lib_install/liblzma.sh +++ b/components/core/tools/scripts/lib_install/liblzma.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Exit on any error set -e diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh index f1e2ee4ff..b373cbe4d 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh @@ -19,8 +19,8 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ git \ libcurl4 \ libcurl4-openssl-dev \ - libmariadb-dev \ liblzma-dev \ + libmariadb-dev \ libssl-dev \ make \ openjdk-11-jdk \ diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh 
b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh index 4911a6a98..e2e17283b 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh @@ -19,8 +19,8 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ libboost-program-options-dev \ libcurl4 \ libcurl4-openssl-dev \ - libmariadb-dev \ liblzma-dev \ + libmariadb-dev \ libssl-dev \ openjdk-11-jdk \ pkg-config \ From e751ee6f5fe3d757713520b494a2e23edc1a6453 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 12 Dec 2024 01:27:10 -0500 Subject: [PATCH 29/65] Update CMakeLists.txt --- components/core/CMakeLists.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 312c6e2ef..9d0c51c9f 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -11,16 +11,16 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # Set general compressor set(GENERAL_COMPRESSOR "zstd" CACHE STRING "The general-purpose compressor used as the 2nd-stage compressor") -set_property(CACHE GENERAL_COMPRESSOR PROPERTY STRINGS passthrough zstd lzma) -if ("${GENERAL_COMPRESSOR}" STREQUAL "passthrough") +set_property(CACHE GENERAL_COMPRESSOR PROPERTY STRINGS lzma passthrough zstd) +if ("${GENERAL_COMPRESSOR}" STREQUAL "lzma") + add_definitions(-DUSE_LZMA_COMPRESSION=1) + message(STATUS "Using Lempel–Ziv–Markov chain Algorithm compression") +elseif ("${GENERAL_COMPRESSOR}" STREQUAL "passthrough") add_definitions(-DUSE_PASSTHROUGH_COMPRESSION=1) message(STATUS "Using passthrough compression") elseif ("${GENERAL_COMPRESSOR}" STREQUAL "zstd") add_definitions(-DUSE_ZSTD_COMPRESSION=1) message(STATUS "Using Zstandard compression") -elseif ("${GENERAL_COMPRESSOR}" STREQUAL "lzma") - add_definitions(-DUSE_LZMA_COMPRESSION=1) - message(STATUS "Using 
Lempel–Ziv–Markov chain Algorithm compression") else() message(SEND_ERROR "GENERAL_COMPRESSOR=${GENERAL_COMPRESSOR} is unimplemented.") endif() @@ -228,17 +228,17 @@ else() endif() # Find and setup LZMA Library -# Notice that we don't have support to switch between static and shared libraries. -# TODO: add a script in ./cmake/Modules to resolve .a vs. .so +# TODO: Add support to enforce static linking against LZMA when desired. For a hack, we can set +# `CMAKE_FIND_LIBRARY_SUFFIXES` to ask CMake to prefer the static lib over the shared one. find_package(LibLZMA REQUIRED) if(LIBLZMA_FOUND) message(STATUS "Found Lzma ${LIBLZMA_VERSION_STRING}") message(STATUS "Lzma library location: ${LIBLZMA_LIBRARIES}") + message(STATUS "Lzma Include Dir: ${LIBLZMA_INCLUDE_DIRS}") else() message(FATAL_ERROR "Could not find ${CLP_LIBS_STRING} libraries for Lzma") endif() include_directories(${LIBLZMA_INCLUDE_DIRS}) -message("Lzma Include Dir: ${LIBLZMA_INCLUDE_DIRS}") # sqlite dependencies set(sqlite_DYNAMIC_LIBS "dl;m;pthread") From 1c5efcdbb3567c16d9ed14a02eab50525f8ea426 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 12 Dec 2024 01:30:35 -0500 Subject: [PATCH 30/65] Address review concern --- .../clp/streaming_compression/lzma/Compressor.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index dc2ca222f..1330da53f 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -21,19 +21,19 @@ auto is_flush_action(lzma_action action) -> bool; /** * Initializes the LZMA compression stream - * @param strm A pre-allocated `lzma_stream` object + * @param stream A pre-allocated `lzma_stream` object * @param compression_level * @param dict_size Dictionary size that specifies how many bytes of the * recently processed 
uncompressed data to keep in the memory */ -auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void; +auto init_lzma_encoder(lzma_stream* stream, int compression_level, size_t dict_size) -> void; auto is_flush_action(lzma_action action) -> bool { return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action || LZMA_FINISH == action; } -auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_size) -> void { +auto init_lzma_encoder(lzma_stream* stream, int compression_level, size_t dict_size) -> void { lzma_options_lzma options; if (0 != lzma_lzma_preset(&options, compression_level)) { SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); @@ -48,7 +48,7 @@ auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_siz // Initializes the encoder using a preset. Set the integrity to check to CRC64, which is the // default in the xz command line tool. If the .xz file needs to be decompressed with // XZ-Embedded, use LZMA_CHECK_CRC32 instead. 
- auto const rc{lzma_stream_encoder(strm, filters.data(), LZMA_CHECK_CRC64)}; + auto const rc = lzma_stream_encoder(stream, filters.data(), LZMA_CHECK_CRC64); if (LZMA_OK == rc) { return; @@ -71,8 +71,11 @@ auto init_lzma_encoder(lzma_stream* strm, int compression_level, size_t dict_siz msg = "Specified integrity check is not supported"; break; + case LZMA_PROG_ERROR: + msg = "Input arguments are not sane"; + break; + default: - // This is most likely LZMA_PROG_ERROR indicating a bug in liblzma msg = "Unknown error"; break; } From 856c7cb544a8122b8c9e7e9063d077a587da9913 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 12 Dec 2024 01:57:32 -0500 Subject: [PATCH 31/65] Update TODO --- components/core/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 9d0c51c9f..3b5f9aff4 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -228,8 +228,10 @@ else() endif() # Find and setup LZMA Library -# TODO: Add support to enforce static linking against LZMA when desired. For a hack, we can set -# `CMAKE_FIND_LIBRARY_SUFFIXES` to ask CMake to prefer the static lib over the shared one. 
+# TODO: Add a script in ./cmake/Modules to properly import LZMA in find_package()'s module mode +if(CLP_USE_STATIC_LIBS) + set(LibLZMA_USE_STATIC_LIBS ON) +endif() find_package(LibLZMA REQUIRED) if(LIBLZMA_FOUND) message(STATUS "Found Lzma ${LIBLZMA_VERSION_STRING}") From 43e22d2ec5a4480b6f02a0be31eec6f8efc5406c Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 12 Dec 2024 01:59:24 -0500 Subject: [PATCH 32/65] Case fix --- components/core/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 3b5f9aff4..160f6766d 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -230,7 +230,7 @@ endif() # Find and setup LZMA Library # TODO: Add a script in ./cmake/Modules to properly import LZMA in find_package()'s module mode if(CLP_USE_STATIC_LIBS) - set(LibLZMA_USE_STATIC_LIBS ON) + set(LIBLZMA_USE_STATIC_LIBS ON) endif() find_package(LibLZMA REQUIRED) if(LIBLZMA_FOUND) From 829a6b2d7c8bde7011c451022db7926593335ebd Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 12 Dec 2024 12:02:42 -0500 Subject: [PATCH 33/65] Remove unnecessary function inline comments --- .../streaming_compression/lzma/Compressor.cpp | 113 ++++++++++-------- .../streaming_compression/lzma/Compressor.hpp | 5 + .../core/tests/test-StreamingCompression.cpp | 15 ++- 3 files changed, 76 insertions(+), 57 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 1330da53f..7edd61ae9 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -8,6 +8,7 @@ #include #include +#include "../../Array.hpp" #include "../../ErrorCode.hpp" #include "../../FileWriter.hpp" #include "../../TraceableException.hpp" @@ -15,25 +16,68 @@ #include "Constants.hpp" namespace { +using clp::Array; using 
clp::streaming_compression::lzma::Compressor; +/** + * Attaches a pre-allocated block buffer to encoder's output stream + * + * Subsequent calls to this function resets the output buffer to its initial state. + * @param stream + * @param out_buffer + */ +auto attach_stream_output_buffer(lzma_stream* stream, Array& out_buffer) -> void; + +auto detach_stream_input_src(lzma_stream* stream) -> void; + +auto detach_stream_output_buffer(lzma_stream* stream) -> void; + auto is_flush_action(lzma_action action) -> bool; /** - * Initializes the LZMA compression stream - * @param stream A pre-allocated `lzma_stream` object + * Initializes an LZMA compression encoder and its streams + * + * @param stream A pre-allocated `lzma_stream` object that is to be initialized * @param compression_level * @param dict_size Dictionary size that specifies how many bytes of the * recently processed uncompressed data to keep in the memory + * @param check Type of integrity check calculated from the uncompressed data. LZMA_CHECK_CRC64 is + * the default in the xz command line tool. If the .xz file needs to be decompressed + * with XZ-Embedded, use LZMA_CHECK_CRC32 instead. 
*/ -auto init_lzma_encoder(lzma_stream* stream, int compression_level, size_t dict_size) -> void; +auto init_lzma_encoder( + lzma_stream* stream, + int compression_level, + size_t dict_size, + lzma_check check = LZMA_CHECK_CRC64 +) -> void; + +auto attach_stream_output_buffer(lzma_stream* stream, Array& out_buffer) -> void { + stream->next_out = out_buffer.data(); + stream->avail_out = out_buffer.size(); +} + +auto detach_stream_input_src(lzma_stream* stream) -> void { + stream->next_in = nullptr; + stream->avail_in = 0; +} + +auto detach_stream_output_buffer(lzma_stream* stream) -> void { + stream->next_out = nullptr; + stream->avail_out = 0; +} auto is_flush_action(lzma_action action) -> bool { return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action || LZMA_FINISH == action; } -auto init_lzma_encoder(lzma_stream* stream, int compression_level, size_t dict_size) -> void { +auto init_lzma_encoder( + lzma_stream* stream, + int compression_level, + size_t dict_size, + lzma_check check +) -> void { lzma_options_lzma options; if (0 != lzma_lzma_preset(&options, compression_level)) { SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); @@ -45,18 +89,11 @@ auto init_lzma_encoder(lzma_stream* stream, int compression_level, size_t dict_s {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, }}; - // Initializes the encoder using a preset. Set the integrity to check to CRC64, which is the - // default in the xz command line tool. If the .xz file needs to be decompressed with - // XZ-Embedded, use LZMA_CHECK_CRC32 instead. - auto const rc = lzma_stream_encoder(stream, filters.data(), LZMA_CHECK_CRC64); - + auto const rc = lzma_stream_encoder(stream, filters.data(), check); if (LZMA_OK == rc) { return; } - // Something went wrong. The possible errors are documented in lzma/container.h - // (src/liblzma/api/lzma/container.h in the source package or e.g. /usr/include/lzma/container.h - // depending on the install prefix). 
char const* msg{nullptr}; switch (rc) { case LZMA_MEM_ERROR: @@ -97,17 +134,9 @@ auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { m_compression_stream = LZMA_STREAM_INIT; init_lzma_encoder(&m_compression_stream, compression_level, m_dict_size); - - // No input upon initialization - m_compression_stream.next_in = nullptr; - m_compression_stream.avail_in = 0; - - // Attach output buffer to LZMA stream - m_compression_stream.next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream.avail_out = m_compressed_stream_block_buffer.size(); - + detach_stream_input_src(&m_compression_stream); + attach_stream_output_buffer(&m_compression_stream, m_compressed_stream_block_buffer); m_compressed_stream_file_writer = &file_writer; - m_uncompressed_stream_pos = 0; } @@ -122,14 +151,9 @@ auto Compressor::close() -> void { } flush_lzma(LZMA_FINISH); - // Deallocates LZMA stream's internal data structures lzma_end(&m_compression_stream); - - // Detach output buffer from LZMA stream - m_compression_stream.next_out = nullptr; - m_compression_stream.avail_out = 0; - + detach_stream_output_buffer(&m_compression_stream); m_compressed_stream_file_writer = nullptr; } @@ -139,7 +163,6 @@ auto Compressor::write(char const* data, size_t data_length) -> void { } if (0 == data_length) { - // Nothing needs to be done because we do not need to compress anything return; } @@ -147,16 +170,10 @@ auto Compressor::write(char const* data, size_t data_length) -> void { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - // Attach input data to LZMA stream m_compression_stream.next_in = clp::size_checked_pointer_cast(data); m_compression_stream.avail_in = data_length; - encode_lzma(); - - // All input data have been encoded so detach input data - m_compression_stream.next_in = nullptr; - m_compression_stream.avail_in = 0; - + detach_stream_input_src(&m_compression_stream); m_uncompressed_stream_pos += data_length; } @@ -178,7 +195,6 @@ auto 
Compressor::try_get_pos(size_t& pos) const -> ErrorCode { auto Compressor::encode_lzma() -> void { while (m_compression_stream.avail_in > 0) { - // Write output buffer to file if it's full if (0 == m_compression_stream.avail_out) { flush_stream_output_block_buffer(); } @@ -187,8 +203,10 @@ auto Compressor::encode_lzma() -> void { switch (rc) { case LZMA_OK: break; - case LZMA_BUF_ERROR: // No encoding progress can be made - SPDLOG_ERROR("LZMA compressor input stream is corrupt."); + case LZMA_BUF_ERROR: + SPDLOG_ERROR( + "LZMA compressor input stream is corrupt. No encoding progress can be made." + ); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: SPDLOG_ERROR( @@ -209,14 +227,8 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - /** - * Once flushing starts, the workflow action needs to stay the same until flushing is signaled - * complete by LZMA (aka LZMA_STREAM_END is reached). - * See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 - */ bool flushed{false}; while (false == flushed) { - // Write output buffer to file if it's full if (0 == m_compression_stream.avail_out) { flush_stream_output_block_buffer(); } @@ -230,10 +242,12 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { // LZMA_FULL_BARRIER. For now, we skip this check. flushed = true; break; - case LZMA_BUF_ERROR: // No encoding progress can be made + case LZMA_BUF_ERROR: // NOTE: this can happen if we are using LZMA_FULL_FLUSH or LZMA_FULL_BARRIER. These // two actions keeps encoding input data alongside flushing buffered encoded data. - SPDLOG_ERROR("LZMA compressor input stream is corrupt."); + SPDLOG_ERROR( + "LZMA compressor input stream is corrupt. No encoding progress can be made." 
+ ); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: SPDLOG_ERROR( @@ -244,20 +258,17 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { } } - // Write the last chunk of output flush_stream_output_block_buffer(); } auto Compressor::flush_stream_output_block_buffer() -> void { if (cCompressedStreamBlockBufferSize == m_compression_stream.avail_out) { - // Nothing to flush return; } m_compressed_stream_file_writer->write( clp::size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream.avail_out ); - m_compression_stream.next_out = m_compressed_stream_block_buffer.data(); - m_compression_stream.avail_out = cCompressedStreamBlockBufferSize; + attach_stream_output_buffer(&m_compression_stream, m_compressed_stream_block_buffer); } } // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index b4255cc1c..986137aa2 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -45,6 +45,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { /** * Initializes the compression stream with the given compression level + * * @param file_writer * @param compression_level */ @@ -105,6 +106,10 @@ class Compressor : public ::clp::streaming_compression::Compressor { * Invokes lzma_code() repeatedly with the given flushing action until all encoded data is made * available at the output block buffer * + * Once flushing starts, the workflow action needs to stay the same until flushing is signaled + * complete by LZMA (aka LZMA_STREAM_END is reached). + * See also: https://github.com/tukaani-project/xz/blob/master/src/liblzma/api/lzma/base.h#L274 + * * Assumes input stream and output block buffer are both in valid states. 
* @param flush_action * @throw `OperationFailed` if the provided action is not an LZMA flush diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index a52a42ef7..4076eb88f 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -44,7 +44,15 @@ constexpr auto cCompressionChunkSizes = std::to_array( cBufferSize} ); -auto compress(std::unique_ptr compressor, char const* const src) -> void { +auto compress(std::unique_ptr compressor, char const* src) -> void; + +auto decompress_and_compare( + std::unique_ptr decompressor, + Array const& uncompressed_buffer, + Array& decompressed_buffer +) -> void; + +auto compress(std::unique_ptr compressor, char const* src) -> void { FileWriter file_writer; file_writer.open(string(cCompressedFilePath), FileWriter::OpenMode::CREATE_FOR_WRITING); compressor->open(file_writer); @@ -84,7 +92,6 @@ auto decompress_and_compare( num_uncompressed_bytes += chunk_size; } - // Sanity check REQUIRE( (std::accumulate( cCompressionChunkSizes.cbegin(), @@ -97,14 +104,11 @@ auto decompress_and_compare( } // namespace TEST_CASE("StreamingCompression", "[StreamingCompression]") { - // Initialize constants constexpr size_t cAlphabetLength{26}; - // Initialize compression devices std::unique_ptr compressor; std::unique_ptr decompressor; - // Initialize buffers Array decompressed_buffer{cBufferSize}; Array uncompressed_buffer{cBufferSize}; for (size_t i{0}; i < cBufferSize; ++i) { @@ -130,6 +134,5 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { compress(std::move(compressor), uncompressed_buffer.data()); } - // Cleanup boost::filesystem::remove(string(cCompressedFilePath)); } From 81e180795cccd1b1d7380f989f0068e669d19b6b Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 12 Dec 2024 12:07:56 -0500 Subject: [PATCH 34/65] Improve comment --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 7edd61ae9..36a5038b4 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -238,7 +238,7 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { case LZMA_OK: break; case LZMA_STREAM_END: - // NOTE: this might not be true when multithreaded encoder is used with + // NOTE: flush may not have completed if a multithreaded encoder is using action // LZMA_FULL_BARRIER. For now, we skip this check. flushed = true; break; From 09b73c7ff6413066aa3d98a5694a04c130b50a4f Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 17 Dec 2024 00:56:19 -0500 Subject: [PATCH 35/65] Refactor lzma stream related functions into a nested helper class --- .../streaming_compression/lzma/Compressor.cpp | 194 +++++++----------- .../streaming_compression/lzma/Compressor.hpp | 56 ++++- 2 files changed, 123 insertions(+), 127 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 36a5038b4..4a43e93e8 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -8,134 +8,25 @@ #include #include -#include "../../Array.hpp" #include "../../ErrorCode.hpp" #include "../../FileWriter.hpp" #include "../../TraceableException.hpp" #include "../../type_utils.hpp" #include "Constants.hpp" -namespace { -using clp::Array; -using clp::streaming_compression::lzma::Compressor; - -/** - * Attaches a pre-allocated block buffer to encoder's output stream - * - * Subsequent calls to this function resets the output buffer to its initial state. 
- * @param stream - * @param out_buffer - */ -auto attach_stream_output_buffer(lzma_stream* stream, Array& out_buffer) -> void; - -auto detach_stream_input_src(lzma_stream* stream) -> void; - -auto detach_stream_output_buffer(lzma_stream* stream) -> void; - -auto is_flush_action(lzma_action action) -> bool; - -/** - * Initializes an LZMA compression encoder and its streams - * - * @param stream A pre-allocated `lzma_stream` object that is to be initialized - * @param compression_level - * @param dict_size Dictionary size that specifies how many bytes of the - * recently processed uncompressed data to keep in the memory - * @param check Type of integrity check calculated from the uncompressed data. LZMA_CHECK_CRC64 is - * the default in the xz command line tool. If the .xz file needs to be decompressed - * with XZ-Embedded, use LZMA_CHECK_CRC32 instead. - */ -auto init_lzma_encoder( - lzma_stream* stream, - int compression_level, - size_t dict_size, - lzma_check check = LZMA_CHECK_CRC64 -) -> void; - -auto attach_stream_output_buffer(lzma_stream* stream, Array& out_buffer) -> void { - stream->next_out = out_buffer.data(); - stream->avail_out = out_buffer.size(); -} - -auto detach_stream_input_src(lzma_stream* stream) -> void { - stream->next_in = nullptr; - stream->avail_in = 0; -} - -auto detach_stream_output_buffer(lzma_stream* stream) -> void { - stream->next_out = nullptr; - stream->avail_out = 0; -} - -auto is_flush_action(lzma_action action) -> bool { - return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action - || LZMA_FINISH == action; -} - -auto init_lzma_encoder( - lzma_stream* stream, - int compression_level, - size_t dict_size, - lzma_check check -) -> void { - lzma_options_lzma options; - if (0 != lzma_lzma_preset(&options, compression_level)) { - SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); - throw Compressor::OperationFailed(clp::ErrorCode_BadParam, __FILENAME__, __LINE__); - } - 
options.dict_size = dict_size; - std::array filters{{ - {.id = LZMA_FILTER_LZMA2, .options = &options}, - {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, - }}; - - auto const rc = lzma_stream_encoder(stream, filters.data(), check); - if (LZMA_OK == rc) { - return; - } - - char const* msg{nullptr}; - switch (rc) { - case LZMA_MEM_ERROR: - msg = "Memory allocation failed"; - break; - - case LZMA_OPTIONS_ERROR: - msg = "Specified preset is not supported"; - break; - - case LZMA_UNSUPPORTED_CHECK: - msg = "Specified integrity check is not supported"; - break; - - case LZMA_PROG_ERROR: - msg = "Input arguments are not sane"; - break; - - default: - msg = "Unknown error"; - break; - } - - SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, static_cast(rc)); - throw Compressor::OperationFailed(clp::ErrorCode_BadParam, __FILENAME__, __LINE__); -} -} // namespace - namespace clp::streaming_compression::lzma { auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { if (nullptr != m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); } - if (compression_level < cMinCompressionLevel || compression_level > cMaxCompressionLevel) { throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); } + m_compression_level = compression_level; - m_compression_stream = LZMA_STREAM_INIT; - init_lzma_encoder(&m_compression_stream, compression_level, m_dict_size); - detach_stream_input_src(&m_compression_stream); - attach_stream_output_buffer(&m_compression_stream, m_compressed_stream_block_buffer); + m_lzma_ops.init_lzma_encoder(); + m_lzma_ops.detach_input_src(); + m_lzma_ops.attach_output_buffer(); m_compressed_stream_file_writer = &file_writer; m_uncompressed_stream_pos = 0; } @@ -153,7 +44,7 @@ auto Compressor::close() -> void { flush_lzma(LZMA_FINISH); // Deallocates LZMA stream's internal data structures lzma_end(&m_compression_stream); - 
detach_stream_output_buffer(&m_compression_stream); + m_lzma_ops.detach_output_buffer(); m_compressed_stream_file_writer = nullptr; } @@ -173,7 +64,7 @@ auto Compressor::write(char const* data, size_t data_length) -> void { m_compression_stream.next_in = clp::size_checked_pointer_cast(data); m_compression_stream.avail_in = data_length; encode_lzma(); - detach_stream_input_src(&m_compression_stream); + m_lzma_ops.detach_input_src(); m_uncompressed_stream_pos += data_length; } @@ -188,7 +79,6 @@ auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { if (nullptr == m_compressed_stream_file_writer) { return ErrorCode_NotInit; } - pos = m_uncompressed_stream_pos; return ErrorCode_Success; } @@ -198,7 +88,6 @@ auto Compressor::encode_lzma() -> void { if (0 == m_compression_stream.avail_out) { flush_stream_output_block_buffer(); } - auto const rc = lzma_code(&m_compression_stream, LZMA_RUN); switch (rc) { case LZMA_OK: @@ -219,7 +108,7 @@ auto Compressor::encode_lzma() -> void { } auto Compressor::flush_lzma(lzma_action flush_action) -> void { - if (false == is_flush_action(flush_action)) { + if (false == LzmaStreamOperations::is_flush_action(flush_action)) { SPDLOG_ERROR( "lzma_code() supplied with invalid flush action - {}.", static_cast(flush_action) @@ -232,7 +121,6 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { if (0 == m_compression_stream.avail_out) { flush_stream_output_block_buffer(); } - auto const rc = lzma_code(&m_compression_stream, flush_action); switch (rc) { case LZMA_OK: @@ -257,7 +145,6 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } } - flush_stream_output_block_buffer(); } @@ -269,6 +156,71 @@ auto Compressor::flush_stream_output_block_buffer() -> void { clp::size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), cCompressedStreamBlockBufferSize - m_compression_stream.avail_out ); - 
attach_stream_output_buffer(&m_compression_stream, m_compressed_stream_block_buffer); + m_lzma_ops.attach_output_buffer(); +} + +auto Compressor::LzmaStreamOperations::is_flush_action(lzma_action action) -> bool { + return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action + || LZMA_FINISH == action; +} + +auto Compressor::LzmaStreamOperations::attach_output_buffer() -> void { + m_p->m_compression_stream.next_out = m_p->m_compressed_stream_block_buffer.data(); + m_p->m_compression_stream.avail_out = m_p->m_compressed_stream_block_buffer.size(); +} + +auto Compressor::LzmaStreamOperations::detach_input_src() -> void { + m_p->m_compression_stream.next_in = nullptr; + m_p->m_compression_stream.avail_in = 0; +} + +auto Compressor::LzmaStreamOperations::detach_output_buffer() -> void { + m_p->m_compression_stream.next_out = nullptr; + m_p->m_compression_stream.avail_out = 0; +} + +auto Compressor::LzmaStreamOperations::init_lzma_encoder(lzma_check check) -> void { + lzma_options_lzma options; + if (0 != lzma_lzma_preset(&options, m_p->m_compression_level)) { + SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); + } + options.dict_size = m_p->m_dict_size; + std::array filters{{ + {.id = LZMA_FILTER_LZMA2, .options = &options}, + {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, + }}; + + m_p->m_compression_stream = LZMA_STREAM_INIT; + auto const rc = lzma_stream_encoder(&m_p->m_compression_stream, filters.data(), check); + if (LZMA_OK == rc) { + return; + } + + char const* msg{nullptr}; + switch (rc) { + case LZMA_MEM_ERROR: + msg = "Memory allocation failed"; + break; + + case LZMA_OPTIONS_ERROR: + msg = "Specified preset is not supported"; + break; + + case LZMA_UNSUPPORTED_CHECK: + msg = "Specified integrity check is not supported"; + break; + + case LZMA_PROG_ERROR: + msg = "Input arguments are not sane"; + break; + + default: + msg = "Unknown 
error"; + break; + } + + SPDLOG_ERROR("Error initializing the encoder: {} (error code {})", msg, static_cast(rc)); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } } // namespace clp::streaming_compression::lzma diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 986137aa2..3e7af18ff 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -2,7 +2,7 @@ #define CLP_STREAMING_COMPRESSION_LZMA_COMPRESSOR_HPP #include -#include +#include #include @@ -89,6 +89,48 @@ class Compressor : public ::clp::streaming_compression::Compressor { } private: + class LzmaStreamOperations { + public: + // Constructor + LzmaStreamOperations(Compressor* parent) : m_p(parent) {} + + // Destructor + ~LzmaStreamOperations() = default; + + // Delete copy constructor and assignment operator + LzmaStreamOperations(LzmaStreamOperations const&) = delete; + auto operator=(LzmaStreamOperations const&) -> LzmaStreamOperations& = delete; + + // Default move constructor and assignment operator + LzmaStreamOperations(LzmaStreamOperations&&) noexcept = default; + auto operator=(LzmaStreamOperations&&) noexcept -> LzmaStreamOperations& = default; + + [[nodiscard]] static auto is_flush_action(lzma_action action) -> bool; + + /** + * Attaches a pre-allocated block buffer to the encoder's output stream + * + * Subsequent calls to this function resets the output buffer to its initial state. + */ + auto attach_output_buffer() -> void; + + auto detach_input_src() -> void; + + auto detach_output_buffer() -> void; + + /** + * Initializes an LZMA compression encoder and its streams + * + * @param check Type of integrity check calculated from the uncompressed data. + * LZMA_CHECK_CRC64 is the default in the xz command line tool. 
If the .xz file needs to be + * decompressed with XZ-Embedded, use LZMA_CHECK_CRC32 instead. + */ + auto init_lzma_encoder(lzma_check check = LZMA_CHECK_CRC64) -> void; + + private: + Compressor* m_p; + }; + static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB /** @@ -119,7 +161,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { /** * Flushes the current compressed data in the output block buffer to the output file handler. - * Reset the output block buffer to receive new data. + * + * Also resets the output block buffer to receive new data. */ auto flush_stream_output_block_buffer() -> void; @@ -127,11 +170,12 @@ class Compressor : public ::clp::streaming_compression::Compressor { FileWriter* m_compressed_stream_file_writer{nullptr}; // Compressed stream variables - lzma_stream m_compression_stream; - size_t m_dict_size{cDefaultDictionarySize}; - + LzmaStreamOperations m_lzma_ops{this}; Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; - + int m_compression_level{cDefaultCompressionLevel}; + lzma_stream m_compression_stream = LZMA_STREAM_INIT; + // Specifies how many bytes of the recently processed uncompressed data to keep in the memory + size_t m_dict_size{cDefaultDictionarySize}; size_t m_uncompressed_stream_pos{0}; }; } // namespace clp::streaming_compression::lzma From 7cedb25ef7831583219598374a14a433e80a9564 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 18 Dec 2024 21:27:35 -0500 Subject: [PATCH 36/65] Adress coderabbit suggestions --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 4a43e93e8..3e6bb0254 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -37,8 +37,9 @@ 
auto Compressor::close() -> void { } if (m_compression_stream.avail_in > 0) { - SPDLOG_ERROR("Tried to close LZMA compressor with unprocessed input data."); - throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); + SPDLOG_WARN("Trying to close LZMA compressor with unprocessed input data. Processing and " + "flushing remaining data."); + flush_lzma(LZMA_FULL_FLUSH); } flush_lzma(LZMA_FINISH); From 3dbe388342f6e4a8053f75bafcf75ed6a103b32b Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:54:47 -0500 Subject: [PATCH 37/65] feat(clp-package): Add support for deleting archives that are exclusively within a time range. (#594) Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../clp_package_utils/__init__.py | 13 ++ .../clp_package_utils/general.py | 2 +- .../clp_package_utils/scripts/compress.py | 10 +- .../clp_package_utils/scripts/decompress.py | 14 +- .../clp_package_utils/scripts/del_archives.py | 103 +++++++++++++ .../scripts/native/compress.py | 8 - .../scripts/native/decompress.py | 8 - .../scripts/native/del_archives.py | 139 ++++++++++++++++++ .../scripts/native/search.py | 8 - .../clp_package_utils/scripts/search.py | 10 +- .../clp_package_utils/scripts/start_clp.py | 10 +- .../clp_package_utils/scripts/stop_clp.py | 10 +- .../src/sbin/admin-tools/del-archives.sh | 9 ++ 13 files changed, 272 insertions(+), 72 deletions(-) create mode 100644 components/clp-package-utils/clp_package_utils/scripts/del_archives.py create mode 100644 components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py create mode 100755 components/package-template/src/sbin/admin-tools/del-archives.sh diff --git a/components/clp-package-utils/clp_package_utils/__init__.py b/components/clp-package-utils/clp_package_utils/__init__.py index e69de29bb..5253a87e5 100644 --- a/components/clp-package-utils/clp_package_utils/__init__.py +++ 
b/components/clp-package-utils/clp_package_utils/__init__.py @@ -0,0 +1,13 @@ +import logging + +# Set up console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter( + "%(asctime)s.%(msecs)03d %(levelname)s [%(module)s] %(message)s", datefmt="%Y-%m-%dT%H:%M:%S" +) +logging_console_handler.setFormatter(logging_formatter) + +# Set up root logger +root_logger = logging.getLogger() +root_logger.setLevel(logging.INFO) +root_logger.addHandler(logging_console_handler) diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index f42542ebc..5fae8166f 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -107,7 +107,7 @@ def get_clp_home(): return clp_home.resolve() -def generate_container_name(job_type: JobType) -> str: +def generate_container_name(job_type: str) -> str: """ :param job_type: :return: A unique container name for the given job type. 
diff --git a/components/clp-package-utils/clp_package_utils/scripts/compress.py b/components/clp-package-utils/clp_package_utils/scripts/compress.py index d0aa30913..efd3180ae 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/compress.py @@ -18,15 +18,7 @@ validate_and_load_db_credentials_file, ) -# Setup logging -# Create logger logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) def main(argv): @@ -66,7 +58,7 @@ def main(argv): logger.exception("Failed to load config.") return -1 - container_name = generate_container_name(JobType.COMPRESSION) + container_name = generate_container_name(str(JobType.COMPRESSION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( diff --git a/components/clp-package-utils/clp_package_utils/scripts/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/decompress.py index 9085fb162..325f2add6 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/decompress.py @@ -25,15 +25,7 @@ validate_path_could_be_dir, ) -# Setup logging -# Create logger -logger = logging.getLogger("clp") -logger.setLevel(logging.DEBUG) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) +logger = logging.getLogger(__file__) def validate_and_load_config( @@ 
-89,7 +81,7 @@ def handle_extract_file_cmd( if clp_config is None: return -1 - container_name = generate_container_name(JobType.FILE_EXTRACTION) + container_name = generate_container_name(str(JobType.FILE_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( container_clp_config, clp_config, container_name @@ -164,7 +156,7 @@ def handle_extract_stream_cmd( if clp_config is None: return -1 - container_name = generate_container_name(JobType.IR_EXTRACTION) + container_name = generate_container_name(str(JobType.IR_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( container_clp_config, clp_config, container_name diff --git a/components/clp-package-utils/clp_package_utils/scripts/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py new file mode 100644 index 000000000..54d959771 --- /dev/null +++ b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py @@ -0,0 +1,103 @@ +import argparse +import logging +import subprocess +import sys +from pathlib import Path + +from clp_package_utils.general import ( + CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, + dump_container_config, + generate_container_config, + generate_container_name, + generate_container_start_cmd, + get_clp_home, + load_config_file, + validate_and_load_db_credentials_file, +) + +logger = logging.getLogger(__file__) + + +def main(argv): + clp_home = get_clp_home() + default_config_file_path = clp_home / CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH + + args_parser = argparse.ArgumentParser( + description="Deletes archives that fall within the specified time range." 
+ ) + args_parser.add_argument( + "--config", + "-c", + default=str(default_config_file_path), + help="CLP package configuration file.", + ) + args_parser.add_argument( + "--begin-ts", + type=int, + default=0, + help="Time-range lower-bound (inclusive) as milliseconds from the UNIX epoch.", + ) + args_parser.add_argument( + "--end-ts", + type=int, + required=True, + help="Time-range upper-bound (include) as milliseconds from the UNIX epoch.", + ) + parsed_args = args_parser.parse_args(argv[1:]) + + # Validate and load config file + try: + config_file_path = Path(parsed_args.config) + clp_config = load_config_file(config_file_path, default_config_file_path, clp_home) + clp_config.validate_logs_dir() + + # Validate and load necessary credentials + validate_and_load_db_credentials_file(clp_config, clp_home, False) + except: + logger.exception("Failed to load config.") + return -1 + + # Validate the input timestamp + begin_ts = parsed_args.begin_ts + end_ts = parsed_args.end_ts + if begin_ts > end_ts: + logger.error("begin-ts must be <= end-ts") + return -1 + if end_ts < 0 or begin_ts < 0: + logger.error("begin_ts and end_ts must be non-negative.") + return -1 + + container_name = generate_container_name("del-archives") + + container_clp_config, mounts = generate_container_config(clp_config, clp_home) + generated_config_path_on_container, generated_config_path_on_host = dump_container_config( + container_clp_config, clp_config, container_name + ) + + necessary_mounts = [mounts.clp_home, mounts.logs_dir, mounts.archives_output_dir] + container_start_cmd = generate_container_start_cmd( + container_name, necessary_mounts, clp_config.execution_container + ) + + # fmt: off + del_archive_cmd = [ + "python3", + "-m", "clp_package_utils.scripts.native.del_archives", + "--config", str(generated_config_path_on_container), + str(begin_ts), + str(end_ts) + + ] + # fmt: on + + cmd = container_start_cmd + del_archive_cmd + subprocess.run(cmd, check=True) + + # Remove generated files 
+ generated_config_path_on_host.unlink() + + return 0 + + +if "__main__" == __name__: + sys.exit(main(sys.argv)) diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py index cb495204f..b6d9bb7eb 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/compress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/compress.py @@ -23,15 +23,7 @@ load_config_file, ) -# Setup logging -# Create logger logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) def print_compression_job_status(job_row, current_time): diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py index 7cce5d92a..d16cdcb6f 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py @@ -32,15 +32,7 @@ wait_for_query_job, ) -# Setup logging -# Create logger logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) def get_orig_file_id(db_config: Database, path: str) -> Optional[str]: diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py new file mode 100644 index 
000000000..735bf299d --- /dev/null +++ b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py @@ -0,0 +1,139 @@ +import argparse +import logging +import shutil +import sys +from contextlib import closing +from pathlib import Path +from typing import List + +from clp_py_utils.clp_config import Database +from clp_py_utils.sql_adapter import SQL_Adapter + +from clp_package_utils.general import ( + CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, + get_clp_home, + load_config_file, +) + +logger = logging.getLogger(__file__) + + +def main(argv): + clp_home = get_clp_home() + default_config_file_path = clp_home / CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH + + args_parser = argparse.ArgumentParser( + description="Deletes archives that fall within the specified time range." + ) + args_parser.add_argument( + "--config", + "-c", + required=True, + default=str(default_config_file_path), + help="CLP configuration file.", + ) + args_parser.add_argument( + "begin_ts", + type=int, + help="Time-range lower-bound (inclusive) as milliseconds from the UNIX epoch.", + ) + args_parser.add_argument( + "end_ts", + type=int, + help="Time-range upper-bound (include) as milliseconds from the UNIX epoch.", + ) + parsed_args = args_parser.parse_args(argv[1:]) + + # Validate and load config file + config_file_path = Path(parsed_args.config) + try: + clp_config = load_config_file(config_file_path, default_config_file_path, clp_home) + clp_config.validate_logs_dir() + except: + logger.exception("Failed to load config.") + return -1 + + database_config = clp_config.database + archives_dir = clp_config.archive_output.directory + if not archives_dir.exists(): + logger.error("`archive_output.directory` doesn't exist.") + return -1 + + return _delete_archives( + archives_dir, + database_config, + parsed_args.begin_ts, + parsed_args.end_ts, + ) + + +def _delete_archives( + archives_dir: Path, + database_config: Database, + begin_ts: int, + end_ts: int, +) -> int: + """ + Deletes all archives 
where `begin_ts <= archive.begin_timestamp` and + `archive.end_timestamp <= end_ts` from both the metadata database and disk. + :param archives_dir: + :param database_config: + :param begin_ts: + :param end_ts: + :return: 0 on success, -1 otherwise. + """ + + archive_ids: List[str] + logger.info("Starting to delete archives from the database.") + try: + sql_adapter = SQL_Adapter(database_config) + clp_db_connection_params = database_config.get_clp_connection_params_and_type(True) + table_prefix = clp_db_connection_params["table_prefix"] + with closing(sql_adapter.create_connection(True)) as db_conn, closing( + db_conn.cursor(dictionary=True) + ) as db_cursor: + db_cursor.execute( + f""" + DELETE FROM `{table_prefix}archives` + WHERE begin_timestamp >= %s AND end_timestamp <= %s + RETURNING id + """, + (begin_ts, end_ts), + ) + results = db_cursor.fetchall() + + if 0 == len(results): + logger.info("No archives (exclusively) within the specified time range.") + return 0 + + archive_ids = [result["id"] for result in results] + db_cursor.execute( + f""" + DELETE FROM `{table_prefix}files` + WHERE archive_id in ({', '.join(['%s'] * len(archive_ids))}) + """, + archive_ids, + ) + db_conn.commit() + except Exception: + logger.exception("Failed to delete archives from the database. Aborting deletion.") + return -1 + + logger.info(f"Finished deleting archives from the database.") + + for archive_id in archive_ids: + archive_path = archives_dir / archive_id + if not archive_path.is_dir(): + logger.warning(f"Archive {archive_id} is not a directory. 
Skipping deletion.") + continue + + logger.info(f"Deleting archive {archive_id} from disk.") + shutil.rmtree(archive_path) + + logger.info(f"Finished deleting archives from disk.") + + return 0 + + +if "__main__" == __name__: + sys.exit(main(sys.argv)) diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/search.py b/components/clp-package-utils/clp_package_utils/scripts/native/search.py index 7dd247fa5..d166cf35f 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/search.py @@ -26,15 +26,7 @@ wait_for_query_job, ) -# Setup logging -# Create logger logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) def create_and_monitor_job_in_db( diff --git a/components/clp-package-utils/clp_package_utils/scripts/search.py b/components/clp-package-utils/clp_package_utils/scripts/search.py index f3f02046d..beb7fb0b0 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/search.py @@ -20,15 +20,7 @@ validate_and_load_db_credentials_file, ) -# Setup logging -# Create logger logger = logging.getLogger(__file__) -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) def main(argv): @@ -82,7 +74,7 @@ def main(argv): logger.exception("Failed to load config.") return -1 - container_name = generate_container_name(JobType.SEARCH) + container_name = 
generate_container_name(str(JobType.SEARCH)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 6732ded0b..8097929f1 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -59,15 +59,7 @@ validate_worker_config, ) -# Setup logging -# Create logger -logger = logging.getLogger("clp") -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) +logger = logging.getLogger(__file__) def container_exists(container_name): diff --git a/components/clp-package-utils/clp_package_utils/scripts/stop_clp.py b/components/clp-package-utils/clp_package_utils/scripts/stop_clp.py index f100a098a..a55d7a795 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/stop_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/stop_clp.py @@ -31,15 +31,7 @@ validate_and_load_queue_credentials_file, ) -# Setup logging -# Create logger -logger = logging.getLogger("clp") -logger.setLevel(logging.INFO) -# Setup console logging -logging_console_handler = logging.StreamHandler() -logging_formatter = logging.Formatter("%(asctime)s [%(levelname)s] [%(name)s] %(message)s") -logging_console_handler.setFormatter(logging_formatter) -logger.addHandler(logging_console_handler) +logger = logging.getLogger(__file__) def stop_running_container(container_name: str, already_exited_containers: List[str], force: bool): diff --git 
a/components/package-template/src/sbin/admin-tools/del-archives.sh b/components/package-template/src/sbin/admin-tools/del-archives.sh new file mode 100755 index 000000000..4d7ebc6b7 --- /dev/null +++ b/components/package-template/src/sbin/admin-tools/del-archives.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +package_root="$script_dir/../.." + +PYTHONPATH=$(readlink -f "$package_root/lib/python3/site-packages") \ + python3 \ + -m clp_package_utils.scripts.del_archives \ + "$@" From 5d900544aff94dc0ab1c38205a9dbfb4acc169f6 Mon Sep 17 00:00:00 2001 From: wraymo <37269683+wraymo@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:36:34 -0500 Subject: [PATCH 38/65] feat(clp-s): Add the write path for single-file archives. (#563) Co-authored-by: Devin Gibson --- components/core/src/clp_s/ArchiveWriter.cpp | 192 +++++++++++++++--- components/core/src/clp_s/ArchiveWriter.hpp | 66 +++++- .../core/src/clp_s/CommandLineArguments.cpp | 4 + .../core/src/clp_s/CommandLineArguments.hpp | 3 + components/core/src/clp_s/JsonParser.cpp | 1 + components/core/src/clp_s/JsonParser.hpp | 1 + .../core/src/clp_s/SingleFileArchiveDefs.hpp | 59 ++++++ .../src/clp_s/TimestampDictionaryWriter.cpp | 60 ++---- .../src/clp_s/TimestampDictionaryWriter.hpp | 43 ++-- components/core/src/clp_s/TimestampEntry.cpp | 23 ++- components/core/src/clp_s/TimestampEntry.hpp | 6 +- components/core/src/clp_s/Utils.hpp | 12 ++ .../core/src/clp_s/archive_constants.hpp | 3 + components/core/src/clp_s/clp-s.cpp | 1 + 14 files changed, 359 insertions(+), 115 deletions(-) create mode 100644 components/core/src/clp_s/SingleFileArchiveDefs.hpp diff --git a/components/core/src/clp_s/ArchiveWriter.cpp b/components/core/src/clp_s/ArchiveWriter.cpp index 7118ce88b..d627479de 100644 --- a/components/core/src/clp_s/ArchiveWriter.cpp +++ b/components/core/src/clp_s/ArchiveWriter.cpp @@ -1,6 +1,8 @@ #include "ArchiveWriter.hpp" #include +#include 
+#include #include @@ -13,18 +15,23 @@ void ArchiveWriter::open(ArchiveWriterOption const& option) { m_id = boost::uuids::to_string(option.id); m_compression_level = option.compression_level; m_print_archive_stats = option.print_archive_stats; + m_single_file_archive = option.single_file_archive; m_min_table_size = option.min_table_size; - auto archive_path = boost::filesystem::path(option.archives_dir) / m_id; + m_archives_dir = option.archives_dir; + std::string working_dir_name = m_id; + if (option.single_file_archive) { + working_dir_name += constants::cTmpPostfix; + } + auto archive_path = std::filesystem::path(option.archives_dir) / working_dir_name; - boost::system::error_code boost_error_code; - bool path_exists = boost::filesystem::exists(archive_path, boost_error_code); - if (path_exists) { + std::error_code ec; + if (std::filesystem::exists(archive_path, ec)) { SPDLOG_ERROR("Archive path already exists: {}", archive_path.c_str()); throw OperationFailed(ErrorCodeUnsupported, __FILENAME__, __LINE__); } m_archive_path = archive_path.string(); - if (false == boost::filesystem::create_directory(m_archive_path)) { + if (false == std::filesystem::create_directory(m_archive_path, ec)) { throw OperationFailed(ErrorCodeErrno, __FILENAME__, __LINE__); } @@ -39,20 +46,42 @@ void ArchiveWriter::open(ArchiveWriterOption const& option) { std::string array_dict_path = m_archive_path + constants::cArchiveArrayDictFile; m_array_dict = std::make_shared(); m_array_dict->open(array_dict_path, m_compression_level, UINT64_MAX); - - std::string timestamp_dict_path = m_archive_path + constants::cArchiveTimestampDictFile; - m_timestamp_dict = std::make_shared(); - m_timestamp_dict->open(timestamp_dict_path, m_compression_level); } void ArchiveWriter::close() { - m_compressed_size += m_var_dict->close(); - m_compressed_size += m_log_dict->close(); - m_compressed_size += m_array_dict->close(); - m_compressed_size += m_timestamp_dict->close(); - m_compressed_size += 
m_schema_tree.store(m_archive_path, m_compression_level); - m_compressed_size += m_schema_map.store(m_archive_path, m_compression_level); - m_compressed_size += store_tables(); + auto var_dict_compressed_size = m_var_dict->close(); + auto log_dict_compressed_size = m_log_dict->close(); + auto array_dict_compressed_size = m_array_dict->close(); + auto schema_tree_compressed_size = m_schema_tree.store(m_archive_path, m_compression_level); + auto schema_map_compressed_size = m_schema_map.store(m_archive_path, m_compression_level); + auto [table_metadata_compressed_size, table_compressed_size] = store_tables(); + + if (m_single_file_archive) { + std::vector files{ + {constants::cArchiveSchemaTreeFile, schema_tree_compressed_size}, + {constants::cArchiveSchemaMapFile, schema_map_compressed_size}, + {constants::cArchiveTableMetadataFile, table_metadata_compressed_size}, + {constants::cArchiveVarDictFile, var_dict_compressed_size}, + {constants::cArchiveLogDictFile, log_dict_compressed_size}, + {constants::cArchiveArrayDictFile, array_dict_compressed_size}, + {constants::cArchiveTablesFile, table_compressed_size} + }; + uint64_t offset = 0; + for (auto& file : files) { + uint64_t original_size = file.o; + file.o = offset; + offset += original_size; + } + write_single_file_archive(files); + } else { + // Timestamp dictionary written separately here until we transition to moving it inside of + // the metadata region of multi-file archives. 
+ auto timestamp_dict_compressed_size = write_timestamp_dict(); + m_compressed_size = var_dict_compressed_size + log_dict_compressed_size + + array_dict_compressed_size + timestamp_dict_compressed_size + + schema_tree_compressed_size + schema_map_compressed_size + + table_metadata_compressed_size + table_compressed_size; + } if (m_metadata_db) { update_metadata_db(); @@ -65,12 +94,130 @@ void ArchiveWriter::close() { m_id_to_schema_writer.clear(); m_schema_tree.clear(); m_schema_map.clear(); + m_timestamp_dict.clear(); m_encoded_message_size = 0UL; m_uncompressed_size = 0UL; m_compressed_size = 0UL; m_next_log_event_id = 0; } +size_t ArchiveWriter::write_timestamp_dict() { + std::string timestamp_dict_path = m_archive_path + constants::cArchiveTimestampDictFile; + FileWriter timestamp_dict_file_writer; + ZstdCompressor timestamp_dict_compressor; + timestamp_dict_file_writer.open(timestamp_dict_path, FileWriter::OpenMode::CreateForWriting); + timestamp_dict_compressor.open(timestamp_dict_file_writer, m_compression_level); + std::stringstream timestamp_dict_stream; + m_timestamp_dict.write(timestamp_dict_stream); + std::string encoded_timestamp_dict = timestamp_dict_stream.str(); + timestamp_dict_compressor.write(encoded_timestamp_dict.data(), encoded_timestamp_dict.size()); + timestamp_dict_compressor.close(); + auto compressed_size = timestamp_dict_file_writer.get_pos(); + timestamp_dict_file_writer.close(); + return compressed_size; +} + +void ArchiveWriter::write_single_file_archive(std::vector const& files) { + std::string single_file_archive_path = (std::filesystem::path(m_archives_dir) / m_id).string(); + FileWriter archive_writer; + archive_writer.open(single_file_archive_path, FileWriter::OpenMode::CreateForWriting); + + write_archive_metadata(archive_writer, files); + size_t metadata_section_size = archive_writer.get_pos() - sizeof(ArchiveHeader); + write_archive_files(archive_writer, files); + m_compressed_size = archive_writer.get_pos(); + 
write_archive_header(archive_writer, metadata_section_size); + + archive_writer.close(); + std::error_code ec; + if (false == std::filesystem::remove(m_archive_path, ec)) { + throw OperationFailed(ErrorCodeFileExists, __FILENAME__, __LINE__); + } +} + +void ArchiveWriter::write_archive_metadata( + FileWriter& archive_writer, + std::vector const& files +) { + archive_writer.seek_from_begin(sizeof(ArchiveHeader)); + + ZstdCompressor compressor; + compressor.open(archive_writer, m_compression_level); + compressor.write_numeric_value(static_cast(3U)); // Number of packets + + // Write archive info + ArchiveInfoPacket archive_info{.num_segments = 1}; + std::stringstream msgpack_buffer; + msgpack::pack(msgpack_buffer, archive_info); + std::string archive_info_str = msgpack_buffer.str(); + compressor.write_numeric_value(ArchiveMetadataPacketType::ArchiveInfo); + compressor.write_numeric_value(static_cast(archive_info_str.size())); + compressor.write_string(archive_info_str); + + // Write archive file info + ArchiveFileInfoPacket archive_file_info{.files{files}}; + msgpack_buffer = std::stringstream{}; + msgpack::pack(msgpack_buffer, archive_file_info); + std::string archive_file_info_str = msgpack_buffer.str(); + compressor.write_numeric_value(ArchiveMetadataPacketType::ArchiveFileInfo); + compressor.write_numeric_value(static_cast(archive_file_info_str.size())); + compressor.write_string(archive_file_info_str); + + // Write timestamp dictionary + compressor.write_numeric_value(ArchiveMetadataPacketType::TimestampDictionary); + std::stringstream timestamp_dict_stream; + m_timestamp_dict.write(timestamp_dict_stream); + std::string encoded_timestamp_dict = timestamp_dict_stream.str(); + compressor.write_numeric_value(static_cast(encoded_timestamp_dict.size())); + compressor.write(encoded_timestamp_dict.data(), encoded_timestamp_dict.size()); + + compressor.close(); +} + +void ArchiveWriter::write_archive_files( + FileWriter& archive_writer, + std::vector const& files +) { + 
FileReader reader; + for (auto const& file : files) { + std::string file_path = m_archive_path + file.n; + reader.open(file_path); + char read_buffer[cReadBlockSize]; + while (true) { + size_t num_bytes_read{0}; + ErrorCode const error_code + = reader.try_read(read_buffer, cReadBlockSize, num_bytes_read); + if (ErrorCodeEndOfFile == error_code) { + break; + } else if (ErrorCodeSuccess != error_code) { + throw OperationFailed(error_code, __FILENAME__, __LINE__); + } + archive_writer.write(read_buffer, num_bytes_read); + } + reader.close(); + if (false == std::filesystem::remove(file_path)) { + throw OperationFailed(ErrorCodeFileExists, __FILENAME__, __LINE__); + } + } +} + +void ArchiveWriter::write_archive_header(FileWriter& archive_writer, size_t metadata_section_size) { + ArchiveHeader header{ + .magic_number{0}, + .version + = (cArchiveMajorVersion << 24) | (cArchiveMinorVersion << 16) | cArchivePatchVersion, + .uncompressed_size = m_uncompressed_size, + .compressed_size = m_compressed_size, + .reserved_padding{0}, + .metadata_section_size = static_cast(metadata_section_size), + .compression_type = static_cast(ArchiveCompressionType::Zstd), + .padding = 0 + }; + std::memcpy(&header.magic_number, cStructuredSFAMagicNumber, sizeof(header.magic_number)); + archive_writer.seek_from_begin(0); + archive_writer.write(reinterpret_cast(&header), sizeof(header)); +} + void ArchiveWriter::append_message( int32_t schema_id, Schema const& schema, @@ -132,8 +279,7 @@ void ArchiveWriter::initialize_schema_writer(SchemaWriter* writer, Schema const& } } -size_t ArchiveWriter::store_tables() { - size_t compressed_size = 0; +std::pair ArchiveWriter::store_tables() { m_tables_file_writer.open( m_archive_path + constants::cArchiveTablesFile, FileWriter::OpenMode::CreateForWriting @@ -243,13 +389,13 @@ size_t ArchiveWriter::store_tables() { } m_table_metadata_compressor.close(); - compressed_size += m_table_metadata_file_writer.get_pos(); - compressed_size += 
m_tables_file_writer.get_pos(); + auto table_metadata_compressed_size = m_table_metadata_file_writer.get_pos(); + auto table_compressed_size = m_tables_file_writer.get_pos(); m_table_metadata_file_writer.close(); m_tables_file_writer.close(); - return compressed_size; + return {table_metadata_compressed_size, table_compressed_size}; } void ArchiveWriter::update_metadata_db() { @@ -262,8 +408,8 @@ void ArchiveWriter::update_metadata_db() { metadata.increment_static_compressed_size(m_compressed_size); metadata.increment_static_uncompressed_size(m_uncompressed_size); metadata.expand_time_range( - m_timestamp_dict->get_begin_timestamp(), - m_timestamp_dict->get_end_timestamp() + m_timestamp_dict.get_begin_timestamp(), + m_timestamp_dict.get_end_timestamp() ); m_metadata_db->add_archive(m_id, metadata); diff --git a/components/core/src/clp_s/ArchiveWriter.hpp b/components/core/src/clp_s/ArchiveWriter.hpp index 87e9d11e5..3b13f4426 100644 --- a/components/core/src/clp_s/ArchiveWriter.hpp +++ b/components/core/src/clp_s/ArchiveWriter.hpp @@ -14,6 +14,7 @@ #include "SchemaMap.hpp" #include "SchemaTree.hpp" #include "SchemaWriter.hpp" +#include "SingleFileArchiveDefs.hpp" #include "TimestampDictionaryWriter.hpp" namespace clp_s { @@ -22,6 +23,7 @@ struct ArchiveWriterOption { std::string archives_dir; int compression_level; bool print_archive_stats; + bool single_file_archive; size_t min_table_size; }; @@ -125,7 +127,7 @@ class ArchiveWriter { std::string const& timestamp, uint64_t& pattern_id ) { - return m_timestamp_dict->ingest_entry(key, node_id, timestamp, pattern_id); + return m_timestamp_dict.ingest_entry(key, node_id, timestamp, pattern_id); } /** @@ -135,21 +137,24 @@ class ArchiveWriter { * @param timestamp */ void ingest_timestamp_entry(std::string const& key, int32_t node_id, double timestamp) { - m_timestamp_dict->ingest_entry(key, node_id, timestamp); + m_timestamp_dict.ingest_entry(key, node_id, timestamp); } void ingest_timestamp_entry(std::string const& 
key, int32_t node_id, int64_t timestamp) { - m_timestamp_dict->ingest_entry(key, node_id, timestamp); + m_timestamp_dict.ingest_entry(key, node_id, timestamp); } /** - * Increments the size of the compressed data written to the archive + * Increments the size of the original (uncompressed) logs ingested into the archive. This size + * tracks the raw input size before any encoding or compression. * @param size */ void increment_uncompressed_size(size_t size) { m_uncompressed_size += size; } /** - * @return Size of the uncompressed data written to the archive + * @return The total size of the encoded (uncompressed) data written to the archive. This + * reflects the size of the data after encoding but before compression. + * TODO: Add the size of schema tree, schema map and timestamp dictionary */ size_t get_data_size(); @@ -162,10 +167,40 @@ class ArchiveWriter { void initialize_schema_writer(SchemaWriter* writer, Schema const& schema); /** - * Stores the tables - * @return Size of the compressed data in bytes + * Compresses and stores the tables. + * @return A pair containing: + * - The size of the compressed table metadata in bytes. + * - The size of the compressed tables in bytes. 
*/ - [[nodiscard]] size_t store_tables(); + [[nodiscard]] std::pair store_tables(); + + /** + * Writes the archive to a single file + * @param files + */ + void write_single_file_archive(std::vector const& files); + + /** + * Writes the metadata section of the single file archive + * @param archive_writer + * @param files + */ + void + write_archive_metadata(FileWriter& archive_writer, std::vector const& files); + + /** + * Writes the file section of the single file archive + * @param archive_writer + * @param files + */ + void write_archive_files(FileWriter& archive_writer, std::vector const& files); + + /** + * Writes the header section of the single file archive + * @param archive_writer + * @param metadata_section_size + */ + void write_archive_header(FileWriter& archive_writer, size_t metadata_section_size); /** * Updates the metadata db with the archive's metadata (id, size, timestamp ranges, etc.) @@ -177,6 +212,17 @@ class ArchiveWriter { */ void print_archive_stats(); + /** + * Write the timestamp dictionary as a dedicated file for multi-file archives. + * + * Note: the timestamp dictionary will be moved into the metadata region of multi-file archives + * in a follow-up PR. 
+ * @return the compressed size of the Timestamp Dictionary in bytes + */ + size_t write_timestamp_dict(); + + static constexpr size_t cReadBlockSize = 4 * 1024; + size_t m_encoded_message_size{}; size_t m_uncompressed_size{}; size_t m_compressed_size{}; @@ -184,16 +230,18 @@ class ArchiveWriter { std::string m_id; + std::string m_archives_dir; std::string m_archive_path; std::string m_encoded_messages_dir; std::shared_ptr m_var_dict; std::shared_ptr m_log_dict; std::shared_ptr m_array_dict; // log type dictionary for arrays - std::shared_ptr m_timestamp_dict; + TimestampDictionaryWriter m_timestamp_dict; std::shared_ptr m_metadata_db; int m_compression_level{}; bool m_print_archive_stats{}; + bool m_single_file_archive{}; size_t m_min_table_size{}; SchemaMap m_schema_map; diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index d174b4a23..99539b627 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -190,6 +190,10 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { "print-archive-stats", po::bool_switch(&m_print_archive_stats), "Print statistics (json) about the archive after it's compressed." + )( + "single-file-archive", + po::bool_switch(&m_single_file_archive), + "Create a single archive file instead of multiple files." 
)( "structurize-arrays", po::bool_switch(&m_structurize_arrays), diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp index 913e27fbc..a87e9b6bd 100644 --- a/components/core/src/clp_s/CommandLineArguments.hpp +++ b/components/core/src/clp_s/CommandLineArguments.hpp @@ -102,6 +102,8 @@ class CommandLineArguments { OutputHandlerType get_output_handler_type() const { return m_output_handler_type; } + bool get_single_file_archive() const { return m_single_file_archive; } + bool get_structurize_arrays() const { return m_structurize_arrays; } bool get_ordered_decompression() const { return m_ordered_decompression; } @@ -176,6 +178,7 @@ class CommandLineArguments { size_t m_target_encoded_size{8ULL * 1024 * 1024 * 1024}; // 8 GiB bool m_print_archive_stats{false}; size_t m_max_document_size{512ULL * 1024 * 1024}; // 512 MB + bool m_single_file_archive{false}; bool m_structurize_arrays{false}; bool m_ordered_decompression{false}; size_t m_target_ordered_chunk_size{}; diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index 9e8293510..d14a221b3 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -37,6 +37,7 @@ JsonParser::JsonParser(JsonParserOption const& option) m_archive_options.archives_dir = option.archives_dir; m_archive_options.compression_level = option.compression_level; m_archive_options.print_archive_stats = option.print_archive_stats; + m_archive_options.single_file_archive = option.single_file_archive; m_archive_options.min_table_size = option.min_table_size; m_archive_options.id = m_generator(); diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index d7cc5a2fe..bfd423c22 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -38,6 +38,7 @@ struct JsonParserOption { bool print_archive_stats{}; bool 
structurize_arrays{}; bool record_log_order{true}; + bool single_file_archive{false}; std::shared_ptr metadata_db; }; diff --git a/components/core/src/clp_s/SingleFileArchiveDefs.hpp b/components/core/src/clp_s/SingleFileArchiveDefs.hpp new file mode 100644 index 000000000..7eabeb6db --- /dev/null +++ b/components/core/src/clp_s/SingleFileArchiveDefs.hpp @@ -0,0 +1,59 @@ +#ifndef CLP_S_ARCHIVEDEFS_HPP +#define CLP_S_ARCHIVEDEFS_HPP + +#include + +#include "msgpack.hpp" + +namespace clp_s { +// define the version +constexpr uint8_t cArchiveMajorVersion = 0; +constexpr uint8_t cArchiveMinorVersion = 2; +constexpr uint16_t cArchivePatchVersion = 0; + +// define the magic number +constexpr uint8_t cStructuredSFAMagicNumber[] = {0xFD, 0x2F, 0xC5, 0x30}; + +struct ArchiveHeader { + uint8_t magic_number[4]; + uint32_t version; + uint64_t uncompressed_size; + uint64_t compressed_size; + uint64_t reserved_padding[4]; + uint32_t metadata_section_size; + uint16_t compression_type; + uint16_t padding; +}; + +enum class ArchiveCompressionType : uint16_t { + Zstd = 0, +}; + +enum struct ArchiveMetadataPacketType : uint8_t { + ArchiveInfo = 0, + ArchiveFileInfo = 1, + TimestampDictionary = 2, +}; + +struct ArchiveInfoPacket { + uint64_t num_segments; + // TODO: Add more fields in the future + + MSGPACK_DEFINE_MAP(num_segments); +}; + +struct ArchiveFileInfo { + std::string n; // name + uint64_t o; // offset + + MSGPACK_DEFINE_MAP(n, o); +}; + +struct ArchiveFileInfoPacket { + std::vector files; + + MSGPACK_DEFINE_MAP(files); +}; +} // namespace clp_s + +#endif // CLP_S_ARCHIVEDEFS_HPP diff --git a/components/core/src/clp_s/TimestampDictionaryWriter.cpp b/components/core/src/clp_s/TimestampDictionaryWriter.cpp index 7b02fd3a5..39e66a6af 100644 --- a/components/core/src/clp_s/TimestampDictionaryWriter.cpp +++ b/components/core/src/clp_s/TimestampDictionaryWriter.cpp @@ -1,63 +1,34 @@ #include "TimestampDictionaryWriter.hpp" +#include + #include "Utils.hpp" namespace clp_s { void 
TimestampDictionaryWriter::write_timestamp_entries( std::map const& ranges, - ZstdCompressor& compressor + std::stringstream& stream ) { - compressor.write_numeric_value(ranges.size()); + write_numeric_value(stream, ranges.size()); for (auto const& range : ranges) { - range.second.write_to_file(compressor); + range.second.write_to_stream(stream); } } -void TimestampDictionaryWriter::write_and_flush_to_disk() { - write_timestamp_entries(m_column_key_to_range, m_dictionary_compressor); +void TimestampDictionaryWriter::write(std::stringstream& stream) { + merge_range(); + write_timestamp_entries(m_column_key_to_range, stream); - m_dictionary_compressor.write_numeric_value(m_pattern_to_id.size()); + write_numeric_value(stream, m_pattern_to_id.size()); for (auto& it : m_pattern_to_id) { // write pattern ID - m_dictionary_compressor.write_numeric_value(it.second); + write_numeric_value(stream, it.second); std::string const& pattern = it.first->get_format(); - m_dictionary_compressor.write_numeric_value(pattern.length()); - m_dictionary_compressor.write_string(pattern); - } - - m_dictionary_compressor.flush(); - m_dictionary_file_writer.flush(); -} - -void TimestampDictionaryWriter::open(std::string const& dictionary_path, int compression_level) { - if (m_is_open) { - throw OperationFailed(ErrorCodeNotReady, __FILENAME__, __LINE__); - } - - m_dictionary_file_writer.open(dictionary_path, FileWriter::OpenMode::CreateForWriting); - m_dictionary_compressor.open(m_dictionary_file_writer, compression_level); - - m_next_id = 0; - m_is_open = true; -} - -size_t TimestampDictionaryWriter::close() { - if (false == m_is_open) { - throw OperationFailed(ErrorCodeNotInit, __FILENAME__, __LINE__); + write_numeric_value(stream, pattern.length()); + stream.write(pattern.data(), pattern.size()); } - - // merge before writing overall archive because this - // happens before the last sub-archive is written - merge_range(); - write_and_flush_to_disk(); - m_dictionary_compressor.close(); - 
size_t compressed_size = m_dictionary_file_writer.get_pos(); - m_dictionary_file_writer.close(); - - m_is_open = false; - return compressed_size; } uint64_t TimestampDictionaryWriter::get_pattern_id(TimestampPattern const* pattern) { @@ -180,4 +151,11 @@ epochtime_t TimestampDictionaryWriter::get_end_timestamp() const { return it->second.get_end_timestamp(); } + +void TimestampDictionaryWriter::clear() { + m_next_id = 0; + m_pattern_to_id.clear(); + m_column_key_to_range.clear(); + m_column_id_to_range.clear(); +} } // namespace clp_s diff --git a/components/core/src/clp_s/TimestampDictionaryWriter.hpp b/components/core/src/clp_s/TimestampDictionaryWriter.hpp index 81266b187..29288fd48 100644 --- a/components/core/src/clp_s/TimestampDictionaryWriter.hpp +++ b/components/core/src/clp_s/TimestampDictionaryWriter.hpp @@ -1,15 +1,15 @@ #ifndef CLP_S_TIMESTAMPDICTIONARYWRITER_HPP #define CLP_S_TIMESTAMPDICTIONARYWRITER_HPP +#include +#include #include #include #include -#include "FileWriter.hpp" #include "SchemaTree.hpp" #include "TimestampEntry.hpp" #include "TimestampPattern.hpp" -#include "ZstdCompressor.hpp" namespace clp_s { class TimestampDictionaryWriter { @@ -23,25 +23,13 @@ class TimestampDictionaryWriter { }; // Constructors - TimestampDictionaryWriter() : m_is_open(false) {} + TimestampDictionaryWriter() {} /** - * Opens the timestamp dictionary for writing - * @param dictionary_path - * @param compression_level + * Writes the timestamp dictionary to a buffered stream. 
+ * @param stream */ - void open(std::string const& dictionary_path, int compression_level); - - /** - * Closes the timestamp dictionary - * @return the compressed size of the global timestamp dictionary in bytes - */ - [[nodiscard]] size_t close(); - - /** - * Writes the timestamp dictionary to disk - */ - void write_and_flush_to_disk(); + void write(std::stringstream& stream); /** * Gets the pattern id for a given pattern @@ -91,33 +79,30 @@ class TimestampDictionaryWriter { */ epochtime_t get_end_timestamp() const; + /** + * Clears and resets all internal state. + */ + void clear(); + private: /** - * Merges timestamp ranges with the same key name + * Merges timestamp ranges with the same key name but different node ids. */ void merge_range(); /** - * Writes timestamp entries to the disk + * Writes timestamp entries to a buffered stream. * @param ranges * @param compressor */ static void write_timestamp_entries( std::map const& ranges, - ZstdCompressor& compressor + std::stringstream& stream ); using pattern_to_id_t = std::unordered_map; // Variables - bool m_is_open; - - // Variables related to on-disk storage - FileWriter m_dictionary_file_writer; - ZstdCompressor m_dictionary_compressor; - FileWriter m_dictionary_file_writer_local; - ZstdCompressor m_dictionary_compressor_local; - pattern_to_id_t m_pattern_to_id; uint64_t m_next_id{}; diff --git a/components/core/src/clp_s/TimestampEntry.cpp b/components/core/src/clp_s/TimestampEntry.cpp index 54b27d22e..19d422066 100644 --- a/components/core/src/clp_s/TimestampEntry.cpp +++ b/components/core/src/clp_s/TimestampEntry.cpp @@ -1,6 +1,9 @@ #include "TimestampEntry.hpp" #include +#include + +#include "Utils.hpp" namespace clp_s { void TimestampEntry::ingest_timestamp(epochtime_t timestamp) { @@ -54,21 +57,21 @@ void TimestampEntry::merge_range(TimestampEntry const& entry) { } } -void TimestampEntry::write_to_file(ZstdCompressor& compressor) const { - compressor.write_numeric_value(m_key_name.size()); - 
compressor.write_string(m_key_name); - compressor.write_numeric_value(m_column_ids.size()); +void TimestampEntry::write_to_stream(std::stringstream& stream) const { + write_numeric_value(stream, m_key_name.size()); + stream.write(m_key_name.data(), m_key_name.size()); + write_numeric_value(stream, m_column_ids.size()); for (auto const& id : m_column_ids) { - compressor.write_numeric_value(id); + write_numeric_value(stream, id); } - compressor.write_numeric_value(m_encoding); + write_numeric_value(stream, m_encoding); if (m_encoding == Epoch) { - compressor.write_numeric_value(m_epoch_start); - compressor.write_numeric_value(m_epoch_end); + write_numeric_value(stream, m_epoch_start); + write_numeric_value(stream, m_epoch_end); } else if (m_encoding == DoubleEpoch) { - compressor.write_numeric_value(m_epoch_start_double); - compressor.write_numeric_value(m_epoch_end_double); + write_numeric_value(stream, m_epoch_start_double); + write_numeric_value(stream, m_epoch_end_double); } } diff --git a/components/core/src/clp_s/TimestampEntry.hpp b/components/core/src/clp_s/TimestampEntry.hpp index ad40b4b89..326ed9d73 100644 --- a/components/core/src/clp_s/TimestampEntry.hpp +++ b/components/core/src/clp_s/TimestampEntry.hpp @@ -1,6 +1,7 @@ #ifndef CLP_S_TIMESTAMPENTRY_HPP #define CLP_S_TIMESTAMPENTRY_HPP +#include #include #include #include @@ -9,7 +10,6 @@ #include "ErrorCode.hpp" #include "search/FilterOperation.hpp" #include "Utils.hpp" -#include "ZstdCompressor.hpp" #include "ZstdDecompressor.hpp" using clp_s::search::FilterOperation; @@ -66,10 +66,10 @@ class TimestampEntry { void merge_range(TimestampEntry const& entry); /** - * Write the timestamp entry to a file + * Write the timestamp entry to a buffered stream. 
* @param compressor */ - void write_to_file(ZstdCompressor& compressor) const; + void write_to_stream(std::stringstream& stream) const; /** * Try to read the timestamp entry from a file diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index d6deb3280..553f7e608 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -254,6 +255,17 @@ inline T2 bit_cast(T1 t1) { return t2; } +/** + * Writes a numeric value to a stringstream. + * @param stream + * @param value + * @tparam ValueType + */ +template +void write_numeric_value(std::stringstream& stream, ValueType value) { + stream.write(reinterpret_cast(&value), sizeof(value)); +} + /** * A span of memory where the underlying memory may not be aligned correctly for type T. * diff --git a/components/core/src/clp_s/archive_constants.hpp b/components/core/src/clp_s/archive_constants.hpp index 604c97f66..b76af2944 100644 --- a/components/core/src/clp_s/archive_constants.hpp +++ b/components/core/src/clp_s/archive_constants.hpp @@ -4,6 +4,9 @@ #include namespace clp_s::constants { +// Single file archive +constexpr char cTmpPostfix[] = ".tmp"; + // Schema files constexpr char cArchiveSchemaMapFile[] = "/schema_ids"; constexpr char cArchiveSchemaTreeFile[] = "/schema_tree"; diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index a74693e33..b76683caf 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -95,6 +95,7 @@ bool compress(CommandLineArguments const& command_line_arguments) { option.compression_level = command_line_arguments.get_compression_level(); option.timestamp_key = command_line_arguments.get_timestamp_key(); option.print_archive_stats = command_line_arguments.print_archive_stats(); + option.single_file_archive = command_line_arguments.get_single_file_archive(); option.structurize_arrays = 
command_line_arguments.get_structurize_arrays(); option.record_log_order = command_line_arguments.get_record_log_order(); From 2b88c6fcf397ac3729303c1eff5ca0772a955e1e Mon Sep 17 00:00:00 2001 From: "Xiaochong(Eddy) Wei" <40865608+anlowee@users.noreply.github.com> Date: Thu, 28 Nov 2024 11:52:30 -0500 Subject: [PATCH 39/65] test: Allow multiple trials when unittesting http headers (#613) Co-authored-by: Xiaochong Wei --- components/core/tests/test-NetworkReader.cpp | 38 ++++++++++++-------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/components/core/tests/test-NetworkReader.cpp b/components/core/tests/test-NetworkReader.cpp index f2995f141..552775ea8 100644 --- a/components/core/tests/test-NetworkReader.cpp +++ b/components/core/tests/test-NetworkReader.cpp @@ -196,26 +196,36 @@ TEST_CASE("network_reader_with_valid_http_header_kv_pairs", "[NetworkReader]") { std::unordered_map valid_http_header_kv_pairs; // We use httpbin (https://httpbin.org/) to test the user-specified headers. On success, it is // supposed to respond all the user-specified headers as key-value pairs in JSON form. - constexpr int cNumHttpHeaderKeyValuePairs{10}; + constexpr size_t cNumHttpHeaderKeyValuePairs{10}; for (size_t i{0}; i < cNumHttpHeaderKeyValuePairs; ++i) { valid_http_header_kv_pairs.emplace( fmt::format("Unit-Test-Key{}", i), fmt::format("Unit-Test-Value{}", i) ); } - clp::NetworkReader reader{ - "https://httpbin.org/headers", - 0, - false, - clp::CurlDownloadHandler::cDefaultOverallTimeout, - clp::CurlDownloadHandler::cDefaultConnectionTimeout, - clp::NetworkReader::cDefaultBufferPoolSize, - clp::NetworkReader::cDefaultBufferSize, - valid_http_header_kv_pairs - }; - auto const content{get_content(reader)}; - REQUIRE(assert_curl_error_code(CURLE_OK, reader)); - auto const parsed_content = nlohmann::json::parse(content); + std::optional> optional_content; + // Retry the unit test a limited number of times to handle transient server-side HTTP errors. 
+ // This ensures the test is not marked as failed due to temporary issues beyond our control. + constexpr size_t cNumMaxTrials{10}; + for (size_t i{0}; i < cNumMaxTrials; ++i) { + clp::NetworkReader reader{ + "https://httpbin.org/headers", + 0, + false, + clp::CurlDownloadHandler::cDefaultOverallTimeout, + clp::CurlDownloadHandler::cDefaultConnectionTimeout, + clp::NetworkReader::cDefaultBufferPoolSize, + clp::NetworkReader::cDefaultBufferSize, + valid_http_header_kv_pairs + }; + auto const content = get_content(reader); + if (assert_curl_error_code(CURLE_OK, reader)) { + optional_content.emplace(content); + break; + } + } + REQUIRE(optional_content.has_value()); + auto const parsed_content = nlohmann::json::parse(optional_content.value()); auto const& headers{parsed_content.at("headers")}; for (auto const& [key, value] : valid_http_header_kv_pairs) { REQUIRE((value == headers.at(key).get())); From 290ede3cb30b4bfe4e813212cf78c2de85e10db4 Mon Sep 17 00:00:00 2001 From: Junhao Liao Date: Fri, 29 Nov 2024 03:54:23 -0500 Subject: [PATCH 40/65] chore(log-viewer-webui): Update `yscope-log-viewer` to the latest version. 
(#615) --- components/log-viewer-webui/yscope-log-viewer | 2 +- deps-tasks.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/log-viewer-webui/yscope-log-viewer b/components/log-viewer-webui/yscope-log-viewer index 4c69bc11d..969ff35b2 160000 --- a/components/log-viewer-webui/yscope-log-viewer +++ b/components/log-viewer-webui/yscope-log-viewer @@ -1 +1 @@ -Subproject commit 4c69bc11dbe8a5d87b5fbfb0e43a2f2a06f04866 +Subproject commit 969ff35b2387bcdc3580b441907e3656640ce16d diff --git a/deps-tasks.yml b/deps-tasks.yml index 64a218a47..3c60af001 100644 --- a/deps-tasks.yml +++ b/deps-tasks.yml @@ -421,8 +421,8 @@ tasks: vars: DEST: "{{.DEST}}" FLAGS: "--extract" - SRC_NAME: "yscope-log-viewer-4c69bc11dbe8a5d87b5fbfb0e43a2f2a06f04866" - SRC_URL: "https://github.com/y-scope/yscope-log-viewer/archive/4c69bc1.zip" + SRC_NAME: "yscope-log-viewer-969ff35b2387bcdc3580b441907e3656640ce16d" + SRC_URL: "https://github.com/y-scope/yscope-log-viewer/archive/969ff35.zip" # This command must be last - task: ":utils:compute-checksum" vars: From 2c0e053c938ca70cedd378bedd9b73f6b613ca2b Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Mon, 2 Dec 2024 00:48:07 -0500 Subject: [PATCH 41/65] test(clp-s): Add end-to-end test case for compression and extraction. 
(#595) --- components/core/CMakeLists.txt | 54 +++++- .../core/tests/test-clp_s-end_to_end.cpp | 158 ++++++++++++++++++ .../test_no_floats_sorted.jsonl | 4 + .../install-prebuilt-packages.sh | 2 + .../ubuntu-focal/install-prebuilt-packages.sh | 1 + .../ubuntu-jammy/install-prebuilt-packages.sh | 1 + 6 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 components/core/tests/test-clp_s-end_to_end.cpp create mode 100644 components/core/tests/test_log_files/test_no_floats_sorted.jsonl diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 160f6766d..9e14498b0 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -259,6 +259,42 @@ add_subdirectory(src/clp_s) add_subdirectory(src/reducer) set(SOURCE_FILES_clp_s_unitTest + src/clp_s/ArchiveReader.cpp + src/clp_s/ArchiveReader.hpp + src/clp_s/ArchiveWriter.cpp + src/clp_s/ArchiveWriter.hpp + src/clp_s/ColumnReader.cpp + src/clp_s/ColumnReader.hpp + src/clp_s/ColumnWriter.cpp + src/clp_s/ColumnWriter.hpp + src/clp_s/DictionaryEntry.cpp + src/clp_s/DictionaryEntry.hpp + src/clp_s/DictionaryWriter.cpp + src/clp_s/DictionaryWriter.hpp + src/clp_s/FileReader.cpp + src/clp_s/FileReader.hpp + src/clp_s/FileWriter.cpp + src/clp_s/FileWriter.hpp + src/clp_s/JsonConstructor.cpp + src/clp_s/JsonConstructor.hpp + src/clp_s/JsonFileIterator.cpp + src/clp_s/JsonFileIterator.hpp + src/clp_s/JsonParser.cpp + src/clp_s/JsonParser.hpp + src/clp_s/PackedStreamReader.cpp + src/clp_s/PackedStreamReader.hpp + src/clp_s/ReaderUtils.cpp + src/clp_s/ReaderUtils.hpp + src/clp_s/Schema.cpp + src/clp_s/Schema.hpp + src/clp_s/SchemaMap.cpp + src/clp_s/SchemaMap.hpp + src/clp_s/SchemaReader.cpp + src/clp_s/SchemaReader.hpp + src/clp_s/SchemaTree.cpp + src/clp_s/SchemaTree.hpp + src/clp_s/SchemaWriter.cpp + src/clp_s/SchemaWriter.hpp src/clp_s/search/AndExpr.cpp src/clp_s/search/AndExpr.hpp src/clp_s/search/BooleanLiteral.cpp @@ -291,11 +327,24 @@ set(SOURCE_FILES_clp_s_unitTest 
src/clp_s/search/StringLiteral.hpp src/clp_s/search/Transformation.hpp src/clp_s/search/Value.hpp - src/clp_s/SchemaTree.hpp + src/clp_s/TimestampDictionaryReader.cpp + src/clp_s/TimestampDictionaryReader.hpp + src/clp_s/TimestampDictionaryWriter.cpp + src/clp_s/TimestampDictionaryWriter.hpp + src/clp_s/TimestampEntry.cpp + src/clp_s/TimestampEntry.hpp src/clp_s/TimestampPattern.cpp src/clp_s/TimestampPattern.hpp src/clp_s/Utils.cpp src/clp_s/Utils.hpp + src/clp_s/VariableDecoder.cpp + src/clp_s/VariableDecoder.hpp + src/clp_s/VariableEncoder.cpp + src/clp_s/VariableEncoder.hpp + src/clp_s/ZstdCompressor.cpp + src/clp_s/ZstdCompressor.hpp + src/clp_s/ZstdDecompressor.cpp + src/clp_s/ZstdDecompressor.hpp ) set(SOURCE_FILES_unitTest @@ -520,6 +569,7 @@ set(SOURCE_FILES_unitTest tests/LogSuppressor.hpp tests/test-Array.cpp tests/test-BufferedFileReader.cpp + tests/test-clp_s-end_to_end.cpp tests/test-EncodedVariableInterpreter.cpp tests/test-encoding_methods.cpp tests/test-ffi_IrUnitHandlerInterface.cpp @@ -563,6 +613,8 @@ target_link_libraries(unitTest log_surgeon::log_surgeon LibArchive::LibArchive MariaDBClient::MariaDBClient + ${MONGOCXX_TARGET} + simdjson spdlog::spdlog OpenSSL::Crypto ${sqlite_LIBRARY_DEPENDENCIES} diff --git a/components/core/tests/test-clp_s-end_to_end.cpp b/components/core/tests/test-clp_s-end_to_end.cpp new file mode 100644 index 000000000..3f138b472 --- /dev/null +++ b/components/core/tests/test-clp_s-end_to_end.cpp @@ -0,0 +1,158 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include "../src/clp_s/JsonConstructor.hpp" +#include "../src/clp_s/JsonParser.hpp" + +constexpr std::string_view cTestEndToEndArchiveDirectory{"test-end-to-end-archive"}; +constexpr std::string_view cTestEndToEndOutputDirectory{"test-end-to-end-out"}; +constexpr std::string_view cTestEndToEndOutputSortedJson{"test-end-to-end_sorted.jsonl"}; +constexpr std::string_view cTestEndToEndInputFileDirectory{"test_log_files"}; 
+constexpr std::string_view cTestEndToEndInputFile{"test_no_floats_sorted.jsonl"}; + +namespace { +/** + * A class that deletes the directories and files created by test cases, both before and after each + * test case where the class is instantiated. + */ +class TestOutputCleaner { +public: + TestOutputCleaner() { delete_files(); } + + ~TestOutputCleaner() { delete_files(); } + + // Delete copy & move constructors and assignment operators + TestOutputCleaner(TestOutputCleaner const&) = delete; + TestOutputCleaner(TestOutputCleaner&&) = delete; + auto operator=(TestOutputCleaner const&) -> TestOutputCleaner& = delete; + auto operator=(TestOutputCleaner&&) -> TestOutputCleaner& = delete; + +private: + static void delete_files() { + std::filesystem::remove_all(cTestEndToEndArchiveDirectory); + std::filesystem::remove_all(cTestEndToEndOutputDirectory); + std::filesystem::remove(cTestEndToEndOutputSortedJson); + } +}; + +auto get_test_input_path_relative_to_tests_dir() -> std::filesystem::path; +auto get_test_input_local_path() -> std::string; +void compress(bool structurize_arrays); +auto extract() -> std::filesystem::path; +void compare(std::filesystem::path const& extracted_json_path); + +auto get_test_input_path_relative_to_tests_dir() -> std::filesystem::path { + return std::filesystem::path{cTestEndToEndInputFileDirectory} / cTestEndToEndInputFile; +} + +auto get_test_input_local_path() -> std::string { + std::filesystem::path const current_file_path{__FILE__}; + auto const tests_dir{current_file_path.parent_path()}; + return (tests_dir / get_test_input_path_relative_to_tests_dir()).string(); +} + +void compress(bool structurize_arrays) { + constexpr auto cDefaultTargetEncodedSize = 8ULL * 1024 * 1024 * 1024; // 8 GiB + constexpr auto cDefaultMaxDocumentSize = 512ULL * 1024 * 1024; // 512 MiB + constexpr auto cDefaultMinTableSize = 1ULL * 1024 * 1024; // 1 MiB + constexpr auto cDefaultCompressionLevel = 3; + constexpr auto cDefaultPrintArchiveStats = false; + + 
std::filesystem::create_directory(cTestEndToEndArchiveDirectory); + REQUIRE((std::filesystem::is_directory(cTestEndToEndArchiveDirectory))); + + clp_s::JsonParserOption parser_option{}; + parser_option.file_paths.push_back(get_test_input_local_path()); + parser_option.archives_dir = cTestEndToEndArchiveDirectory; + parser_option.target_encoded_size = cDefaultTargetEncodedSize; + parser_option.max_document_size = cDefaultMaxDocumentSize; + parser_option.min_table_size = cDefaultMinTableSize; + parser_option.compression_level = cDefaultCompressionLevel; + parser_option.print_archive_stats = cDefaultPrintArchiveStats; + parser_option.structurize_arrays = structurize_arrays; + + clp_s::JsonParser parser{parser_option}; + REQUIRE(parser.parse()); + parser.store(); + + REQUIRE((false == std::filesystem::is_empty(cTestEndToEndArchiveDirectory))); +} + +auto extract() -> std::filesystem::path { + constexpr auto cDefaultOrdered = false; + constexpr auto cDefaultTargetOrderedChunkSize = 0; + + std::filesystem::create_directory(cTestEndToEndOutputDirectory); + REQUIRE(std::filesystem::is_directory(cTestEndToEndOutputDirectory)); + + clp_s::JsonConstructorOption constructor_option{}; + constructor_option.archives_dir = cTestEndToEndArchiveDirectory; + constructor_option.output_dir = cTestEndToEndOutputDirectory; + constructor_option.ordered = cDefaultOrdered; + constructor_option.target_ordered_chunk_size = cDefaultTargetOrderedChunkSize; + for (auto const& entry : std::filesystem::directory_iterator(constructor_option.archives_dir)) { + if (false == entry.is_directory()) { + // Skip non-directories + continue; + } + + constructor_option.archive_id = entry.path().filename(); + clp_s::JsonConstructor constructor{constructor_option}; + constructor.store(); + } + std::filesystem::path extracted_json_path{cTestEndToEndOutputDirectory}; + extracted_json_path /= "original"; + REQUIRE(std::filesystem::exists(extracted_json_path)); + + return extracted_json_path; +} + +// Silence the 
checks below since our use of `std::system` is safe in the context of testing. +// NOLINTBEGIN(cert-env33-c,concurrency-mt-unsafe) +void compare(std::filesystem::path const& extracted_json_path) { + int result{std::system("command -v jq >/dev/null 2>&1")}; + REQUIRE((0 == result)); + auto command = fmt::format( + "jq --sort-keys --compact-output '.' {} | sort > {}", + extracted_json_path.string(), + cTestEndToEndOutputSortedJson + ); + result = std::system(command.c_str()); + REQUIRE((0 == result)); + + REQUIRE((false == std::filesystem::is_empty(cTestEndToEndOutputSortedJson))); + + result = std::system("command -v diff >/dev/null 2>&1"); + REQUIRE((0 == result)); + command = fmt::format( + "diff --unified {} {} > /dev/null", + cTestEndToEndOutputSortedJson, + get_test_input_local_path() + ); + result = std::system(command.c_str()); + REQUIRE((true == WIFEXITED(result))); + REQUIRE((0 == WEXITSTATUS(result))); +} + +// NOLINTEND(cert-env33-c,concurrency-mt-unsafe) +} // namespace + +TEST_CASE("clp-s-compress-extract-no-floats", "[clp-s][end-to-end]") { + auto structurize_arrays = GENERATE(true, false); + + TestOutputCleaner const test_cleanup; + + compress(structurize_arrays); + + auto extracted_json_path = extract(); + + compare(extracted_json_path); +} diff --git a/components/core/tests/test_log_files/test_no_floats_sorted.jsonl b/components/core/tests/test_log_files/test_no_floats_sorted.jsonl new file mode 100644 index 000000000..8dfcd85f6 --- /dev/null +++ b/components/core/tests/test_log_files/test_no_floats_sorted.jsonl @@ -0,0 +1,4 @@ +{"clp_string":"uid=0, CPU usage:99.99%, \"user_name\"=YScope","empty_array":[],"empty_object":{},"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"null":null,"string":"short_string","true":true} +{"clp_string":"uid=0, CPU usage:99.99%, 
\"user_name\"=YScope","empty_array":[],"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"nonempty_object":{"clp_string":"uid=0, CPU usage:99.99%, \"user_name\"=YScope","empty_array":[],"empty_object":{},"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"null":null,"string":"short_string","true":true},"null":null,"string":"short_string","true":true} +{"clp_string":"uid=0, CPU usage:99.99%, \"user_name\"=YScope","empty_array":[],"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"nonempty_object":{"clp_string":"uid=0, CPU usage:99.99%, \"user_name\"=YScope","empty_array":[],"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"non_empty_object2":{"clp_string":"uid=0, CPU usage:99.99%, \"user_name\"=YScope","empty_array":[],"empty_object":{},"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"null":null,"string":"short_string","true":true},"null":null,"string":"short_string","true":true},"null":null,"string":"short_string","true":true} +{"clp_string":"uid=0, CPU usage:99.99%, 
\"user_name\"=YScope","empty_object":{},"false":false,"int16_max":32767,"int16_min":-32768,"int32_max":2147483647,"int32_min":-2147483648,"int64_max_jq_losslessly_represents":9824299763229016,"int64_min_jq_losslessly_represents":-9007199254740992,"int8_max":127,"int8_min":-128,"nonempty_array":[1,2,3,4,5],"null":null,"string":"short_string","true":true} diff --git a/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh index eede5e004..c51a521c1 100755 --- a/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/centos-stream-9/install-prebuilt-packages.sh @@ -8,9 +8,11 @@ set -u dnf install -y \ cmake \ + diffutils \ gcc-c++ \ git \ java-11-openjdk \ + jq \ libarchive-devel \ libcurl-devel \ libzstd-devel \ diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh index b373cbe4d..3ea3b3ed5 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-prebuilt-packages.sh @@ -17,6 +17,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ gcc \ gcc-10 \ git \ + jq \ libcurl4 \ libcurl4-openssl-dev \ liblzma-dev \ diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh index e2e17283b..ca1f5f59e 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh @@ -14,6 +14,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ curl \ 
build-essential \ git \ + jq \ libboost-filesystem-dev \ libboost-iostreams-dev \ libboost-program-options-dev \ From cbf8bf9224490bb46b70f97ff5218f773dd2c8ba Mon Sep 17 00:00:00 2001 From: Devin Gibson Date: Mon, 2 Dec 2024 15:16:04 -0500 Subject: [PATCH 42/65] docs(clp-json): Update list of characters that requires escaping in queries. (#617) Co-authored-by: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../reference-json-search-syntax.md | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/docs/src/user-guide/reference-json-search-syntax.md b/docs/src/user-guide/reference-json-search-syntax.md index ca6898984..18d0e4267 100644 --- a/docs/src/user-guide/reference-json-search-syntax.md +++ b/docs/src/user-guide/reference-json-search-syntax.md @@ -33,15 +33,18 @@ To search for a key or value with multiple words, you must quote the key/value w "multi-word key": "multi-word value" ``` -Queries for keys or values with the following literal characters must escape the characters using a -`\` (backslash): `\`, `(`, `)`, `:`, `<`, `>`, `"`, `*`, `{`, `}`. - :::{caution} Currently, a query that contains spaces is interpreted as a substring search, i.e., it will match log events that contain the value as a *substring*. In a future version of CLP, these queries will be interpreted as _exact_ searches unless they include [wildcards](#wildcards-in-values). ::: +:::{note} +Certain characters have special meanings when used in keys or values, so to search for the +characters literally, you must escape them. For a list of such characters, see +[Escaping special characters](#escaping-special-characters). +::: + ### Querying nested kv-pairs If the kv-pair is nested in one or more objects, you can specify the key in one of two ways: @@ -161,6 +164,33 @@ There are three supported boolean operators: You can use parentheses (`()`) to apply an operator to a group of expressions. 
+### Escaping special characters + +Keys containing the following literal characters must escape the characters using a `\` (backslash): + +* `\` +* `"` +* `.` + +Values containing the following literal characters must escape the characters using a `\` +(backslash): + +* `\` +* `"` +* `?` +* `*` + +_Unquoted_ keys or values containing the following literal characters must also escape the +characters using a `\` (backslash): + +* `(` +* `)` +* `:` +* `<` +* `>` +* `{` +* `}` + ## Examples **Search for log events that contain a specific key-value pair:** From 44b0f2b8ce3feb3be1757bea8b4c71ac87434fcb Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Mon, 2 Dec 2024 16:00:18 -0500 Subject: [PATCH 43/65] feat(core): Add `ErrorCode` template to standardize conversion of user-defined error code enums to `std::error_code`. (#486) --- components/core/CMakeLists.txt | 2 + .../core/src/clp/error_handling/ErrorCode.hpp | 150 ++++++++++++++++++ components/core/tests/test-error_handling.cpp | 141 ++++++++++++++++ 3 files changed, 293 insertions(+) create mode 100644 components/core/src/clp/error_handling/ErrorCode.hpp create mode 100644 components/core/tests/test-error_handling.cpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 9e14498b0..f974e5c7e 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -386,6 +386,7 @@ set(SOURCE_FILES_unitTest src/clp/DictionaryEntry.hpp src/clp/DictionaryReader.hpp src/clp/DictionaryWriter.hpp + src/clp/error_handling/ErrorCode.hpp src/clp/EncodedVariableInterpreter.cpp src/clp/EncodedVariableInterpreter.hpp src/clp/ErrorCode.hpp @@ -572,6 +573,7 @@ set(SOURCE_FILES_unitTest tests/test-clp_s-end_to_end.cpp tests/test-EncodedVariableInterpreter.cpp tests/test-encoding_methods.cpp + tests/test-error_handling.cpp tests/test-ffi_IrUnitHandlerInterface.cpp tests/test-ffi_KeyValuePairLogEvent.cpp tests/test-ffi_SchemaTree.cpp diff --git 
a/components/core/src/clp/error_handling/ErrorCode.hpp b/components/core/src/clp/error_handling/ErrorCode.hpp new file mode 100644 index 000000000..2612e7768 --- /dev/null +++ b/components/core/src/clp/error_handling/ErrorCode.hpp @@ -0,0 +1,150 @@ +#ifndef CLP_ERROR_HANDLING_ERRORCODE_HPP +#define CLP_ERROR_HANDLING_ERRORCODE_HPP + +#include +#include +#include +#include + +namespace clp::error_handling { +/** + * Concept that defines a template parameter of an integer-based error code enumeration. + * @tparam Type + */ +template +concept ErrorCodeEnumType = std::is_enum_v && requires(Type type) { + { + static_cast>(type) + } -> std::convertible_to; +}; + +/** + * Template that defines a `std::error_category` of the given set of error code enumeration. + * @tparam ErrorCodeEnum + */ +template +class ErrorCategory : public std::error_category { +public: + // Methods implementing `std::error_category` + /** + * Gets the error category name. + * Note: A specialization must be explicitly implemented for each valid `ErrorCodeEnum`. + * @return The name of the error category. + */ + [[nodiscard]] auto name() const noexcept -> char const* override; + + /** + * Gets the descriptive message associated with the given error. + * @param error_num + * @return The descriptive message for the error. + */ + [[nodiscard]] auto message(int error_num) const -> std::string override { + return message(static_cast(error_num)); + } + + /** + * @param error_num + * @param condition + * @return Whether the error condition of the given error matches the given condition. + */ + [[nodiscard]] auto equivalent( + int error_num, + std::error_condition const& condition + ) const noexcept -> bool override { + return equivalent(static_cast(error_num), condition); + } + + // Methods + /** + * Gets the descriptive message associated with the given error. + * Note: A specialization must be explicitly implemented for each valid `ErrorCodeEnum`. + * @param error_enum. 
+ * @return The descriptive message for the error. + */ + [[nodiscard]] auto message(ErrorCodeEnum error_enum) const -> std::string; + + /** + * Note: A specialization can be implemented to create error enum to error condition mappings. + * @param error_num + * @param condition + * @return Whether the error condition of the given error matches the given condition. + */ + [[nodiscard]] auto equivalent( + ErrorCodeEnum error_enum, + std::error_condition const& condition + ) const noexcept -> bool; +}; + +/** + * Template class that defines an error code. An error code is represented by a error enum value and + * the associated error category. This template class is designed to be `std::error_code` + * compatible, meaning that every instance of this class can be used to construct a corresponded + * `std::error_code` instance, or compare with a `std::error_code` instance to inspect a specific + * error. + * @tparam ErrorCodeEnum + */ +template +class ErrorCode { +public: + // Constructor + ErrorCode(ErrorCodeEnum error) : m_error{error} {} + + /** + * @return The underlying error code enum. + */ + [[nodiscard]] auto get_error() const -> ErrorCodeEnum { return m_error; } + + /** + * @return The error code as an error number. + */ + [[nodiscard]] auto get_error_num() const -> int { return static_cast(m_error); } + + /** + * @return The reference to the singleton of the corresponded error category. + */ + [[nodiscard]] constexpr static auto get_category() -> ErrorCategory const& { + return cCategory; + } + +private: + static inline ErrorCategory const cCategory; + + ErrorCodeEnum m_error; +}; + +/** + * @tparam ErrorCodeEnum + * @param error + * @return Constructed `std::error_code` from the given `ErrorCode` instance. 
+ */ +template +[[nodiscard]] auto make_error_code(ErrorCode error) -> std::error_code; + +template +auto ErrorCategory::equivalent( + ErrorCodeEnum error_enum, + std::error_condition const& condition +) const noexcept -> bool { + return std::error_category::default_error_condition(static_cast(error_enum)) == condition; +} + +template +auto make_error_code(ErrorCode error) -> std::error_code { + return {error.get_error_num(), ErrorCode::get_category()}; +} +} // namespace clp::error_handling + +/** + * The macro to create a specialization of `std::is_error_code_enum` for a given type T. Only types + * that are marked with this macro will be considered as a valid CLP error code enum, and thus used + * to specialize `ErrorCode` and `ErrorCategory` templates. + */ +// NOLINTBEGIN(bugprone-macro-parentheses, cppcoreguidelines-macro-usage) +#define CLP_ERROR_HANDLING_MARK_AS_ERROR_CODE_ENUM(T) \ + template <> \ + struct std::is_error_code_enum> : std::true_type { \ + static_assert(std::is_enum_v); \ + }; +// NOLINTEND(bugprone-macro-parentheses, cppcoreguidelines-macro-usage) + +#endif // CLP_ERROR_HANDLING_ERRORCODE_HPP diff --git a/components/core/tests/test-error_handling.cpp b/components/core/tests/test-error_handling.cpp new file mode 100644 index 000000000..2d640ed57 --- /dev/null +++ b/components/core/tests/test-error_handling.cpp @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../src/clp/error_handling/ErrorCode.hpp" + +using clp::error_handling::ErrorCategory; +using clp::error_handling::ErrorCode; +using std::string; +using std::string_view; + +namespace { +enum class AlwaysSuccessErrorCodeEnum : uint8_t { + Success = 0 +}; + +enum class BinaryErrorCodeEnum : uint8_t { + Success = 0, + Failure +}; + +using AlwaysSuccessErrorCode = ErrorCode; +using AlwaysSuccessErrorCategory = ErrorCategory; +using BinaryErrorCode = ErrorCode; +using BinaryErrorCategory = ErrorCategory; + +constexpr string_view 
cAlwaysSuccessErrorCategoryName{"Always Success Error Code"}; +constexpr string_view cBinaryTestErrorCategoryName{"Binary Error Code"}; +constexpr string_view cSuccessErrorMsg{"Success"}; +constexpr string_view cFailureErrorMsg{"Failure"}; +constexpr string_view cUnrecognizedErrorCode{"Unrecognized Error Code"}; +constexpr std::array cFailureConditions{std::errc::not_connected, std::errc::timed_out}; +constexpr std::array cNoneFailureConditions{std::errc::broken_pipe, std::errc::address_in_use}; +} // namespace + +CLP_ERROR_HANDLING_MARK_AS_ERROR_CODE_ENUM(AlwaysSuccessErrorCodeEnum); +CLP_ERROR_HANDLING_MARK_AS_ERROR_CODE_ENUM(BinaryErrorCodeEnum); + +template <> +auto AlwaysSuccessErrorCategory::name() const noexcept -> char const* { + return cAlwaysSuccessErrorCategoryName.data(); +} + +template <> +auto AlwaysSuccessErrorCategory::message(AlwaysSuccessErrorCodeEnum error_enum) const -> string { + switch (error_enum) { + case AlwaysSuccessErrorCodeEnum::Success: + return string{cSuccessErrorMsg}; + default: + return string{cUnrecognizedErrorCode}; + } +} + +template <> +auto BinaryErrorCategory::name() const noexcept -> char const* { + return cBinaryTestErrorCategoryName.data(); +} + +template <> +auto BinaryErrorCategory::message(BinaryErrorCodeEnum error_enum) const -> string { + switch (error_enum) { + case BinaryErrorCodeEnum::Success: + return string{cSuccessErrorMsg}; + case BinaryErrorCodeEnum::Failure: + return string{cFailureErrorMsg}; + default: + return string{cUnrecognizedErrorCode}; + } +} + +template <> +auto BinaryErrorCategory::equivalent( + BinaryErrorCodeEnum error_enum, + std::error_condition const& condition +) const noexcept -> bool { + switch (error_enum) { + case BinaryErrorCodeEnum::Failure: + return std::any_of( + cFailureConditions.cbegin(), + cFailureConditions.cend(), + [&](auto failure_condition) -> bool { return condition == failure_condition; } + ); + default: + return false; + } +} + +TEST_CASE("test_error_code_implementation", 
"[error_handling][ErrorCode]") { + // Test error codes within the same error category + BinaryErrorCode const success{BinaryErrorCodeEnum::Success}; + std::error_code const success_error_code{success}; + REQUIRE((success == success_error_code)); + REQUIRE((cSuccessErrorMsg == success_error_code.message())); + REQUIRE((BinaryErrorCode::get_category() == success_error_code.category())); + REQUIRE((cBinaryTestErrorCategoryName == success_error_code.category().name())); + + BinaryErrorCode const failure{BinaryErrorCodeEnum::Failure}; + std::error_code const failure_error_code{failure}; + REQUIRE((failure == failure_error_code)); + REQUIRE((cFailureErrorMsg == failure_error_code.message())); + REQUIRE((BinaryErrorCode::get_category() == failure_error_code.category())); + REQUIRE((cBinaryTestErrorCategoryName == failure_error_code.category().name())); + std::for_each( + cFailureConditions.cbegin(), + cFailureConditions.cend(), + [&](auto failure_condition) { REQUIRE((failure_error_code == failure_condition)); } + ); + std::for_each( + cNoneFailureConditions.cbegin(), + cNoneFailureConditions.cend(), + [&](auto none_failure_condition) { + REQUIRE((failure_error_code != none_failure_condition)); + } + ); + + REQUIRE((success_error_code != failure_error_code)); + REQUIRE((success_error_code.category() == failure_error_code.category())); + + AlwaysSuccessErrorCode const always_success{AlwaysSuccessErrorCodeEnum::Success}; + std::error_code const always_success_error_code{always_success}; + REQUIRE((always_success_error_code == always_success)); + REQUIRE((cSuccessErrorMsg == always_success_error_code.message())); + REQUIRE((AlwaysSuccessErrorCode::get_category() == always_success_error_code.category())); + REQUIRE((cAlwaysSuccessErrorCategoryName == always_success_error_code.category().name())); + + // Compare error codes from different error category + // Error codes that have the same value or message won't be the same with each other if they are + // from different error 
categories. + REQUIRE((success_error_code.value() == always_success_error_code.value())); + REQUIRE((success_error_code.message() == always_success_error_code.message())); + REQUIRE((success_error_code.category() != always_success_error_code.category())); + REQUIRE((success_error_code != always_success_error_code)); + REQUIRE((AlwaysSuccessErrorCode{AlwaysSuccessErrorCodeEnum::Success} != success_error_code)); + REQUIRE((BinaryErrorCode{BinaryErrorCodeEnum::Success} != always_success_error_code)); +} From 4d21d9b04c42e3f9de2ddfa19491240e9cb99ce4 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:58:02 -0500 Subject: [PATCH 44/65] revert(core): Remove temporary output directory option from `clp` and `clo`. (#619) --- components/core/src/clp/clo/CommandLineArguments.cpp | 8 -------- components/core/src/clp/clo/CommandLineArguments.hpp | 5 ----- components/core/src/clp/clo/clo.cpp | 2 +- components/core/src/clp/clp/CommandLineArguments.cpp | 11 ----------- components/core/src/clp/clp/CommandLineArguments.hpp | 3 --- components/core/src/clp/clp/decompression.cpp | 2 +- 6 files changed, 2 insertions(+), 29 deletions(-) diff --git a/components/core/src/clp/clo/CommandLineArguments.cpp b/components/core/src/clp/clo/CommandLineArguments.cpp index fffc3d783..4e187f985 100644 --- a/components/core/src/clp/clo/CommandLineArguments.cpp +++ b/components/core/src/clp/clo/CommandLineArguments.cpp @@ -181,10 +181,6 @@ auto CommandLineArguments::parse_ir_extraction_arguments( // clang-format off options_ir_extraction .add_options()( - "temp-output-dir", - po::value(&m_ir_temp_output_dir)->value_name("DIR"), - "Temporary output directory for IR chunks while they're being written" - )( "target-size", po::value(&m_ir_target_size)->value_name("SIZE"), "Target size (B) for each IR chunk before a new chunk is created" @@ -287,10 +283,6 @@ auto CommandLineArguments::parse_ir_extraction_arguments( if (m_ir_mongodb_collection.empty()) 
{ throw invalid_argument("COLLECTION not specified or empty."); } - - if (m_ir_temp_output_dir.empty()) { - m_ir_temp_output_dir = m_ir_output_dir; - } return ParsingResult::Success; } diff --git a/components/core/src/clp/clo/CommandLineArguments.hpp b/components/core/src/clp/clo/CommandLineArguments.hpp index 9e6d311c3..d84b96a18 100644 --- a/components/core/src/clp/clo/CommandLineArguments.hpp +++ b/components/core/src/clp/clo/CommandLineArguments.hpp @@ -54,10 +54,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { [[nodiscard]] auto get_ir_output_dir() const -> std::string const& { return m_ir_output_dir; } - [[nodiscard]] auto get_ir_temp_output_dir() const -> std::string const& { - return m_ir_temp_output_dir; - } - [[nodiscard]] auto get_ir_mongodb_uri() const -> std::string const& { return m_ir_mongodb_uri; } [[nodiscard]] auto get_ir_mongodb_collection() const -> std::string const& { @@ -187,7 +183,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { std::string m_file_split_id; size_t m_ir_target_size{128ULL * 1024 * 1024}; std::string m_ir_output_dir; - std::string m_ir_temp_output_dir; std::string m_ir_mongodb_uri; std::string m_ir_mongodb_collection; diff --git a/components/core/src/clp/clo/clo.cpp b/components/core/src/clp/clo/clo.cpp index f29df0306..23ff6f67e 100644 --- a/components/core/src/clp/clo/clo.cpp +++ b/components/core/src/clp/clo/clo.cpp @@ -224,7 +224,7 @@ bool extract_ir(CommandLineArguments const& command_line_args) { archive_reader, *file_metadata_ix_ptr, command_line_args.get_ir_target_size(), - command_line_args.get_ir_temp_output_dir(), + command_line_args.get_ir_output_dir(), ir_output_handler )) { diff --git a/components/core/src/clp/clp/CommandLineArguments.cpp b/components/core/src/clp/clp/CommandLineArguments.cpp index ccdc99793..cb44d96d8 100644 --- a/components/core/src/clp/clp/CommandLineArguments.cpp +++ b/components/core/src/clp/clp/CommandLineArguments.cpp @@ -255,13 +255,6 @@ 
CommandLineArguments::parse_arguments(int argc, char const* argv[]) { ->default_value(m_ir_target_size), "Target size (B) for each IR chunk before a new chunk is created" ); - options_ir.add_options()( - "temp-output-dir", - po::value(&m_ir_temp_output_dir) - ->value_name("DIR") - ->default_value(m_ir_temp_output_dir), - "Temporary output directory for IR chunks while they're being written" - ); po::options_description all_ir_options; all_ir_options.add(ir_positional_options); @@ -311,10 +304,6 @@ CommandLineArguments::parse_arguments(int argc, char const* argv[]) { if (m_orig_file_id.empty()) { throw invalid_argument("ORIG_FILE_ID cannot be empty."); } - - if (m_ir_temp_output_dir.empty()) { - m_ir_temp_output_dir = m_output_dir; - } } else if (Command::Compress == m_command) { // Define compression hidden positional options po::options_description compression_positional_options; diff --git a/components/core/src/clp/clp/CommandLineArguments.hpp b/components/core/src/clp/clp/CommandLineArguments.hpp index b9cf15740..6e14a4b3b 100644 --- a/components/core/src/clp/clp/CommandLineArguments.hpp +++ b/components/core/src/clp/clp/CommandLineArguments.hpp @@ -37,8 +37,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { std::string const& get_path_prefix_to_remove() const { return m_path_prefix_to_remove; } - std::string const& get_ir_temp_output_dir() const { return m_ir_temp_output_dir; } - std::string const& get_output_dir() const { return m_output_dir; } std::string const& get_schema_file_path() const { return m_schema_file_path; } @@ -91,7 +89,6 @@ class CommandLineArguments : public CommandLineArgumentsBase { size_t m_ir_msg_ix{0}; size_t m_ir_target_size{128ULL * 1024 * 1024}; bool m_sort_input_files; - std::string m_ir_temp_output_dir; std::string m_output_dir; std::string m_schema_file_path; bool m_show_progress; diff --git a/components/core/src/clp/clp/decompression.cpp b/components/core/src/clp/clp/decompression.cpp index 6b87f6777..b8ae06350 
100644 --- a/components/core/src/clp/clp/decompression.cpp +++ b/components/core/src/clp/clp/decompression.cpp @@ -310,7 +310,7 @@ bool decompress_to_ir(CommandLineArguments& command_line_args) { archive_reader, *file_metadata_ix_ptr, command_line_args.get_ir_target_size(), - command_line_args.get_ir_temp_output_dir(), + command_line_args.get_output_dir(), ir_output_handler )) { From 36892c1560772176b6d7ace258a4a50ea70dcb18 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:34:52 -0500 Subject: [PATCH 45/65] refactor(clp-package): Unify the metadata schema for JSON and IR streams. (#620) --- components/core/src/clp/clo/OutputHandler.cpp | 2 +- components/core/src/clp/clo/clo.cpp | 12 ++++-------- components/core/src/clp/clo/constants.hpp | 7 +++---- .../core/src/clp/clp/FileDecompressor.hpp | 2 +- components/core/src/clp/clp/decompression.cpp | 2 +- components/core/src/clp_s/JsonConstructor.cpp | 4 ++-- .../core/src/clp_s/archive_constants.hpp | 4 ++-- .../log-viewer-webui/client/src/api/query.js | 19 ++++--------------- .../log-viewer-webui/server/src/DbManager.js | 2 +- 9 files changed, 19 insertions(+), 35 deletions(-) diff --git a/components/core/src/clp/clo/OutputHandler.cpp b/components/core/src/clp/clo/OutputHandler.cpp index bdf1bb1bd..1d92777c5 100644 --- a/components/core/src/clp/clo/OutputHandler.cpp +++ b/components/core/src/clp/clo/OutputHandler.cpp @@ -100,7 +100,7 @@ ErrorCode ResultsCacheOutputHandler::flush() { try { m_results.emplace_back(std::move(bsoncxx::builder::basic::make_document( bsoncxx::builder::basic::kvp( - cResultsCacheKeys::OrigFileId, + cResultsCacheKeys::SearchOutput::OrigFileId, std::move(result.orig_file_id) ), bsoncxx::builder::basic::kvp( diff --git a/components/core/src/clp/clo/clo.cpp b/components/core/src/clp/clo/clo.cpp index 23ff6f67e..d62049e6b 100644 --- a/components/core/src/clp/clo/clo.cpp +++ b/components/core/src/clp/clo/clo.cpp @@ -171,7 +171,7 @@ bool 
extract_ir(CommandLineArguments const& command_line_args) { string const& orig_file_id, size_t begin_message_ix, size_t end_message_ix, - bool is_last_ir_chunk) { + bool is_last_chunk) { auto dest_ir_file_name = orig_file_id; dest_ir_file_name += "_" + std::to_string(begin_message_ix); dest_ir_file_name += "_" + std::to_string(end_message_ix); @@ -195,13 +195,9 @@ bool extract_ir(CommandLineArguments const& command_line_args) { dest_ir_file_name ), bsoncxx::builder::basic::kvp( - clp::clo::cResultsCacheKeys::OrigFileId, + clp::clo::cResultsCacheKeys::IrOutput::StreamId, orig_file_id ), - bsoncxx::builder::basic::kvp( - clp::clo::cResultsCacheKeys::IrOutput::FileSplitId, - file_split_id - ), bsoncxx::builder::basic::kvp( clp::clo::cResultsCacheKeys::IrOutput::BeginMsgIx, static_cast(begin_message_ix) @@ -211,8 +207,8 @@ bool extract_ir(CommandLineArguments const& command_line_args) { static_cast(end_message_ix) ), bsoncxx::builder::basic::kvp( - clp::clo::cResultsCacheKeys::IrOutput::IsLastIrChunk, - is_last_ir_chunk + clp::clo::cResultsCacheKeys::IrOutput::IsLastChunk, + is_last_chunk ) ))); return true; diff --git a/components/core/src/clp/clo/constants.hpp b/components/core/src/clp/clo/constants.hpp index 86f7313f2..945bde83e 100644 --- a/components/core/src/clp/clo/constants.hpp +++ b/components/core/src/clp/clo/constants.hpp @@ -3,17 +3,16 @@ // NOLINTBEGIN(cppcoreguidelines-avoid-c-arrays, readability-identifier-naming) namespace clp::clo::cResultsCacheKeys { -constexpr char OrigFileId[]{"orig_file_id"}; - namespace IrOutput { constexpr char Path[]{"path"}; -constexpr char FileSplitId[]{"file_split_id"}; +constexpr char StreamId[]{"stream_id"}; constexpr char BeginMsgIx[]{"begin_msg_ix"}; constexpr char EndMsgIx[]{"end_msg_ix"}; -constexpr char IsLastIrChunk[]{"is_last_ir_chunk"}; +constexpr char IsLastChunk[]{"is_last_chunk"}; } // namespace IrOutput namespace SearchOutput { +constexpr char OrigFileId[]{"orig_file_id"}; constexpr char 
OrigFilePath[]{"orig_file_path"}; constexpr char LogEventIx[]{"log_event_ix"}; constexpr char Timestamp[]{"timestamp"}; diff --git a/components/core/src/clp/clp/FileDecompressor.hpp b/components/core/src/clp/clp/FileDecompressor.hpp index 932cab7c5..b08a21eb4 100644 --- a/components/core/src/clp/clp/FileDecompressor.hpp +++ b/components/core/src/clp/clp/FileDecompressor.hpp @@ -39,7 +39,7 @@ class FileDecompressor { * * @tparam IrOutputHandler Function to handle the resulting IR chunks. * Signature: (std::filesystem::path const& ir_file_path, string const& orig_file_id, - * size_t begin_message_ix, size_t end_message_ix, bool is_last_ir_chunk) -> bool; + * size_t begin_message_ix, size_t end_message_ix, bool is_last_chunk) -> bool; * The function returns whether it succeeded. * @param archive_reader * @param file_metadata_ix diff --git a/components/core/src/clp/clp/decompression.cpp b/components/core/src/clp/clp/decompression.cpp index b8ae06350..c42357334 100644 --- a/components/core/src/clp/clp/decompression.cpp +++ b/components/core/src/clp/clp/decompression.cpp @@ -282,7 +282,7 @@ bool decompress_to_ir(CommandLineArguments& command_line_args) { string const& orig_file_id, size_t begin_message_ix, size_t end_message_ix, - [[maybe_unused]] bool is_last_ir_chunk) { + [[maybe_unused]] bool is_last_chunk) { auto dest_ir_file_name = orig_file_id; dest_ir_file_name += "_" + std::to_string(begin_message_ix); dest_ir_file_name += "_" + std::to_string(end_message_ix); diff --git a/components/core/src/clp_s/JsonConstructor.cpp b/components/core/src/clp_s/JsonConstructor.cpp index 95e3fa2c5..8886f2074 100644 --- a/components/core/src/clp_s/JsonConstructor.cpp +++ b/components/core/src/clp_s/JsonConstructor.cpp @@ -122,7 +122,7 @@ void JsonConstructor::construct_in_order() { new_file_path.filename() ), bsoncxx::builder::basic::kvp( - constants::results_cache::decompression::cOrigFileId, + constants::results_cache::decompression::cStreamId, m_option.archive_id ), 
bsoncxx::builder::basic::kvp( @@ -134,7 +134,7 @@ void JsonConstructor::construct_in_order() { last_idx ), bsoncxx::builder::basic::kvp( - constants::results_cache::decompression::cIsLastIrChunk, + constants::results_cache::decompression::cIsLastChunk, false == open_new_writer ) ))); diff --git a/components/core/src/clp_s/archive_constants.hpp b/components/core/src/clp_s/archive_constants.hpp index b76af2944..6dd7b6928 100644 --- a/components/core/src/clp_s/archive_constants.hpp +++ b/components/core/src/clp_s/archive_constants.hpp @@ -29,10 +29,10 @@ constexpr char cLogEventIdxName[] = "log_event_idx"; namespace results_cache::decompression { constexpr char cPath[]{"path"}; -constexpr char cOrigFileId[]{"orig_file_id"}; +constexpr char cStreamId[]{"stream_id"}; constexpr char cBeginMsgIx[]{"begin_msg_ix"}; constexpr char cEndMsgIx[]{"end_msg_ix"}; -constexpr char cIsLastIrChunk[]{"is_last_ir_chunk"}; +constexpr char cIsLastChunk[]{"is_last_chunk"}; } // namespace results_cache::decompression namespace results_cache::search { diff --git a/components/log-viewer-webui/client/src/api/query.js b/components/log-viewer-webui/client/src/api/query.js index eda1db21c..f48f610a1 100644 --- a/components/log-viewer-webui/client/src/api/query.js +++ b/components/log-viewer-webui/client/src/api/query.js @@ -2,22 +2,11 @@ import axios from "axios"; /** - * @typedef {object} ExtractIrResp + * @typedef {object} ExtractStreamResp + * @property {string} stream_id * @property {number} begin_msg_ix * @property {number} end_msg_ix - * @property {string} file_split_id - * @property {boolean} is_last_ir_chunk - * @property {string} orig_file_id - * @property {string} path - * @property {string} _id - */ - -/** - * @typedef {object} ExtractJsonResp - * @property {number} begin_msg_ix - * @property {number} end_msg_ix - * @property {boolean} is_last_ir_chunk - * @property {string} orig_file_id + * @property {boolean} is_last_chunk * @property {string} path * @property {string} _id */ @@ 
-30,7 +19,7 @@ import axios from "axios"; * @param {string} streamId * @param {number} logEventIdx * @param {Function} onUploadProgress Callback to handle upload progress events. - * @return {Promise>} + * @return {Promise>} */ const submitExtractStreamJob = async (extractJobType, streamId, logEventIdx, onUploadProgress) => { return await axios.post( diff --git a/components/log-viewer-webui/server/src/DbManager.js b/components/log-viewer-webui/server/src/DbManager.js index e1ec00812..fc48ba5e8 100644 --- a/components/log-viewer-webui/server/src/DbManager.js +++ b/components/log-viewer-webui/server/src/DbManager.js @@ -171,7 +171,7 @@ class DbManager { */ async getExtractedStreamFileMetadata (streamId, logEventIdx) { return await this.#streamFilesCollection.findOne({ - orig_file_id: streamId, + stream_id: streamId, begin_msg_ix: {$lte: logEventIdx}, end_msg_ix: {$gt: logEventIdx}, }); From 604bd75d36657c1efb25233e7dd6611643a46cb9 Mon Sep 17 00:00:00 2001 From: Abigail Matthews Date: Mon, 9 Dec 2024 11:35:29 -0500 Subject: [PATCH 46/65] feat(clp-s): Add command line options for stubbed out kv-pair-IR ingestion. 
(#618) --- .../core/src/clp_s/CommandLineArguments.cpp | 23 +++++++++++++++++++ .../core/src/clp_s/CommandLineArguments.hpp | 8 +++++++ components/core/src/clp_s/JsonParser.hpp | 2 ++ components/core/src/clp_s/clp-s.cpp | 13 +++++++++-- 4 files changed, 44 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 99539b627..c7fb9487e 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -148,6 +148,9 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { po::options_description compression_options("Compression options"); std::string metadata_db_config_file_path; std::string input_path_list_file_path; + constexpr std::string_view cJsonFileType{"json"}; + constexpr std::string_view cKeyValueIrFileType{"kv-ir"}; + std::string file_type{cJsonFileType}; // clang-format off compression_options.add_options()( "compression-level", @@ -202,6 +205,10 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { "disable-log-order", po::bool_switch(&m_disable_log_order), "Do not record log order at ingestion time." 
+ )( + "file-type", + po::value(&file_type)->value_name("FILE_TYPE")->default_value(file_type), + "The type of file being compressed (json or kv-ir)" ); // clang-format on @@ -255,6 +262,22 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { throw std::invalid_argument("No input paths specified."); } + if (cJsonFileType == file_type) { + m_file_type = FileType::Json; + } else if (cKeyValueIrFileType == file_type) { + m_file_type = FileType::KeyValueIr; + if (m_structurize_arrays) { + SPDLOG_ERROR( + "Invalid combination of arguments; --file-type {} and " + "--structurize-arrays can't be used together", + cKeyValueIrFileType + ); + return ParsingResult::Failure; + } + } else { + throw std::invalid_argument("Unknown FILE_TYPE: " + file_type); + } + // Parse and validate global metadata DB config if (false == metadata_db_config_file_path.empty()) { clp::GlobalMetadataDBConfig metadata_db_config; diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp index a87e9b6bd..47c244646 100644 --- a/components/core/src/clp_s/CommandLineArguments.hpp +++ b/components/core/src/clp_s/CommandLineArguments.hpp @@ -36,6 +36,11 @@ class CommandLineArguments { Stdout, }; + enum class FileType : uint8_t { + Json = 0, + KeyValueIr + }; + // Constructors explicit CommandLineArguments(std::string const& program_name) : m_program_name(program_name) {} @@ -116,6 +121,8 @@ class CommandLineArguments { bool get_record_log_order() const { return false == m_disable_log_order; } + [[nodiscard]] auto get_file_type() const -> FileType { return m_file_type; } + private: // Methods /** @@ -184,6 +191,7 @@ class CommandLineArguments { size_t m_target_ordered_chunk_size{}; size_t m_minimum_table_size{1ULL * 1024 * 1024}; // 1 MB bool m_disable_log_order{false}; + FileType m_file_type{FileType::Json}; // Metadata db variables std::optional m_metadata_db_config; diff --git a/components/core/src/clp_s/JsonParser.hpp 
b/components/core/src/clp_s/JsonParser.hpp index bfd423c22..c05ab9d60 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -12,6 +12,7 @@ #include "../clp/GlobalMySQLMetadataDB.hpp" #include "ArchiveWriter.hpp" +#include "CommandLineArguments.hpp" #include "DictionaryWriter.hpp" #include "FileReader.hpp" #include "FileWriter.hpp" @@ -29,6 +30,7 @@ using namespace simdjson; namespace clp_s { struct JsonParserOption { std::vector file_paths; + CommandLineArguments::FileType input_file_type{CommandLineArguments::FileType::Json}; std::string timestamp_key; std::string archives_dir; size_t target_encoded_size{}; diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index b76683caf..2c6639290 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -88,6 +88,7 @@ bool compress(CommandLineArguments const& command_line_arguments) { clp_s::JsonParserOption option{}; option.file_paths = command_line_arguments.get_file_paths(); + option.input_file_type = command_line_arguments.get_file_type(); option.archives_dir = archives_dir.string(); option.target_encoded_size = command_line_arguments.get_target_encoded_size(); option.max_document_size = command_line_arguments.get_max_document_size(); @@ -113,9 +114,17 @@ bool compress(CommandLineArguments const& command_line_arguments) { } clp_s::JsonParser parser(option); - if (false == parser.parse()) { - SPDLOG_ERROR("Encountered error while parsing input"); + if (CommandLineArguments::FileType::KeyValueIr == option.input_file_type) { + // Functionality Coming in later PR + // -->Call new parsing function in Json Parser to parse IRv2 to archive + // -->Check for error from parsing function + SPDLOG_ERROR("Compressing Key Value IR Files is not yet supported"); return false; + } else { + if (false == parser.parse()) { + SPDLOG_ERROR("Encountered error while parsing input"); + return false; + } } parser.store(); return 
true; From 0a9322b9e86921f26690a68684c11dee4f012efd Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Mon, 9 Dec 2024 11:44:26 -0500 Subject: [PATCH 47/65] feat(ffi): Add initial implementation of `IrErrorCode` (using the `ErrorCode` template) which will replace the `IRErrorCode` enum. (#623) --- components/core/CMakeLists.txt | 2 ++ .../src/clp/ffi/ir_stream/IrErrorCode.cpp | 26 +++++++++++++++++++ .../src/clp/ffi/ir_stream/IrErrorCode.hpp | 24 +++++++++++++++++ components/core/tests/test-error_handling.cpp | 15 +++++++++++ 4 files changed, 67 insertions(+) create mode 100644 components/core/src/clp/ffi/ir_stream/IrErrorCode.cpp create mode 100644 components/core/src/clp/ffi/ir_stream/IrErrorCode.hpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index f974e5c7e..f15d14405 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -400,6 +400,8 @@ set(SOURCE_FILES_unitTest src/clp/ffi/ir_stream/decoding_methods.inc src/clp/ffi/ir_stream/encoding_methods.cpp src/clp/ffi/ir_stream/encoding_methods.hpp + src/clp/ffi/ir_stream/IrErrorCode.cpp + src/clp/ffi/ir_stream/IrErrorCode.hpp src/clp/ffi/ir_stream/IrUnitHandlerInterface.hpp src/clp/ffi/ir_stream/IrUnitType.hpp src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp diff --git a/components/core/src/clp/ffi/ir_stream/IrErrorCode.cpp b/components/core/src/clp/ffi/ir_stream/IrErrorCode.cpp new file mode 100644 index 000000000..f9a00ca1e --- /dev/null +++ b/components/core/src/clp/ffi/ir_stream/IrErrorCode.cpp @@ -0,0 +1,26 @@ +#include "IrErrorCode.hpp" + +#include + +using IrErrorCategory = clp::error_handling::ErrorCategory; +using clp::ffi::ir_stream::IrErrorCodeEnum; + +template <> +auto IrErrorCategory::name() const noexcept -> char const* { + return "clp::ffi::ir_stream::IrErrorCode"; +} + +template <> +auto IrErrorCategory::message(IrErrorCodeEnum error_enum) const -> std::string { + switch (error_enum) { 
+ case IrErrorCodeEnum::DecodingMethodFailure: + return "The decoding method failed."; + case IrErrorCodeEnum::EndOfStream: + return "The end-of-stream IR unit has already been consumed."; + case IrErrorCodeEnum::IncompleteStream: + return "The IR stream ended with a truncated IR unit or did not terminate with an " + "end-of-stream IR unit."; + default: + return "Unknown error code enum."; + } +} diff --git a/components/core/src/clp/ffi/ir_stream/IrErrorCode.hpp b/components/core/src/clp/ffi/ir_stream/IrErrorCode.hpp new file mode 100644 index 000000000..8eaad4e16 --- /dev/null +++ b/components/core/src/clp/ffi/ir_stream/IrErrorCode.hpp @@ -0,0 +1,24 @@ +#ifndef CLP_IRERRORCODE_HPP +#define CLP_IRERRORCODE_HPP + +#include + +#include "../../error_handling/ErrorCode.hpp" + +namespace clp::ffi::ir_stream { +/** + * This enum class represents all possible error codes related to serializing or deserializing CLP + * IR streams. + */ +enum class IrErrorCodeEnum : uint8_t { + DecodingMethodFailure, + EndOfStream, + IncompleteStream, +}; + +using IrErrorCode = clp::error_handling::ErrorCode; +} // namespace clp::ffi::ir_stream + +CLP_ERROR_HANDLING_MARK_AS_ERROR_CODE_ENUM(clp::ffi::ir_stream::IrErrorCodeEnum); + +#endif // CLP_IRERRORCODE_HPP diff --git a/components/core/tests/test-error_handling.cpp b/components/core/tests/test-error_handling.cpp index 2d640ed57..44327c833 100644 --- a/components/core/tests/test-error_handling.cpp +++ b/components/core/tests/test-error_handling.cpp @@ -9,6 +9,7 @@ #include #include "../src/clp/error_handling/ErrorCode.hpp" +#include "../src/clp/ffi/ir_stream/IrErrorCode.hpp" using clp::error_handling::ErrorCategory; using clp::error_handling::ErrorCode; @@ -139,3 +140,17 @@ TEST_CASE("test_error_code_implementation", "[error_handling][ErrorCode]") { REQUIRE((AlwaysSuccessErrorCode{AlwaysSuccessErrorCodeEnum::Success} != success_error_code)); REQUIRE((BinaryErrorCode{BinaryErrorCodeEnum::Success} != always_success_error_code)); } + 
+TEST_CASE("test_ir_error_code", "[error_handling][ErrorCode][IrErrorCode]") { + using clp::ffi::ir_stream::IrErrorCode; + using clp::ffi::ir_stream::IrErrorCodeEnum; + + auto assert_error_code_matches_error_code_enum = [](IrErrorCodeEnum error_code_enum) -> bool { + std::error_code const error_code{IrErrorCode{error_code_enum}}; + return error_code == IrErrorCode{error_code_enum}; + }; + + REQUIRE(assert_error_code_matches_error_code_enum(IrErrorCodeEnum::DecodingMethodFailure)); + REQUIRE(assert_error_code_matches_error_code_enum(IrErrorCodeEnum::EndOfStream)); + REQUIRE(assert_error_code_matches_error_code_enum(IrErrorCodeEnum::IncompleteStream)); +} From 78a535cc079632a047ec34503f0782578af5be65 Mon Sep 17 00:00:00 2001 From: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> Date: Mon, 9 Dec 2024 18:44:42 -0500 Subject: [PATCH 48/65] feat(ffi): Add support for auto/user-generated KV-pairs in `KeyValuePairLogEvent`; Detect and invalidate duplicate keys among non-leaf nodes when constructing a `KeyValuePairLogEvent`. 
(#558) Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../core/src/clp/ffi/KeyValuePairLogEvent.cpp | 289 ++++++++++---- .../core/src/clp/ffi/KeyValuePairLogEvent.hpp | 101 +++-- components/core/src/clp/ffi/SchemaTree.hpp | 6 + .../src/clp/ffi/ir_stream/Deserializer.hpp | 17 +- .../ir_unit_deserialization_methods.cpp | 7 +- .../ir_unit_deserialization_methods.hpp | 11 +- .../tests/test-ffi_IrUnitHandlerInterface.cpp | 12 +- .../tests/test-ffi_KeyValuePairLogEvent.cpp | 360 ++++++++++++++---- .../core/tests/test-ir_encoding_methods.cpp | 6 +- 9 files changed, 611 insertions(+), 198 deletions(-) diff --git a/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp b/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp index a8a8cf617..8e8bb15f5 100644 --- a/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp +++ b/components/core/src/clp/ffi/KeyValuePairLogEvent.cpp @@ -153,6 +153,20 @@ node_type_matches_value_type(SchemaTree::Node::Type type, Value const& value) -> KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs ) -> bool; +/** + * @param node_id_value_pairs + * @param schema_tree + * @return A result containing a bitmap where every bit corresponds to the ID of a node in the + * schema tree, and the set bits correspond to the nodes in the subtree defined by all paths from + * the root node to the nodes in `node_id_value_pairs`; or an error code indicating a failure: + * - std::errc::result_out_of_range if a node ID in `node_id_value_pairs` doesn't exist in the + * schema tree. + */ +[[nodiscard]] auto get_schema_subtree_bitmap( + KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, + SchemaTree const& schema_tree +) -> OUTCOME_V2_NAMESPACE::std_result>; + /** * Inserts the given key-value pair into the JSON object (map). * @param node The schema tree node of the key to insert. 
@@ -175,6 +189,34 @@ node_type_matches_value_type(SchemaTree::Node::Type type, Value const& value) -> */ [[nodiscard]] auto decode_as_encoded_text_ast(Value const& val) -> std::optional; +/** + * Serializes the given node-ID-value pairs into a `nlohmann::json` object. + * @param schema_tree + * @param node_id_value_pairs + * @param schema_subtree_bitmap + * @return A result containing the serialized JSON object or an error code indicating the failure: + * - std::errc::protocol_error if a value in the log event couldn't be decoded, or it couldn't be + * inserted into a JSON object. + */ +[[nodiscard]] auto serialize_node_id_value_pairs_to_json( + SchemaTree const& schema_tree, + KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, + vector const& schema_subtree_bitmap +) -> OUTCOME_V2_NAMESPACE::std_result; + +/** + * @param node A non-root schema tree node. + * @param parent_node_id_to_key_names + * @return true if `node`'s key is unique among its sibling nodes with `parent_node_id_to_key_names` + * updated to keep track of this unique key name. + * @return false if a sibling of `node` has the same key. + */ +[[nodiscard]] auto check_key_uniqueness_among_sibling_nodes( + SchemaTree::Node const& node, + std::unordered_map>& + parent_node_id_to_key_names +) -> bool; + auto node_type_matches_value_type(SchemaTree::Node::Type type, Value const& value) -> bool { switch (type) { case SchemaTree::Node::Type::Obj: @@ -202,6 +244,7 @@ auto validate_node_id_value_pairs( try { std::unordered_map> parent_node_id_to_key_names; + std::vector key_duplication_checked_node_id_bitmap(schema_tree.get_size(), false); for (auto const& [node_id, value] : node_id_value_pairs) { auto const& node{schema_tree.get_node(node_id)}; if (node.is_root()) { @@ -226,20 +269,38 @@ auto validate_node_id_value_pairs( return std::errc::operation_not_permitted; } - // We checked that the node isn't the root above, so we can query the underlying ID - // safely without a repeated check. 
- auto const parent_node_id{node.get_parent_id_unsafe()}; - auto const key_name{node.get_key_name()}; - if (parent_node_id_to_key_names.contains(parent_node_id)) { - auto const [it, new_key_inserted]{ - parent_node_id_to_key_names.at(parent_node_id).emplace(key_name) - }; - if (false == new_key_inserted) { - // The key is duplicated under the same parent + if (false + == check_key_uniqueness_among_sibling_nodes(node, parent_node_id_to_key_names)) + { + return std::errc::protocol_not_supported; + } + + // Iteratively check if there's any key duplication in the node's ancestors until: + // 1. The ancestor has already been checked. We only need to check an ancestor node + // once since if there are key duplications among its siblings, it would've been + // caught when the sibling was first checked (the order in which siblings get checked + // doesn't affect the results). + // 2. We reach the root node. + auto next_ancestor_node_id_to_check{node.get_parent_id_unsafe()}; + while (false == key_duplication_checked_node_id_bitmap[next_ancestor_node_id_to_check]) + { + auto const& node_to_check{schema_tree.get_node(next_ancestor_node_id_to_check)}; + if (node_to_check.is_root()) { + key_duplication_checked_node_id_bitmap[node_to_check.get_id()] = true; + break; + } + + if (false + == check_key_uniqueness_among_sibling_nodes( + node_to_check, + parent_node_id_to_key_names + )) + { return std::errc::protocol_not_supported; } - } else { - parent_node_id_to_key_names.emplace(parent_node_id, std::unordered_set{key_name}); + + key_duplication_checked_node_id_bitmap[next_ancestor_node_id_to_check] = true; + next_ancestor_node_id_to_check = node_to_check.get_parent_id_unsafe(); } } } catch (SchemaTree::OperationFailed const& ex) { @@ -269,6 +330,38 @@ auto is_leaf_node( return true; } +auto get_schema_subtree_bitmap( + KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, + SchemaTree const& schema_tree +) -> OUTCOME_V2_NAMESPACE::std_result> { + vector 
schema_subtree_bitmap(schema_tree.get_size(), false); + for (auto const& [node_id, val] : node_id_value_pairs) { + if (node_id >= schema_subtree_bitmap.size()) { + return std::errc::result_out_of_range; + } + schema_subtree_bitmap[node_id] = true; + + // Iteratively mark the parents as true + auto optional_parent_id{schema_tree.get_node(node_id).get_parent_id()}; + while (true) { + // Ideally, we'd use this if statement as the loop condition, but clang-tidy will + // complain about an unchecked `optional` access. + if (false == optional_parent_id.has_value()) { + // Reached the root + break; + } + auto const parent_id{optional_parent_id.value()}; + if (schema_subtree_bitmap[parent_id]) { + // Parent already set by other child + break; + } + schema_subtree_bitmap[parent_id] = true; + optional_parent_id = schema_tree.get_node(parent_id).get_parent_id(); + } + } + return schema_subtree_bitmap; +} + auto insert_kv_pair_into_json_obj( SchemaTree::Node const& node, std::optional const& optional_val, @@ -332,54 +425,13 @@ auto decode_as_encoded_text_ast(Value const& val) -> std::optional { ? 
val.get_immutable_view().decode_and_unparse() : val.get_immutable_view().decode_and_unparse(); } -} // namespace - -auto KeyValuePairLogEvent::create( - std::shared_ptr schema_tree, - NodeIdValuePairs node_id_value_pairs, - UtcOffset utc_offset -) -> OUTCOME_V2_NAMESPACE::std_result { - if (auto const ret_val{validate_node_id_value_pairs(*schema_tree, node_id_value_pairs)}; - std::errc{} != ret_val) - { - return ret_val; - } - return KeyValuePairLogEvent{std::move(schema_tree), std::move(node_id_value_pairs), utc_offset}; -} - -auto KeyValuePairLogEvent::get_schema_subtree_bitmap( -) const -> OUTCOME_V2_NAMESPACE::std_result> { - auto schema_subtree_bitmap{vector(m_schema_tree->get_size(), false)}; - for (auto const& [node_id, val] : m_node_id_value_pairs) { - if (node_id >= schema_subtree_bitmap.size()) { - return std::errc::result_out_of_range; - } - schema_subtree_bitmap[node_id] = true; - - // Iteratively mark the parents as true - auto optional_parent_id{m_schema_tree->get_node(node_id).get_parent_id()}; - while (true) { - // Ideally, we'd use this if statement as the loop condition, but clang-tidy will - // complain about an unchecked `optional` access. 
- if (false == optional_parent_id.has_value()) { - // Reached the root - break; - } - auto const parent_id{optional_parent_id.value()}; - if (schema_subtree_bitmap[parent_id]) { - // Parent already set by other child - break; - } - schema_subtree_bitmap[parent_id] = true; - optional_parent_id = m_schema_tree->get_node(parent_id).get_parent_id(); - } - } - return schema_subtree_bitmap; -} -auto KeyValuePairLogEvent::serialize_to_json( -) const -> OUTCOME_V2_NAMESPACE::std_result { - if (m_node_id_value_pairs.empty()) { +auto serialize_node_id_value_pairs_to_json( + SchemaTree const& schema_tree, + KeyValuePairLogEvent::NodeIdValuePairs const& node_id_value_pairs, + vector const& schema_subtree_bitmap +) -> OUTCOME_V2_NAMESPACE::std_result { + if (node_id_value_pairs.empty()) { return nlohmann::json::object(); } @@ -393,12 +445,6 @@ auto KeyValuePairLogEvent::serialize_to_json( // vector grows). std::stack dfs_stack; - auto const schema_subtree_bitmap_ret{get_schema_subtree_bitmap()}; - if (schema_subtree_bitmap_ret.has_error()) { - return schema_subtree_bitmap_ret.error(); - } - auto const& schema_subtree_bitmap{schema_subtree_bitmap_ret.value()}; - // Traverse the schema tree in DFS order, but only traverse the nodes that are set in // `schema_subtree_bitmap`. // @@ -408,7 +454,7 @@ auto KeyValuePairLogEvent::serialize_to_json( // // On the way up, add the current node's `nlohmann::json::object_t` to the parent's // `nlohmann::json::object_t`. 
- auto const& root_schema_tree_node{m_schema_tree->get_root()}; + auto const& root_schema_tree_node{schema_tree.get_root()}; auto root_json_obj = nlohmann::json::object_t(); dfs_stack.emplace( @@ -424,13 +470,13 @@ auto KeyValuePairLogEvent::serialize_to_json( continue; } auto const child_schema_tree_node_id{top.get_next_child_schema_tree_node()}; - auto const& child_schema_tree_node{m_schema_tree->get_node(child_schema_tree_node_id)}; - if (m_node_id_value_pairs.contains(child_schema_tree_node_id)) { + auto const& child_schema_tree_node{schema_tree.get_node(child_schema_tree_node_id)}; + if (node_id_value_pairs.contains(child_schema_tree_node_id)) { // Handle leaf node if (false == insert_kv_pair_into_json_obj( child_schema_tree_node, - m_node_id_value_pairs.at(child_schema_tree_node_id), + node_id_value_pairs.at(child_schema_tree_node_id), top.get_json_obj() )) { @@ -452,4 +498,109 @@ auto KeyValuePairLogEvent::serialize_to_json( return root_json_obj; } + +auto check_key_uniqueness_among_sibling_nodes( + SchemaTree::Node const& node, + std::unordered_map>& + parent_node_id_to_key_names +) -> bool { + // The caller checks that the given node is not the root, so we can query the underlying + // parent ID safely without a check. 
+ auto const parent_node_id{node.get_parent_id_unsafe()}; + auto const key_name{node.get_key_name()}; + auto const parent_node_id_to_key_names_it{parent_node_id_to_key_names.find(parent_node_id)}; + if (parent_node_id_to_key_names_it != parent_node_id_to_key_names.end()) { + auto const [it, new_key_inserted]{parent_node_id_to_key_names_it->second.emplace(key_name)}; + if (false == new_key_inserted) { + // The key is duplicated under the same parent + return false; + } + } else { + parent_node_id_to_key_names.emplace(parent_node_id, std::unordered_set{key_name}); + } + return true; +} +} // namespace + +auto KeyValuePairLogEvent::create( + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + NodeIdValuePairs auto_gen_node_id_value_pairs, + NodeIdValuePairs user_gen_node_id_value_pairs, + UtcOffset utc_offset +) -> OUTCOME_V2_NAMESPACE::std_result { + if (nullptr == auto_gen_keys_schema_tree || nullptr == user_gen_keys_schema_tree) { + return std::errc::invalid_argument; + } + + if (auto const ret_val{validate_node_id_value_pairs( + *auto_gen_keys_schema_tree, + auto_gen_node_id_value_pairs + )}; + std::errc{} != ret_val) + { + return ret_val; + } + + if (auto const ret_val{validate_node_id_value_pairs( + *user_gen_keys_schema_tree, + user_gen_node_id_value_pairs + )}; + std::errc{} != ret_val) + { + return ret_val; + } + + return KeyValuePairLogEvent{ + std::move(auto_gen_keys_schema_tree), + std::move(user_gen_keys_schema_tree), + std::move(auto_gen_node_id_value_pairs), + std::move(user_gen_node_id_value_pairs), + utc_offset + }; +} + +auto KeyValuePairLogEvent::get_auto_gen_keys_schema_subtree_bitmap( +) const -> OUTCOME_V2_NAMESPACE::std_result> { + return get_schema_subtree_bitmap(m_auto_gen_node_id_value_pairs, *m_auto_gen_keys_schema_tree); +} + +auto KeyValuePairLogEvent::get_user_gen_keys_schema_subtree_bitmap( +) const -> outcome_v2::std_result> { + return get_schema_subtree_bitmap(m_user_gen_node_id_value_pairs, 
*m_user_gen_keys_schema_tree); +} + +auto KeyValuePairLogEvent::serialize_to_json( +) const -> OUTCOME_V2_NAMESPACE::std_result> { + auto const auto_gen_keys_schema_subtree_bitmap_result{get_auto_gen_keys_schema_subtree_bitmap() + }; + if (auto_gen_keys_schema_subtree_bitmap_result.has_error()) { + return auto_gen_keys_schema_subtree_bitmap_result.error(); + } + auto serialized_auto_gen_kv_pairs_result{serialize_node_id_value_pairs_to_json( + *m_auto_gen_keys_schema_tree, + m_auto_gen_node_id_value_pairs, + auto_gen_keys_schema_subtree_bitmap_result.value() + )}; + if (serialized_auto_gen_kv_pairs_result.has_error()) { + return serialized_auto_gen_kv_pairs_result.error(); + } + + auto const user_gen_keys_schema_subtree_bitmap_result{get_user_gen_keys_schema_subtree_bitmap() + }; + if (user_gen_keys_schema_subtree_bitmap_result.has_error()) { + return user_gen_keys_schema_subtree_bitmap_result.error(); + } + auto serialized_user_gen_kv_pairs_result{serialize_node_id_value_pairs_to_json( + *m_user_gen_keys_schema_tree, + m_user_gen_node_id_value_pairs, + user_gen_keys_schema_subtree_bitmap_result.value() + )}; + if (serialized_user_gen_kv_pairs_result.has_error()) { + return serialized_user_gen_kv_pairs_result.error(); + } + + return {std::move(serialized_auto_gen_kv_pairs_result.value()), + std::move(serialized_user_gen_kv_pairs_result.value())}; +} } // namespace clp::ffi diff --git a/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp b/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp index f6334d378..2929c7498 100644 --- a/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp +++ b/components/core/src/clp/ffi/KeyValuePairLogEvent.hpp @@ -17,10 +17,13 @@ namespace clp::ffi { /** * A log event containing key-value pairs. Each event contains: - * - A collection of node-ID & value pairs, where each pair represents a leaf `SchemaTreeNode` in - * the `SchemaTree`. 
- * - A reference to the `SchemaTree` - * - The UTC offset of the current log event + * - A reference to the schema tree for auto-generated keys. + * - A reference to the schema tree for user-generated keys. + * - A collection of auto-generated node-ID & value pairs, where each pair represents a leaf + * `SchemaTree::Node` in the schema tree for auto-generated keys. + * - A collection of user-generated node-ID & value pairs, where each pair represents a leaf + * `SchemaTree::Node` in the schema tree for user-generated keys. + * - The UTC offset of the current log event. */ class KeyValuePairLogEvent { public: @@ -29,15 +32,21 @@ class KeyValuePairLogEvent { // Factory functions /** - * @param schema_tree - * @param node_id_value_pairs + * @param auto_gen_keys_schema_tree + * @param user_gen_keys_schema_tree + * @param auto_gen_node_id_value_pairs + * @param user_gen_node_id_value_pairs * @param utc_offset * @return A result containing the key-value pair log event or an error code indicating the - * failure. See `validate_node_id_value_pairs` for the possible error codes. + * failure: + * - std::errc::invalid_argument if any of the given schema tree pointers are null. + * - Forwards `validate_node_id_value_pairs`'s return values. 
*/ [[nodiscard]] static auto create( - std::shared_ptr schema_tree, - NodeIdValuePairs node_id_value_pairs, + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + NodeIdValuePairs auto_gen_node_id_value_pairs, + NodeIdValuePairs user_gen_node_id_value_pairs, UtcOffset utc_offset ) -> OUTCOME_V2_NAMESPACE::std_result; @@ -53,51 +62,77 @@ class KeyValuePairLogEvent { ~KeyValuePairLogEvent() = default; // Methods - [[nodiscard]] auto get_schema_tree() const -> SchemaTree const& { return *m_schema_tree; } + [[nodiscard]] auto get_auto_gen_keys_schema_tree() const -> SchemaTree const& { + return *m_auto_gen_keys_schema_tree; + } - [[nodiscard]] auto get_node_id_value_pairs() const -> NodeIdValuePairs const& { - return m_node_id_value_pairs; + [[nodiscard]] auto get_user_gen_keys_schema_tree() const -> SchemaTree const& { + return *m_user_gen_keys_schema_tree; } - [[nodiscard]] auto get_utc_offset() const -> UtcOffset { return m_utc_offset; } + [[nodiscard]] auto get_auto_gen_node_id_value_pairs() const -> NodeIdValuePairs const& { + return m_auto_gen_node_id_value_pairs; + } + + [[nodiscard]] auto get_user_gen_node_id_value_pairs() const -> NodeIdValuePairs const& { + return m_user_gen_node_id_value_pairs; + } /** * @return A result containing a bitmap where every bit corresponds to the ID of a node in the - * schema tree, and the set bits correspond to the nodes in the subtree defined by all paths - * from the root node to the nodes in `node_id_value_pairs`; or an error code indicating a - * failure: - * - std::errc::result_out_of_range if a node ID in `node_id_value_pairs` doesn't exist in the - * schema tree. + * schema tree for auto-generated keys, and the set bits correspond to the nodes in the subtree + * defined by all paths from the root node to the nodes in `m_auto_gen_node_id_value_pairs`; or + * an error code indicating a failure: + * - Forwards `get_schema_subtree_bitmap`'s return values. 
*/ - [[nodiscard]] auto get_schema_subtree_bitmap( + [[nodiscard]] auto get_auto_gen_keys_schema_subtree_bitmap( ) const -> OUTCOME_V2_NAMESPACE::std_result>; /** - * Serializes the log event into a `nlohmann::json` object. - * @return A result containing the serialized JSON object or an error code indicating the - * failure: - * - std::errc::protocol_error if a value in the log event couldn't be decoded or it couldn't be - * inserted into a JSON object. - * - std::errc::result_out_of_range if a node ID in the log event doesn't exist in the schema - * tree. + * @return A result containing a bitmap where every bit corresponds to the ID of a node in the + * schema tree for user-generated keys, and the set bits correspond to the nodes in the subtree + * defined by all paths from the root node to the nodes in `m_user_gen_node_id_value_pairs`; or + * an error code indicating a failure: + * - Forwards `get_schema_subtree_bitmap`'s return values. + */ + [[nodiscard]] auto get_user_gen_keys_schema_subtree_bitmap( + ) const -> OUTCOME_V2_NAMESPACE::std_result>; + + [[nodiscard]] auto get_utc_offset() const -> UtcOffset { return m_utc_offset; } + + /** + * Serializes the log event into `nlohmann::json` objects. + * @return A result containing a pair or an error code indicating the failure: + * - The pair: + * - Serialized auto-generated key-value pairs as a JSON object + * - Serialized user-generated key-value pairs as a JSON object + * - The possible error codes: + * - Forwards `get_auto_gen_keys_schema_subtree_bitmap`'s return values on failure. + * - Forwards `serialize_node_id_value_pairs_to_json`'s return values on failure. 
*/ [[nodiscard]] auto serialize_to_json( - ) const -> OUTCOME_V2_NAMESPACE::std_result; + ) const -> OUTCOME_V2_NAMESPACE::std_result>; private: // Constructor KeyValuePairLogEvent( - std::shared_ptr schema_tree, - NodeIdValuePairs node_id_value_pairs, + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + NodeIdValuePairs auto_gen_node_id_value_pairs, + NodeIdValuePairs user_gen_node_id_value_pairs, UtcOffset utc_offset ) - : m_schema_tree{std::move(schema_tree)}, - m_node_id_value_pairs{std::move(node_id_value_pairs)}, + : m_auto_gen_keys_schema_tree{std::move(auto_gen_keys_schema_tree)}, + m_user_gen_keys_schema_tree{std::move(user_gen_keys_schema_tree)}, + m_auto_gen_node_id_value_pairs{std::move(auto_gen_node_id_value_pairs)}, + m_user_gen_node_id_value_pairs{std::move(user_gen_node_id_value_pairs)}, m_utc_offset{utc_offset} {} // Variables - std::shared_ptr m_schema_tree; - NodeIdValuePairs m_node_id_value_pairs; + std::shared_ptr m_auto_gen_keys_schema_tree; + std::shared_ptr m_user_gen_keys_schema_tree; + NodeIdValuePairs m_auto_gen_node_id_value_pairs; + NodeIdValuePairs m_user_gen_node_id_value_pairs; UtcOffset m_utc_offset{0}; }; } // namespace clp::ffi diff --git a/components/core/src/clp/ffi/SchemaTree.hpp b/components/core/src/clp/ffi/SchemaTree.hpp index 46494fa71..4efbbf81e 100644 --- a/components/core/src/clp/ffi/SchemaTree.hpp +++ b/components/core/src/clp/ffi/SchemaTree.hpp @@ -128,6 +128,8 @@ class SchemaTree { ~Node() = default; // Methods + [[nodiscard]] auto operator==(Node const& rhs) const -> bool = default; + [[nodiscard]] auto get_id() const -> id_t { return m_id; } [[nodiscard]] auto is_root() const -> bool { return false == m_parent_id.has_value(); } @@ -249,6 +251,10 @@ class SchemaTree { ~SchemaTree() = default; // Methods + [[nodiscard]] auto operator==(SchemaTree const& rhs) const -> bool { + return m_tree_nodes == rhs.m_tree_nodes; + } + [[nodiscard]] auto get_size() const -> size_t { return 
m_tree_nodes.size(); } [[nodiscard]] auto get_root() const -> Node const& { return m_tree_nodes[cRootId]; } diff --git a/components/core/src/clp/ffi/ir_stream/Deserializer.hpp b/components/core/src/clp/ffi/ir_stream/Deserializer.hpp index 3418a39ae..d31699cd2 100644 --- a/components/core/src/clp/ffi/ir_stream/Deserializer.hpp +++ b/components/core/src/clp/ffi/ir_stream/Deserializer.hpp @@ -115,7 +115,8 @@ class Deserializer { Deserializer(IrUnitHandler ir_unit_handler) : m_ir_unit_handler{std::move(ir_unit_handler)} {} // Variables - std::shared_ptr m_schema_tree{std::make_shared()}; + std::shared_ptr m_auto_gen_keys_schema_tree{std::make_shared()}; + std::shared_ptr m_user_gen_keys_schema_tree{std::make_shared()}; UtcOffset m_utc_offset{0}; IrUnitHandler m_ir_unit_handler; bool m_is_complete{false}; @@ -183,9 +184,13 @@ auto Deserializer::deserialize_next_ir_unit(ReaderInterface& read auto const ir_unit_type{optional_ir_unit_type.value()}; switch (ir_unit_type) { case IrUnitType::LogEvent: { - auto result{ - deserialize_ir_unit_kv_pair_log_event(reader, tag, m_schema_tree, m_utc_offset) - }; + auto result{deserialize_ir_unit_kv_pair_log_event( + reader, + tag, + m_auto_gen_keys_schema_tree, + m_user_gen_keys_schema_tree, + m_utc_offset + )}; if (result.has_error()) { return result.error(); } @@ -207,7 +212,7 @@ auto Deserializer::deserialize_next_ir_unit(ReaderInterface& read } auto const node_locator{result.value()}; - if (m_schema_tree->has_node(node_locator)) { + if (m_user_gen_keys_schema_tree->has_node(node_locator)) { return std::errc::protocol_error; } @@ -217,7 +222,7 @@ auto Deserializer::deserialize_next_ir_unit(ReaderInterface& read return ir_error_code_to_errc(err); } - std::ignore = m_schema_tree->insert_node(node_locator); + std::ignore = m_user_gen_keys_schema_tree->insert_node(node_locator); break; } diff --git a/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp 
b/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp index 5e1813a3e..cea4a1b84 100644 --- a/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp +++ b/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp @@ -551,7 +551,8 @@ auto deserialize_ir_unit_utc_offset_change(ReaderInterface& reader auto deserialize_ir_unit_kv_pair_log_event( ReaderInterface& reader, encoded_tag_t tag, - std::shared_ptr schema_tree, + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, UtcOffset utc_offset ) -> OUTCOME_V2_NAMESPACE::std_result { auto const schema_result{deserialize_schema(reader, tag)}; @@ -579,7 +580,9 @@ auto deserialize_ir_unit_kv_pair_log_event( } return KeyValuePairLogEvent::create( - std::move(schema_tree), + std::move(auto_gen_keys_schema_tree), + std::move(user_gen_keys_schema_tree), + {}, std::move(node_id_value_pairs), utc_offset ); diff --git a/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp b/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp index 68ed4408b..451f627db 100644 --- a/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp +++ b/components/core/src/clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp @@ -57,10 +57,12 @@ namespace clp::ffi::ir_stream { * Deserializes a key-value pair log event IR unit. * @param reader * @param tag - * @param schema_tree Schema tree used to construct the KV-pair log event. + * @param auto_gen_keys_schema_tree Schema tree for auto-generated keys, used to construct the + * KV-pair log event. + * @param user_gen_keys_schema_tree Schema tree for user-generated keys, used to construct the + * KV-pair log event. * @param utc_offset UTC offset used to construct the KV-pair log event. 
- * @return A result containing the deserialized log event or an error code indicating the - * failure: + * @return A result containing the deserialized log event or an error code indicating the failure: * - std::errc::result_out_of_range if the IR stream is truncated. * - std::errc::protocol_error if the IR stream is corrupted. * - std::errc::protocol_not_supported if the IR stream contains an unsupported metadata format @@ -72,7 +74,8 @@ namespace clp::ffi::ir_stream { [[nodiscard]] auto deserialize_ir_unit_kv_pair_log_event( ReaderInterface& reader, encoded_tag_t tag, - std::shared_ptr schema_tree, + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, UtcOffset utc_offset ) -> OUTCOME_V2_NAMESPACE::std_result; } // namespace clp::ffi::ir_stream diff --git a/components/core/tests/test-ffi_IrUnitHandlerInterface.cpp b/components/core/tests/test-ffi_IrUnitHandlerInterface.cpp index 5b8ad82cd..8f76a2f1a 100644 --- a/components/core/tests/test-ffi_IrUnitHandlerInterface.cpp +++ b/components/core/tests/test-ffi_IrUnitHandlerInterface.cpp @@ -87,9 +87,13 @@ auto test_ir_unit_handler_interface(clp::ffi::ir_stream::IrUnitHandlerInterface auto test_ir_unit_handler_interface(clp::ffi::ir_stream::IrUnitHandlerInterface auto& handler ) -> void { - auto test_log_event_result{ - KeyValuePairLogEvent::create(std::make_shared(), {}, cTestUtcOffset) - }; + auto test_log_event_result{KeyValuePairLogEvent::create( + std::make_shared(), + std::make_shared(), + {}, + {}, + cTestUtcOffset + )}; REQUIRE( (false == test_log_event_result.has_error() && IRErrorCode::IRErrorCode_Success @@ -127,7 +131,7 @@ TEMPLATE_TEST_CASE( REQUIRE( (optional_log_event.has_value() && optional_log_event.value().get_utc_offset() == cTestUtcOffset - && optional_log_event.value().get_node_id_value_pairs().empty()) + && optional_log_event.value().get_user_gen_node_id_value_pairs().empty()) ); auto const& optional_schema_tree_locator{handler.get_schema_tree_node_locator()}; 
REQUIRE( diff --git a/components/core/tests/test-ffi_KeyValuePairLogEvent.cpp b/components/core/tests/test-ffi_KeyValuePairLogEvent.cpp index 2e9cfb691..9ffee4f68 100644 --- a/components/core/tests/test-ffi_KeyValuePairLogEvent.cpp +++ b/components/core/tests/test-ffi_KeyValuePairLogEvent.cpp @@ -11,6 +11,7 @@ #include #include +#include #include "../src/clp/ffi/encoding_methods.hpp" #include "../src/clp/ffi/KeyValuePairLogEvent.hpp" @@ -81,6 +82,25 @@ auto insert_invalid_node_id_value_pairs_with_node_type_errors( KeyValuePairLogEvent::NodeIdValuePairs& invalid_node_id_value_pairs ) -> void; +/** + * Asserts that `KeyValuePairLogEvent` creation fails with the expected error code. + * @param auto_gen_keys_schema_tree + * @param user_gen_keys_schema_tree + * @param auto_gen_node_id_value_pairs + * @param user_gen_node_id_value_pairs + * @param utc_offset + * @param expected_error_code + * @return Whether the assertion succeeded. + */ +[[nodiscard]] auto assert_kv_pair_log_event_creation_failure( + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + KeyValuePairLogEvent::NodeIdValuePairs auto_gen_node_id_value_pairs, + KeyValuePairLogEvent::NodeIdValuePairs user_gen_node_id_value_pairs, + UtcOffset utc_offset, + std::errc expected_error_code +) -> bool; + template requires(std::is_same_v || std::is_same_v) @@ -197,6 +217,24 @@ auto insert_invalid_node_id_value_pairs_with_node_type_errors( invalid_node_id_value_pairs.emplace(node_id, Value{}); } } + +auto assert_kv_pair_log_event_creation_failure( + std::shared_ptr auto_gen_keys_schema_tree, + std::shared_ptr user_gen_keys_schema_tree, + KeyValuePairLogEvent::NodeIdValuePairs auto_gen_node_id_value_pairs, + KeyValuePairLogEvent::NodeIdValuePairs user_gen_node_id_value_pairs, + UtcOffset utc_offset, + std::errc expected_error_code +) -> bool { + auto const result{KeyValuePairLogEvent::create( + std::move(auto_gen_keys_schema_tree), + std::move(user_gen_keys_schema_tree), + 
std::move(auto_gen_node_id_value_pairs), + std::move(user_gen_node_id_value_pairs), + utc_offset + )}; + return result.has_error() && result.error() == expected_error_code; +} } // namespace TEST_CASE("ffi_Value_basic", "[ffi][Value]") { @@ -250,22 +288,23 @@ TEST_CASE("ffi_KeyValuePairLogEvent_create", "[ffi]") { * | * |------------> <1:a:Obj> * | | - * |--> <2:a:Int> |--> <3:b:Obj> - * | - * |------------> <4:c:Obj> - * | | - * |--> <5:d:Str> |--> <7:a:UnstructuredArray> - * | | - * |--> <6:d:Bool> |--> <8:d:Str> - * | | - * |--> <10:e:Obj> |--> <9:d:Float> - * | - * |--> <11:f:Obj> + * |--> <2:b:Int> |--> <3:b:Obj> + * | | | + * |--> <12:a:Int> | |------------> <4:c:Obj> + * | | | + * | |--> <5:d:Str> |--> <7:a:UnstructuredArray> + * | | | + * | |--> <6:d:Bool> |--> <8:d:Str> + * | | | + * | |--> <10:e:Obj> |--> <9:d:Float> + * | | + * |--> <13:b:Bool> |--> <11:f:Obj> */ - auto const schema_tree{std::make_shared()}; + auto const auto_gen_keys_schema_tree{std::make_shared()}; + auto const user_gen_keys_schema_tree{std::make_shared()}; std::vector const locators{ {SchemaTree::cRootId, "a", SchemaTree::Node::Type::Obj}, - {SchemaTree::cRootId, "a", SchemaTree::Node::Type::Int}, + {SchemaTree::cRootId, "b", SchemaTree::Node::Type::Int}, {1, "b", SchemaTree::Node::Type::Obj}, {3, "c", SchemaTree::Node::Type::Obj}, {3, "d", SchemaTree::Node::Type::Str}, @@ -274,63 +313,88 @@ TEST_CASE("ffi_KeyValuePairLogEvent_create", "[ffi]") { {4, "d", SchemaTree::Node::Type::Str}, {4, "d", SchemaTree::Node::Type::Float}, {3, "e", SchemaTree::Node::Type::Obj}, - {4, "f", SchemaTree::Node::Type::Obj} + {4, "f", SchemaTree::Node::Type::Obj}, + {SchemaTree::cRootId, "a", SchemaTree::Node::Type::Int}, + {1, "b", SchemaTree::Node::Type::Bool} }; for (auto const& locator : locators) { - REQUIRE_NOTHROW(schema_tree->insert_node(locator)); + REQUIRE_NOTHROW(auto_gen_keys_schema_tree->insert_node(locator)); + REQUIRE_NOTHROW(user_gen_keys_schema_tree->insert_node(locator)); } + 
REQUIRE((*auto_gen_keys_schema_tree == *user_gen_keys_schema_tree)); + SECTION("Test empty ID-value pairs") { - KeyValuePairLogEvent::NodeIdValuePairs node_id_value_pairs; auto const result{KeyValuePairLogEvent::create( - schema_tree, - std::move(node_id_value_pairs), + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + {}, + {}, UtcOffset{0} )}; REQUIRE_FALSE(result.has_error()); } + SECTION("Test schema tree pointers being null") { + REQUIRE(assert_kv_pair_log_event_creation_failure( + nullptr, + user_gen_keys_schema_tree, + {}, + {}, + UtcOffset{0}, + std::errc::invalid_argument + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + nullptr, + {}, + {}, + UtcOffset{0}, + std::errc::invalid_argument + )); + } + SECTION("Test mismatched types") { KeyValuePairLogEvent::NodeIdValuePairs invalid_node_id_value_pairs; // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) // Int: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 2, invalid_node_id_value_pairs ); // Float: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 9, invalid_node_id_value_pairs ); // Bool: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 6, invalid_node_id_value_pairs ); // Str: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 5, invalid_node_id_value_pairs ); // UnstructuredArray: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 7, invalid_node_id_value_pairs ); // Obj: insert_invalid_node_id_value_pairs_with_node_type_errors( - *schema_tree, + *user_gen_keys_schema_tree, 3, invalid_node_id_value_pairs ); @@ -343,26 +407,37 @@ TEST_CASE("ffi_KeyValuePairLogEvent_create", "[ffi]") { } else { node_id_value_pair_to_test.emplace(node_id, std::nullopt); } - auto 
const result{KeyValuePairLogEvent::create( - schema_tree, - std::move(node_id_value_pair_to_test), - UtcOffset{0} - )}; - REQUIRE(result.has_error()); - auto const& err{result.error()}; - REQUIRE((std::errc::protocol_error == err)); + + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + node_id_value_pair_to_test, + {}, + UtcOffset{0}, + std::errc::protocol_error + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + {}, + node_id_value_pair_to_test, + UtcOffset{0}, + std::errc::protocol_error + )); } } SECTION("Test valid ID-value pairs") { - KeyValuePairLogEvent::NodeIdValuePairs node_id_value_pairs; + constexpr std::string_view cJsonArrayToEncode{"[\"a\", 1, 0.1, null]"}; + constexpr std::string_view cStaticText{"Test"}; + KeyValuePairLogEvent::NodeIdValuePairs valid_node_id_value_pairs; /* * The sub schema tree of `node_id_value_pairs`: * <0:root:Obj> * | * |------------> <1:a:Obj> * | | - * |--> <2:a:Int> |--> <3:b:Obj> + * |--> <2:b:Int> |--> <3:b:Obj> * | * |------------> <4:c:Obj> * | | @@ -375,77 +450,206 @@ TEST_CASE("ffi_KeyValuePairLogEvent_create", "[ffi]") { * |--> <11:f:Obj> */ // NOLINTBEGIN(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - node_id_value_pairs.emplace(2, Value{static_cast(0)}); - node_id_value_pairs.emplace(5, Value{string{"Test"}}); - node_id_value_pairs.emplace( + valid_node_id_value_pairs.emplace(2, Value{static_cast(0)}); + valid_node_id_value_pairs.emplace(5, Value{string{cStaticText}}); + valid_node_id_value_pairs.emplace( 8, Value{get_encoded_text_ast(cStringToEncode)} ); - node_id_value_pairs.emplace( + valid_node_id_value_pairs.emplace( 7, - Value{get_encoded_text_ast(cStringToEncode)} + Value{get_encoded_text_ast(cJsonArrayToEncode)} ); - node_id_value_pairs.emplace(10, Value{}); - node_id_value_pairs.emplace(11, std::nullopt); + valid_node_id_value_pairs.emplace(10, Value{}); + 
valid_node_id_value_pairs.emplace(11, std::nullopt); // NOLINTEND(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) - }; + auto const result{KeyValuePairLogEvent::create( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0} + )}; REQUIRE_FALSE(result.has_error()); - SECTION("Test duplicated key conflict on node #3") { - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - node_id_value_pairs.emplace(6, Value{static_cast(false)}); - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) + SECTION("Test JSON serialization") { + nlohmann::json const subtree_rooted_at_node_4 + = {{"a", nlohmann::json::parse(cJsonArrayToEncode)}, + {"d", cStringToEncode}, + {"f", nlohmann::json::object_t()}}; + nlohmann::json const subtree_rooted_at_node_3 + = {{"c", subtree_rooted_at_node_4}, {"d", cStaticText}, {"e", nullptr}}; + nlohmann::json const expected = { + {"a", {{"b", subtree_rooted_at_node_3}}}, + {"b", 0}, }; - REQUIRE(result.has_error()); - REQUIRE((std::errc::protocol_not_supported == result.error())); + + auto const& kv_pair_log_event{result.value()}; + auto const serialized_json_result{kv_pair_log_event.serialize_to_json()}; + REQUIRE_FALSE(serialized_json_result.has_error()); + auto const& [serialized_auto_gen_kv_pairs, serialized_user_gen_kv_pairs]{ + serialized_json_result.value() + }; + REQUIRE((serialized_auto_gen_kv_pairs == expected)); + REQUIRE((serialized_user_gen_kv_pairs == expected)); } - SECTION("Test duplicated key conflict on node #4") { + SECTION("Test duplicated key conflict under node #3") { + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) - node_id_value_pairs.emplace(9, 
Value{static_cast(0.0)}); - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) - }; - REQUIRE(result.has_error()); - REQUIRE((std::errc::protocol_not_supported == result.error())); + invalid_node_id_value_pairs.emplace(6, Value{static_cast(false)}); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + } + + SECTION("Test duplicated key conflict under node #4") { + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + invalid_node_id_value_pairs.emplace(9, Value{static_cast(0.0)}); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + } + + SECTION("Test duplicated keys among siblings of node #1") { + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + invalid_node_id_value_pairs.emplace(12, static_cast(0)); + // Node #12 has the same key as its sibling node #1 + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + 
UtcOffset{0}, + std::errc::protocol_not_supported + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + } + + SECTION("Test duplicated keys among siblings of node #3") { + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,readability-magic-numbers) + invalid_node_id_value_pairs.emplace(13, false); + // Node #13 has the same key as its sibling node #3 + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::protocol_not_supported + )); } SECTION("Test invalid sub-tree on node #3") { - node_id_value_pairs.emplace(3, std::nullopt); - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) - }; + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + invalid_node_id_value_pairs.emplace(3, std::nullopt); // Node #3 is empty, but its descendants appear in the sub schema tree (node #5 & #10) - REQUIRE(result.has_error()); - REQUIRE((std::errc::operation_not_permitted == result.error())); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::operation_not_permitted + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + 
std::errc::operation_not_permitted + )); } SECTION("Test invalid sub-tree on node #4") { - node_id_value_pairs.emplace(4, Value{}); - auto const result{ - KeyValuePairLogEvent::create(schema_tree, node_id_value_pairs, UtcOffset{0}) - }; + auto invalid_node_id_value_pairs{valid_node_id_value_pairs}; + invalid_node_id_value_pairs.emplace(4, Value{}); // Node #4 is null, but its descendants appear in the sub schema tree (node #5 & #10) - REQUIRE(result.has_error()); - REQUIRE((std::errc::operation_not_permitted == result.error())); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + invalid_node_id_value_pairs, + valid_node_id_value_pairs, + UtcOffset{0}, + std::errc::operation_not_permitted + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + valid_node_id_value_pairs, + invalid_node_id_value_pairs, + UtcOffset{0}, + std::errc::operation_not_permitted + )); } } SECTION("Test out-of-bound node ID") { KeyValuePairLogEvent::NodeIdValuePairs node_id_value_pairs_out_of_bound; node_id_value_pairs_out_of_bound.emplace( - static_cast(schema_tree->get_size()), + static_cast(user_gen_keys_schema_tree->get_size()), Value{} ); - auto const out_of_bound_result{KeyValuePairLogEvent::create( - schema_tree, - std::move(node_id_value_pairs_out_of_bound), - UtcOffset{0} - )}; - REQUIRE(out_of_bound_result.has_error()); - REQUIRE((std::errc::operation_not_permitted == out_of_bound_result.error())); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + node_id_value_pairs_out_of_bound, + {}, + UtcOffset{0}, + std::errc::operation_not_permitted + )); + REQUIRE(assert_kv_pair_log_event_creation_failure( + auto_gen_keys_schema_tree, + user_gen_keys_schema_tree, + {}, + node_id_value_pairs_out_of_bound, + UtcOffset{0}, + std::errc::operation_not_permitted + )); } } diff --git 
a/components/core/tests/test-ir_encoding_methods.cpp b/components/core/tests/test-ir_encoding_methods.cpp index 1ee1e3542..347dadb7a 100644 --- a/components/core/tests/test-ir_encoding_methods.cpp +++ b/components/core/tests/test-ir_encoding_methods.cpp @@ -1246,12 +1246,14 @@ TEMPLATE_TEST_CASE( auto const& deserialized_log_event{deserialized_log_events.at(idx)}; auto const num_leaves_in_json_obj{count_num_leaves(expect)}; - auto const num_kv_pairs{deserialized_log_event.get_node_id_value_pairs().size()}; + auto const num_kv_pairs{deserialized_log_event.get_user_gen_node_id_value_pairs().size()}; REQUIRE((num_leaves_in_json_obj == num_kv_pairs)); auto const serialized_json_result{deserialized_log_event.serialize_to_json()}; REQUIRE_FALSE(serialized_json_result.has_error()); - REQUIRE((expect == serialized_json_result.value())); + auto const& [auto_generated, user_generated]{serialized_json_result.value()}; + REQUIRE(auto_generated.empty()); + REQUIRE((expect == user_generated)); } auto const eof_result{deserializer.deserialize_next_ir_unit(reader)}; From 42db88c34e9336941cadaa212f1f30884fd6705c Mon Sep 17 00:00:00 2001 From: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:00:04 -0500 Subject: [PATCH 49/65] build(docs): Update dependencies to latest versions. 
(#631) --- docs/requirements.txt | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 84466dcae..dd8ca3593 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,10 +1,6 @@ -myst-parser>=2.0.0 -# Locked to avoid pydata/pydata-sphinx-theme#1676 until its fix is released in a version above -# 0.15.2 -pydata-sphinx-theme==0.14.4 -# Locked to avoid the following issue until a fix is released: -# https://github.com/sphinx-doc/sphinx/issues/13002 -sphinx==8.0.2 -sphinx_design>=0.5.0 +myst-parser>=4.0.0 +pydata-sphinx-theme>=0.16.0 +sphinx>=8.1.3 +sphinx_design>=0.6.1 sphinx-copybutton>=0.5.2 -sphinxcontrib-mermaid>=0.9.2 +sphinxcontrib-mermaid>=1.0.0 From 13c752801bf28427bfdcd9e8ab942a28e7dbbea5 Mon Sep 17 00:00:00 2001 From: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Thu, 12 Dec 2024 11:02:56 -0500 Subject: [PATCH 50/65] ci(pr-title-checks): Remove default GH workflow permissions and document risk of `pull_request_target` workflow trigger. (#633) --- .github/workflows/clp-pr-title-checks.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/clp-pr-title-checks.yaml b/.github/workflows/clp-pr-title-checks.yaml index 428e9f21d..1c8ced072 100644 --- a/.github/workflows/clp-pr-title-checks.yaml +++ b/.github/workflows/clp-pr-title-checks.yaml @@ -2,9 +2,16 @@ name: "clp-pr-title-checks" on: pull_request_target: + # NOTE: Workflows triggered by this event give the workflow access to secrets and grant the + # `GITHUB_TOKEN` read/write repository access by default. So we need to ensure: + # - This workflow doesn't inadvertently check out, build, or execute untrusted code from the + # pull request triggered by this event. + # - Each job has `permissions` set to only those necessary. 
types: ["edited", "opened", "reopened"] branches: ["main"] +permissions: {} + concurrency: group: "${{github.workflow}}-${{github.ref}}" From 8b34dac702ade914935cc5624982afba0e345efc Mon Sep 17 00:00:00 2001 From: Devin Gibson Date: Fri, 13 Dec 2024 15:40:43 -0500 Subject: [PATCH 51/65] feat(core-clp): Add `BoundedReader` to prevent out-of-bound reads in segmented input streams. (#624) Co-authored-by: haiqi96 <14502009+haiqi96@users.noreply.github.com> Co-authored-by: Lin Zhihao <59785146+LinZhihao-723@users.noreply.github.com> --- components/core/CMakeLists.txt | 3 + components/core/src/clp/BoundedReader.cpp | 43 +++++++++ components/core/src/clp/BoundedReader.hpp | 89 ++++++++++++++++++ components/core/src/clp/StringReader.cpp | 4 + components/core/tests/test-BoundedReader.cpp | 99 ++++++++++++++++++++ 5 files changed, 238 insertions(+) create mode 100644 components/core/src/clp/BoundedReader.cpp create mode 100644 components/core/src/clp/BoundedReader.hpp create mode 100644 components/core/tests/test-BoundedReader.cpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index f15d14405..7509efebd 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -352,6 +352,8 @@ set(SOURCE_FILES_unitTest src/clp/aws/AwsAuthenticationSigner.cpp src/clp/aws/AwsAuthenticationSigner.hpp src/clp/aws/constants.hpp + src/clp/BoundedReader.cpp + src/clp/BoundedReader.hpp src/clp/BufferedFileReader.cpp src/clp/BufferedFileReader.hpp src/clp/BufferReader.cpp @@ -571,6 +573,7 @@ set(SOURCE_FILES_unitTest submodules/sqlite3/sqlite3ext.h tests/LogSuppressor.hpp tests/test-Array.cpp + tests/test-BoundedReader.cpp tests/test-BufferedFileReader.cpp tests/test-clp_s-end_to_end.cpp tests/test-EncodedVariableInterpreter.cpp diff --git a/components/core/src/clp/BoundedReader.cpp b/components/core/src/clp/BoundedReader.cpp new file mode 100644 index 000000000..9bca08f71 --- /dev/null +++ b/components/core/src/clp/BoundedReader.cpp @@ -0,0 
+1,43 @@ +#include "BoundedReader.hpp" + +#include + +#include "ErrorCode.hpp" + +namespace clp { +auto BoundedReader::try_seek_from_begin(size_t pos) -> ErrorCode { + auto const next_pos = pos > m_bound ? m_bound : pos; + if (auto const rc = m_reader->try_seek_from_begin(next_pos); ErrorCode_Success != rc) { + m_curr_pos = ErrorCode_EndOfFile == rc ? next_pos : m_curr_pos; + return rc; + } + m_curr_pos = next_pos; + if (m_curr_pos >= m_bound) { + return ErrorCode_EndOfFile; + } + return ErrorCode_Success; +} + +auto BoundedReader::try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) + -> ErrorCode { + if (m_curr_pos == m_bound) { + num_bytes_read = 0; + return ErrorCode_EndOfFile; + } + + if ((m_curr_pos + num_bytes_to_read) > m_bound) { + num_bytes_to_read = m_bound - m_curr_pos; + } + + auto const rc = m_reader->try_read(buf, num_bytes_to_read, num_bytes_read); + m_curr_pos += num_bytes_read; + if (ErrorCode_EndOfFile == rc) { + if (0 == num_bytes_read) { + return ErrorCode_EndOfFile; + } + } else if (ErrorCode_Success != rc) { + return rc; + } + return ErrorCode_Success; +} +} // namespace clp diff --git a/components/core/src/clp/BoundedReader.hpp b/components/core/src/clp/BoundedReader.hpp new file mode 100644 index 000000000..cfcb07422 --- /dev/null +++ b/components/core/src/clp/BoundedReader.hpp @@ -0,0 +1,89 @@ +#ifndef CLP_BOUNDEDREADER_HPP +#define CLP_BOUNDEDREADER_HPP + +#include +#include + +#include "ErrorCode.hpp" +#include "ReaderInterface.hpp" + +namespace clp { +/** + * BoundedReader is a ReaderInterface designed to wrap other ReaderInterfaces and prevent users + * from reading or seeking beyond a certain point in the underlying input stream. + * + * This is useful when the underlying input stream is divided into several logical segments and we + * want to prevent a reader for an earlier segment consuming any bytes from a later segment. 
In + * particular, reading part of a later segment may force the reader for that later segment to seek + * backwards, which can be either inefficient or impossible for certain kinds of input streams. + */ +class BoundedReader : public ReaderInterface { +public: + // Constructor + explicit BoundedReader(ReaderInterface* reader, size_t bound) + : m_reader{reader}, + m_bound{bound} { + if (nullptr == m_reader) { + throw ReaderInterface::OperationFailed(ErrorCode_BadParam, __FILE__, __LINE__); + } + m_curr_pos = m_reader->get_pos(); + if (m_curr_pos > m_bound) { + throw ReaderInterface::OperationFailed(ErrorCode_BadParam, __FILE__, __LINE__); + } + } + + // Methods implementing the ReaderInterface + /** + * Tries to get the current position of the read head in the underlying reader. + * @param pos Returns the position of the underlying reader's head + * @return ErrorCode_Success on success + * @return ErrorCode_errno on failure + */ + [[nodiscard]] auto try_get_pos(size_t& pos) -> ErrorCode override { + return m_reader->try_get_pos(pos); + } + + /** + * Tries to seek to the given position, limited by the bound. + * @param pos + * @return ErrorCode_Success on success + * @return ErrorCode_EndOfFile on EOF or if trying to seek beyond the checkpoint + * @return ErrorCode_errno on failure + */ + [[nodiscard]] auto try_seek_from_begin(size_t pos) -> ErrorCode override; + + /** + * Tries to read up to a given number of bytes from the file, limited by the bound. 
+ * @param buf + * @param num_bytes_to_read The number of bytes to try and read + * @param num_bytes_read The actual number of bytes read + * @return ErrorCode_errno on error + * @return ErrorCode_EndOfFile on EOF or trying to read after hitting checkpoint + * @return ErrorCode_Success on success + */ + [[nodiscard]] auto + try_read(char* buf, size_t num_bytes_to_read, size_t& num_bytes_read) -> ErrorCode override; + + /** + * This function is unsupported because BoundedReader can not delegate to a potentially + * efficient implementation in the underlying reader, as the underlying reader's implementation + * will not respect the bound. + * @return ErrorCode_Unsupported + */ + [[nodiscard]] auto try_read_to_delimiter( + [[maybe_unused]] char delim, + [[maybe_unused]] bool keep_delimiter, + [[maybe_unused]] bool append, + [[maybe_unused]] std::string& str + ) -> ErrorCode override { + return ErrorCode_Unsupported; + } + +private: + ReaderInterface* m_reader{nullptr}; + size_t m_bound{}; + size_t m_curr_pos{}; +}; +} // namespace clp + +#endif // CLP_BOUNDEDREADER_HPP diff --git a/components/core/src/clp/StringReader.cpp b/components/core/src/clp/StringReader.cpp index 9fa2c27d3..8dd0a3793 100644 --- a/components/core/src/clp/StringReader.cpp +++ b/components/core/src/clp/StringReader.cpp @@ -41,6 +41,10 @@ ErrorCode StringReader::try_read(char* buf, size_t num_bytes_to_read, size_t& nu } ErrorCode StringReader::try_seek_from_begin(size_t pos) { + if (pos > input_string.size()) { + this->pos = input_string.size(); + return ErrorCode_EndOfFile; + } this->pos = pos; return ErrorCode_Success; } diff --git a/components/core/tests/test-BoundedReader.cpp b/components/core/tests/test-BoundedReader.cpp new file mode 100644 index 000000000..9d1a9d2c0 --- /dev/null +++ b/components/core/tests/test-BoundedReader.cpp @@ -0,0 +1,99 @@ +#include +#include +#include +#include + +#include + +#include "../src/clp/BoundedReader.hpp" +#include "../src/clp/ErrorCode.hpp" +#include 
"../src/clp/StringReader.hpp" + +TEST_CASE("Test Bounded Reader", "[BoundedReader]") { + constexpr std::string_view cTestString{"0123456789"}; + + SECTION("BoundedReader does not support try_read_to_delimiter") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, cTestString.size()}; + std::string tmp; + REQUIRE(clp::ErrorCode_Unsupported + == bounded_reader.try_read_to_delimiter('0', false, false, tmp)); + } + + SECTION("BoundedReader does not allow reads beyond end of underlying stream.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, cTestString.size() + 1}; + std::array buf{}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_read(buf.data(), cTestString.size() + 1, num_bytes_read); + REQUIRE(clp::ErrorCode_Success == rc); + REQUIRE(num_bytes_read == cTestString.size()); + REQUIRE(cTestString.size() == string_reader.get_pos()); + REQUIRE(cTestString.size() == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does not allow reads beyond checkpoint.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, 1}; + std::array buf{}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_read(buf.data(), cTestString.size(), num_bytes_read); + REQUIRE(clp::ErrorCode_Success == rc); + REQUIRE(1 == num_bytes_read); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + rc = bounded_reader.try_read(buf.data(), 1, num_bytes_read); + REQUIRE(clp::ErrorCode_EndOfFile == rc); + REQUIRE(0 == num_bytes_read); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does allow reads before checkpoint.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader 
bounded_reader{&string_reader, 1}; + char buf{}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_read(&buf, 1, num_bytes_read); + REQUIRE(clp::ErrorCode_Success == rc); + REQUIRE(1 == num_bytes_read); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does not allow seeks beyond end of underlying stream.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, cTestString.size() + 1}; + auto rc = bounded_reader.try_seek_from_begin(cTestString.size() + 1); + REQUIRE(clp::ErrorCode_EndOfFile == rc); + REQUIRE(cTestString.size() == string_reader.get_pos()); + REQUIRE(cTestString.size() == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does not allow seeks beyond checkpoint.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, 1}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_seek_from_begin(cTestString.size()); + REQUIRE(clp::ErrorCode_EndOfFile == rc); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + } + + SECTION("BoundedReader does allow seeks before checkpoint.") { + clp::StringReader string_reader; + string_reader.open(std::string{cTestString}); + clp::BoundedReader bounded_reader{&string_reader, 2}; + size_t num_bytes_read{}; + auto rc = bounded_reader.try_seek_from_begin(1); + REQUIRE(clp::ErrorCode_Success == rc); + REQUIRE(1 == string_reader.get_pos()); + REQUIRE(1 == bounded_reader.get_pos()); + } +} From 02d8956db32ffcf2978ce85bd38c3b7522e583fe Mon Sep 17 00:00:00 2001 From: Devin Gibson Date: Mon, 16 Dec 2024 12:57:47 -0500 Subject: [PATCH 52/65] build(core): Update Boost to v1.87.0 in order to pull in boost::urls; Replace calls to boost::asio's deprecated `expires_from_now` with `expires_after`. 
(#636) --- components/core/CMakeLists.txt | 2 +- components/core/src/reducer/reducer_server.cpp | 7 +++---- .../centos-stream-9/install-packages-from-source.sh | 2 +- components/core/tools/scripts/lib_install/install-boost.sh | 2 +- .../ubuntu-focal/install-packages-from-source.sh | 2 +- .../ubuntu-jammy/install-packages-from-source.sh | 3 +++ .../lib_install/ubuntu-jammy/install-prebuilt-packages.sh | 3 --- 7 files changed, 10 insertions(+), 11 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 7509efebd..0995a0afb 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -101,7 +101,7 @@ endif() if(CLP_USE_STATIC_LIBS) set(Boost_USE_STATIC_LIBS ON) endif() -find_package(Boost 1.74 REQUIRED iostreams program_options filesystem system regex) +find_package(Boost 1.81 REQUIRED iostreams program_options filesystem system regex url) if(Boost_FOUND) message(STATUS "Found Boost ${Boost_VERSION}") else() diff --git a/components/core/src/reducer/reducer_server.cpp b/components/core/src/reducer/reducer_server.cpp index ab35b7396..a243c763c 100644 --- a/components/core/src/reducer/reducer_server.cpp +++ b/components/core/src/reducer/reducer_server.cpp @@ -121,7 +121,7 @@ void PeriodicUpsertTask::operator()([[maybe_unused]] boost::system::error_code c } auto& upsert_timer = m_server_ctx->get_upsert_timer(); - upsert_timer.expires_from_now(std::chrono::milliseconds(m_server_ctx->get_upsert_interval())); + upsert_timer.expires_after(std::chrono::milliseconds(m_server_ctx->get_upsert_interval())); upsert_timer.async_wait(PeriodicUpsertTask(m_server_ctx)); } @@ -205,9 +205,8 @@ void SchedulerUpdateListenerTask::operator()( if (m_server_ctx->is_timeline_aggregation()) { auto& upsert_timer = m_server_ctx->get_upsert_timer(); - upsert_timer.expires_from_now( - std::chrono::milliseconds(m_server_ctx->get_upsert_interval()) - ); + 
upsert_timer.expires_after(std::chrono::milliseconds(m_server_ctx->get_upsert_interval() + )); upsert_timer.async_wait(PeriodicUpsertTask(m_server_ctx)); } diff --git a/components/core/tools/scripts/lib_install/centos-stream-9/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/centos-stream-9/install-packages-from-source.sh index f2965f9fd..e6b6b3579 100755 --- a/components/core/tools/scripts/lib_install/centos-stream-9/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/centos-stream-9/install-packages-from-source.sh @@ -10,7 +10,7 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" lib_install_scripts_dir="${script_dir}/.." # NOTE: The remaining installation scripts depend on boost, so we install it beforehand. -"${lib_install_scripts_dir}/install-boost.sh" 1.76.0 +"${lib_install_scripts_dir}/install-boost.sh" 1.87.0 "${lib_install_scripts_dir}/fmtlib.sh" 8.0.1 "${lib_install_scripts_dir}/spdlog.sh" 1.9.2 diff --git a/components/core/tools/scripts/lib_install/install-boost.sh b/components/core/tools/scripts/lib_install/install-boost.sh index 9e5f9a1c5..40232caf8 100755 --- a/components/core/tools/scripts/lib_install/install-boost.sh +++ b/components/core/tools/scripts/lib_install/install-boost.sh @@ -34,7 +34,7 @@ tar xzf ${tar_filename} cd boost_${version_with_underscores} # Build -./bootstrap.sh --with-libraries=filesystem,iostreams,program_options,regex,system +./bootstrap.sh --with-libraries=filesystem,iostreams,program_options,regex,system,url ./b2 -j${num_cpus} # Install diff --git a/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh index 10a2b0482..839f6d3c3 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-focal/install-packages-from-source.sh @@ 
-10,7 +10,7 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" lib_install_scripts_dir=$script_dir/.. # NOTE: boost must be installed first since the remaining packages depend on it -"$lib_install_scripts_dir"/install-boost.sh 1.74.0 +"$lib_install_scripts_dir"/install-boost.sh 1.87.0 "$lib_install_scripts_dir"/fmtlib.sh 8.0.1 "$lib_install_scripts_dir"/libarchive.sh 3.5.1 diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh index 97aaf7093..839f6d3c3 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-packages-from-source.sh @@ -9,6 +9,9 @@ set -u script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" lib_install_scripts_dir=$script_dir/.. +# NOTE: boost must be installed first since the remaining packages depend on it +"$lib_install_scripts_dir"/install-boost.sh 1.87.0 + "$lib_install_scripts_dir"/fmtlib.sh 8.0.1 "$lib_install_scripts_dir"/libarchive.sh 3.5.1 "$lib_install_scripts_dir"/liblzma.sh 5.4.6 diff --git a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh index ca1f5f59e..ea055ffdf 100755 --- a/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh +++ b/components/core/tools/scripts/lib_install/ubuntu-jammy/install-prebuilt-packages.sh @@ -15,9 +15,6 @@ DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ build-essential \ git \ jq \ - libboost-filesystem-dev \ - libboost-iostreams-dev \ - libboost-program-options-dev \ libcurl4 \ libcurl4-openssl-dev \ liblzma-dev \ From da66fbf833eff06aec0feb1c20706a9c766b7e2f Mon Sep 17 00:00:00 2001 From: Devin Gibson Date: Mon, 16 Dec 2024 13:00:39 
-0500 Subject: [PATCH 53/65] refactor(clp-s): Replace instances of `std::string const&` with `std::string_view` where it would remove unnecessary conversions to and from `std::string`. (#635) Co-authored-by: haiqi96 <14502009+haiqi96@users.noreply.github.com> --- components/core/src/clp_s/ArchiveWriter.hpp | 8 +++--- components/core/src/clp_s/ParsedMessage.hpp | 10 +++++++ .../src/clp_s/TimestampDictionaryWriter.cpp | 10 ++++--- .../src/clp_s/TimestampDictionaryWriter.hpp | 10 ++++--- components/core/src/clp_s/TimestampEntry.hpp | 3 +- .../core/src/clp_s/TimestampPattern.cpp | 28 +++++++++---------- .../core/src/clp_s/TimestampPattern.hpp | 6 ++-- .../core/src/clp_s/search/StringLiteral.hpp | 16 ++--------- 8 files changed, 49 insertions(+), 42 deletions(-) diff --git a/components/core/src/clp_s/ArchiveWriter.hpp b/components/core/src/clp_s/ArchiveWriter.hpp index 3b13f4426..82a0122bc 100644 --- a/components/core/src/clp_s/ArchiveWriter.hpp +++ b/components/core/src/clp_s/ArchiveWriter.hpp @@ -122,9 +122,9 @@ class ArchiveWriter { * @return the epoch time corresponding to the string timestamp */ epochtime_t ingest_timestamp_entry( - std::string const& key, + std::string_view key, int32_t node_id, - std::string const& timestamp, + std::string_view timestamp, uint64_t& pattern_id ) { return m_timestamp_dict.ingest_entry(key, node_id, timestamp, pattern_id); @@ -136,11 +136,11 @@ class ArchiveWriter { * @param node_id * @param timestamp */ - void ingest_timestamp_entry(std::string const& key, int32_t node_id, double timestamp) { + void ingest_timestamp_entry(std::string_view key, int32_t node_id, double timestamp) { m_timestamp_dict.ingest_entry(key, node_id, timestamp); } - void ingest_timestamp_entry(std::string const& key, int32_t node_id, int64_t timestamp) { + void ingest_timestamp_entry(std::string_view key, int32_t node_id, int64_t timestamp) { m_timestamp_dict.ingest_entry(key, node_id, timestamp); } diff --git a/components/core/src/clp_s/ParsedMessage.hpp 
b/components/core/src/clp_s/ParsedMessage.hpp index c843e2b7b..c1b6d7a35 100644 --- a/components/core/src/clp_s/ParsedMessage.hpp +++ b/components/core/src/clp_s/ParsedMessage.hpp @@ -1,8 +1,10 @@ #ifndef CLP_S_PARSEDMESSAGE_HPP #define CLP_S_PARSEDMESSAGE_HPP +#include #include #include +#include #include #include @@ -34,6 +36,10 @@ class ParsedMessage { m_message.emplace(node_id, value); } + inline void add_value(int32_t node_id, std::string_view value) { + m_message.emplace(node_id, std::string{value}); + } + /** * Adds a timestamp value and its encoding to the message for a given MST node ID. * @param node_id @@ -55,6 +61,10 @@ class ParsedMessage { m_unordered_message.emplace_back(value); } + inline void add_unordered_value(std::string_view value) { + m_unordered_message.emplace_back(std::string{value}); + } + /** * Clears the message */ diff --git a/components/core/src/clp_s/TimestampDictionaryWriter.cpp b/components/core/src/clp_s/TimestampDictionaryWriter.cpp index 39e66a6af..952bc36db 100644 --- a/components/core/src/clp_s/TimestampDictionaryWriter.cpp +++ b/components/core/src/clp_s/TimestampDictionaryWriter.cpp @@ -1,6 +1,8 @@ #include "TimestampDictionaryWriter.hpp" +#include #include +#include #include "Utils.hpp" @@ -42,9 +44,9 @@ uint64_t TimestampDictionaryWriter::get_pattern_id(TimestampPattern const* patte } epochtime_t TimestampDictionaryWriter::ingest_entry( - std::string const& key, + std::string_view key, int32_t node_id, - std::string const& timestamp, + std::string_view timestamp, uint64_t& pattern_id ) { epochtime_t ret; @@ -88,7 +90,7 @@ epochtime_t TimestampDictionaryWriter::ingest_entry( } void TimestampDictionaryWriter::ingest_entry( - std::string const& key, + std::string_view key, int32_t node_id, double timestamp ) { @@ -103,7 +105,7 @@ void TimestampDictionaryWriter::ingest_entry( } void TimestampDictionaryWriter::ingest_entry( - std::string const& key, + std::string_view key, int32_t node_id, int64_t timestamp ) { diff --git 
a/components/core/src/clp_s/TimestampDictionaryWriter.hpp b/components/core/src/clp_s/TimestampDictionaryWriter.hpp index 29288fd48..7c214a39e 100644 --- a/components/core/src/clp_s/TimestampDictionaryWriter.hpp +++ b/components/core/src/clp_s/TimestampDictionaryWriter.hpp @@ -1,9 +1,11 @@ #ifndef CLP_S_TIMESTAMPDICTIONARYWRITER_HPP #define CLP_S_TIMESTAMPDICTIONARYWRITER_HPP +#include #include #include #include +#include #include #include @@ -47,9 +49,9 @@ class TimestampDictionaryWriter { * @return the epoch time corresponding to the string timestamp */ epochtime_t ingest_entry( - std::string const& key, + std::string_view key, int32_t node_id, - std::string const& timestamp, + std::string_view timestamp, uint64_t& pattern_id ); @@ -59,9 +61,9 @@ class TimestampDictionaryWriter { * @param node_id * @param timestamp */ - void ingest_entry(std::string const& key, int32_t node_id, double timestamp); + void ingest_entry(std::string_view key, int32_t node_id, double timestamp); - void ingest_entry(std::string const& key, int32_t node_id, int64_t timestamp); + void ingest_entry(std::string_view key, int32_t node_id, int64_t timestamp); /** * TODO: guarantee epoch milliseconds. 
The current clp-s approach to encoding timestamps and diff --git a/components/core/src/clp_s/TimestampEntry.hpp b/components/core/src/clp_s/TimestampEntry.hpp index 326ed9d73..47a26fd9e 100644 --- a/components/core/src/clp_s/TimestampEntry.hpp +++ b/components/core/src/clp_s/TimestampEntry.hpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -43,7 +44,7 @@ class TimestampEntry { m_epoch_start(cEpochTimeMax), m_epoch_end(cEpochTimeMin) {} - TimestampEntry(std::string const& key_name) + TimestampEntry(std::string_view key_name) : m_encoding(UnkownTimestampEncoding), m_epoch_start_double(cDoubleEpochTimeMax), m_epoch_end_double(cDoubleEpochTimeMin), diff --git a/components/core/src/clp_s/TimestampPattern.cpp b/components/core/src/clp_s/TimestampPattern.cpp index 4ddb5648e..11fab3480 100644 --- a/components/core/src/clp_s/TimestampPattern.cpp +++ b/components/core/src/clp_s/TimestampPattern.cpp @@ -4,6 +4,8 @@ #include #include +#include +#include #include #include @@ -12,6 +14,7 @@ using clp::string_utils::convert_string_to_int; using std::string; +using std::string_view; using std::to_string; using std::vector; @@ -71,7 +74,7 @@ append_padded_value_notz(int value, char padding_character, size_t max_length, s * @return true if conversion succeeds, false otherwise */ static bool convert_string_to_number( - string const& str, + string_view str, size_t begin_ix, size_t end_ix, char padding_character, @@ -89,7 +92,7 @@ static bool convert_string_to_number( * @return true if conversion succeeds, false otherwise */ static bool convert_string_to_number_notz( - string const& str, + string_view str, size_t max_digits, size_t begin_ix, size_t& end_ix, @@ -125,7 +128,7 @@ append_padded_value_notz(int value, char padding_character, size_t max_length, s } static bool convert_string_to_number( - string const& str, + string_view str, size_t begin_ix, size_t end_ix, char padding_character, @@ -154,7 +157,7 @@ static bool convert_string_to_number( } static bool 
convert_string_to_number_notz( - string const& str, + string_view str, size_t max_digits, size_t begin_ix, size_t& end_ix, @@ -306,7 +309,7 @@ void TimestampPattern::init() { } TimestampPattern const* TimestampPattern::search_known_ts_patterns( - string const& line, + string_view line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos @@ -342,7 +345,7 @@ void TimestampPattern::clear() { } bool TimestampPattern::parse_timestamp( - string const& line, + string_view line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos @@ -827,23 +830,20 @@ bool TimestampPattern::parse_timestamp( } auto dot_position = line.find('.'); auto nanosecond_start = dot_position + 1; - if (std::string::npos == dot_position || 0 == dot_position + if (string::npos == dot_position || 0 == dot_position || cNanosecondDigits != (line.length() - nanosecond_start)) { return false; } - auto timestamp_view = std::string_view(line); - if (false - == convert_string_to_int(timestamp_view.substr(0, dot_position), timestamp)) - { + if (false == convert_string_to_int(line.substr(0, dot_position), timestamp)) { return false; } epochtime_t timestamp_nanoseconds; if (false == convert_string_to_int( - timestamp_view.substr(nanosecond_start, cNanosecondDigits), + line.substr(nanosecond_start, cNanosecondDigits), timestamp_nanoseconds )) { @@ -1070,14 +1070,14 @@ void TimestampPattern::insert_formatted_timestamp(epochtime_t timestamp, string& case 'E': // UNIX epoch milliseconds // Note: this timestamp format is required to make up the entire timestamp, so // this is safe - new_msg = std::to_string(timestamp); + new_msg = to_string(timestamp); break; case 'F': { // Nanosecond precision floating point UNIX epoch timestamp constexpr auto cNanosecondDigits = 9; // Note: this timestamp format is required to make up the entire timestamp, so // this is safe - new_msg = std::to_string(timestamp); + new_msg = to_string(timestamp); new_msg.insert(new_msg.end() - 
cNanosecondDigits, '.'); break; } diff --git a/components/core/src/clp_s/TimestampPattern.hpp b/components/core/src/clp_s/TimestampPattern.hpp index 9219d33bb..278bb82e1 100644 --- a/components/core/src/clp_s/TimestampPattern.hpp +++ b/components/core/src/clp_s/TimestampPattern.hpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include "Defs.hpp" @@ -83,7 +85,7 @@ class TimestampPattern { * @return pointer to the timestamp pattern if found, nullptr otherwise */ static TimestampPattern const* search_known_ts_patterns( - std::string const& line, + std::string_view line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos @@ -121,7 +123,7 @@ class TimestampPattern { * @return true if parsed successfully, false otherwise */ bool parse_timestamp( - std::string const& line, + std::string_view line, epochtime_t& timestamp, size_t& timestamp_begin_pos, size_t& timestamp_end_pos diff --git a/components/core/src/clp_s/search/StringLiteral.hpp b/components/core/src/clp_s/search/StringLiteral.hpp index 4ac6b9f2f..67c902a29 100644 --- a/components/core/src/clp_s/search/StringLiteral.hpp +++ b/components/core/src/clp_s/search/StringLiteral.hpp @@ -4,6 +4,7 @@ #include #include +#include "../Utils.hpp" #include "Literal.hpp" namespace clp_s::search { @@ -68,19 +69,8 @@ class StringLiteral : public Literal { m_string_type = LiteralType::VarStringT; } - // If '?' and '*' are not escaped, we add LiteralType::ClpStringT to m_string_type - bool escape = false; - for (char const c : m_v) { - if ('\\' == c) { - escape = !escape; - } else if ('?' 
== c || '*' == c) { - if (false == escape) { - m_string_type |= LiteralType::ClpStringT; - break; - } - } else { - escape = false; - } + if (StringUtils::has_unescaped_wildcards(m_v)) { + m_string_type |= LiteralType::ClpStringT; } } }; From b7741c079b6f8dfe05343b055b6d84b5b019f19e Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Tue, 17 Dec 2024 02:10:21 +0800 Subject: [PATCH 54/65] docs(core): Indicate dependency install scripts should be run with elevated privileges. (#637) --- .../dev-guide/components-core/centos-stream-9-deps-install.md | 2 +- docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md | 2 +- docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/dev-guide/components-core/centos-stream-9-deps-install.md b/docs/src/dev-guide/components-core/centos-stream-9-deps-install.md index 654b9bf5a..1bc90910a 100644 --- a/docs/src/dev-guide/components-core/centos-stream-9-deps-install.md +++ b/docs/src/dev-guide/components-core/centos-stream-9-deps-install.md @@ -10,7 +10,7 @@ Before you run any commands below, you should review the scripts to ensure they any dependencies or apply any configurations that you don't expect. ::: -To install all dependencies, run: +To install all dependencies, run the following with elevated privileges: :::{note} The packages built from source ([install-packages-from-source.sh][src-install-script]) are installed diff --git a/docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md b/docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md index 53ee0ecbd..776c2d43e 100644 --- a/docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md +++ b/docs/src/dev-guide/components-core/ubuntu-focal-deps-install.md @@ -10,7 +10,7 @@ Before you run any commands below, you should review the scripts to ensure they any dependencies or apply any configurations that you don't expect. 
::: -To install all dependencies, run: +To install all dependencies, run the following with elevated privileges: ```shell components/core/tools/scripts/lib_install/ubuntu-focal/install-all.sh diff --git a/docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md b/docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md index 186098446..2e5d4eb3c 100644 --- a/docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md +++ b/docs/src/dev-guide/components-core/ubuntu-jammy-deps-install.md @@ -10,7 +10,7 @@ Before you run any commands below, you should review the scripts to ensure they any dependencies or apply any configurations that you don't expect. ::: -To install all dependencies, run: +To install all dependencies, run the following with elevated privileges: ```shell components/core/tools/scripts/lib_install/ubuntu-jammy/install-all.sh From 1edc16e2f2165aafa856bdaf4d07d8a773fa8adf Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 20:23:36 -0500 Subject: [PATCH 55/65] feat(package)!: Add support for writing clp-s single file archives to S3. 
(#634) Co-authored-by: kirkrodrigues <2454684+kirkrodrigues@users.noreply.github.com> --- .../clp_package_utils/general.py | 26 ++- .../clp_package_utils/scripts/decompress.py | 14 +- .../clp_package_utils/scripts/del_archives.py | 7 + .../scripts/native/decompress.py | 4 +- .../scripts/native/del_archives.py | 2 +- .../clp_package_utils/scripts/search.py | 6 + .../clp_package_utils/scripts/start_clp.py | 45 ++--- .../clp-py-utils/clp_py_utils/clp_config.py | 154 +++++++++++++++-- .../initialize-orchestration-db.py | 2 +- .../clp-py-utils/clp_py_utils/s3_utils.py | 51 ++++++ components/clp-py-utils/pyproject.toml | 2 + .../executor/compress/fs_compression_task.py | 163 ++++++++++++------ .../executor/query/extract_stream_task.py | 47 +++-- .../executor/query/fs_search_task.py | 39 +++-- .../job_orchestration/executor/query/utils.py | 5 +- .../job_orchestration/executor/utils.py | 23 +++ .../compress/compression_scheduler.py | 22 +-- .../package-template/src/etc/clp-config.yml | 4 +- 18 files changed, 470 insertions(+), 146 deletions(-) create mode 100644 components/clp-py-utils/clp_py_utils/s3_utils.py create mode 100644 components/job-orchestration/job_orchestration/executor/utils.py diff --git a/components/clp-package-utils/clp_package_utils/general.py b/components/clp-package-utils/clp_package_utils/general.py index 5fae8166f..60f1053f8 100644 --- a/components/clp-package-utils/clp_package_utils/general.py +++ b/components/clp-package-utils/clp_package_utils/general.py @@ -20,7 +20,9 @@ REDIS_COMPONENT_NAME, REDUCER_COMPONENT_NAME, RESULTS_CACHE_COMPONENT_NAME, + StorageType, WEBUI_COMPONENT_NAME, + WorkerConfig, ) from clp_py_utils.core import ( get_config_value, @@ -239,17 +241,17 @@ def generate_container_config( DockerMountType.BIND, clp_config.logs_directory, container_clp_config.logs_directory ) - container_clp_config.archive_output.directory = pathlib.Path("/") / "mnt" / "archive-output" + 
container_clp_config.archive_output.set_directory(pathlib.Path("/") / "mnt" / "archive-output") if not is_path_already_mounted( clp_home, CONTAINER_CLP_HOME, - clp_config.archive_output.directory, - container_clp_config.archive_output.directory, + clp_config.archive_output.get_directory(), + container_clp_config.archive_output.get_directory(), ): docker_mounts.archives_output_dir = DockerMount( DockerMountType.BIND, - clp_config.archive_output.directory, - container_clp_config.archive_output.directory, + clp_config.archive_output.get_directory(), + container_clp_config.archive_output.get_directory(), ) container_clp_config.stream_output.directory = pathlib.Path("/") / "mnt" / "stream-output" @@ -268,6 +270,18 @@ def generate_container_config( return container_clp_config, docker_mounts +def generate_worker_config(clp_config: CLPConfig) -> WorkerConfig: + worker_config = WorkerConfig() + worker_config.package = clp_config.package.copy(deep=True) + worker_config.archive_output = clp_config.archive_output.copy(deep=True) + worker_config.data_directory = clp_config.data_directory + + worker_config.stream_output_dir = clp_config.stream_output.directory + worker_config.stream_collection_name = clp_config.results_cache.stream_collection_name + + return worker_config + + def dump_container_config( container_clp_config: CLPConfig, clp_config: CLPConfig, container_name: str ) -> Tuple[pathlib.Path, pathlib.Path]: @@ -482,7 +496,7 @@ def validate_results_cache_config( def validate_worker_config(clp_config: CLPConfig): clp_config.validate_input_logs_dir() - clp_config.validate_archive_output_dir() + clp_config.validate_archive_output_config() clp_config.validate_stream_output_dir() diff --git a/components/clp-package-utils/clp_package_utils/scripts/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/decompress.py index 325f2add6..092c339a6 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/decompress.py +++ 
b/components/clp-package-utils/clp_package_utils/scripts/decompress.py @@ -5,7 +5,7 @@ import sys from typing import Optional -from clp_py_utils.clp_config import CLPConfig +from clp_py_utils.clp_config import CLPConfig, StorageType from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, @@ -81,6 +81,11 @@ def handle_extract_file_cmd( if clp_config is None: return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"File extraction is not supported for archive storage type: {storage_type}.") + return -1 + container_name = generate_container_name(str(JobType.FILE_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( @@ -156,6 +161,13 @@ def handle_extract_stream_cmd( if clp_config is None: return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error( + f"Stream extraction is not supported for archive storage type: {storage_type}." 
+ ) + return -1 + container_name = generate_container_name(str(JobType.IR_EXTRACTION)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) generated_config_path_on_container, generated_config_path_on_host = dump_container_config( diff --git a/components/clp-package-utils/clp_package_utils/scripts/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py index 54d959771..5b9bc6d97 100644 --- a/components/clp-package-utils/clp_package_utils/scripts/del_archives.py +++ b/components/clp-package-utils/clp_package_utils/scripts/del_archives.py @@ -4,6 +4,8 @@ import sys from pathlib import Path +from clp_py_utils.clp_config import StorageType + from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, dump_container_config, @@ -57,6 +59,11 @@ def main(argv): logger.exception("Failed to load config.") return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"Archive deletion is not supported for storage type: {storage_type}.") + return -1 + # Validate the input timestamp begin_ts = parsed_args.begin_ts end_ts = parsed_args.end_ts diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py index d16cdcb6f..7e3c7da6e 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/decompress.py @@ -167,7 +167,7 @@ def validate_and_load_config_file( """ try: clp_config = load_config_file(config_file_path, default_config_file_path, clp_home) - clp_config.validate_archive_output_dir() + clp_config.validate_archive_output_config() clp_config.validate_logs_dir() return clp_config except Exception: @@ -207,7 +207,7 @@ def handle_extract_file_cmd( list_path = parsed_args.files_from logs_dir = clp_config.logs_directory - archives_dir = 
clp_config.archive_output.directory + archives_dir = clp_config.archive_output.get_directory() # Generate database config file for clp db_config_file_path = logs_dir / f".decompress-db-config-{uuid.uuid4()}.yml" diff --git a/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py index 735bf299d..c489c3806 100644 --- a/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py +++ b/components/clp-package-utils/clp_package_utils/scripts/native/del_archives.py @@ -54,7 +54,7 @@ def main(argv): return -1 database_config = clp_config.database - archives_dir = clp_config.archive_output.directory + archives_dir = clp_config.archive_output.get_directory() if not archives_dir.exists(): logger.error("`archive_output.directory` doesn't exist.") return -1 diff --git a/components/clp-package-utils/clp_package_utils/scripts/search.py b/components/clp-package-utils/clp_package_utils/scripts/search.py index beb7fb0b0..38d528528 100755 --- a/components/clp-package-utils/clp_package_utils/scripts/search.py +++ b/components/clp-package-utils/clp_package_utils/scripts/search.py @@ -7,6 +7,7 @@ import uuid import yaml +from clp_py_utils.clp_config import StorageType from clp_package_utils.general import ( CLP_DEFAULT_CONFIG_FILE_RELATIVE_PATH, @@ -74,6 +75,11 @@ def main(argv): logger.exception("Failed to load config.") return -1 + storage_type = clp_config.archive_output.storage.type + if StorageType.FS != storage_type: + logger.error(f"Search is not supported for archive storage type: {storage_type}.") + return -1 + container_name = generate_container_name(str(JobType.SEARCH)) container_clp_config, mounts = generate_container_config(clp_config, clp_home) diff --git a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py index 8097929f1..6de3174ff 100755 --- 
a/components/clp-package-utils/clp_package_utils/scripts/start_clp.py +++ b/components/clp-package-utils/clp_package_utils/scripts/start_clp.py @@ -29,6 +29,7 @@ REDIS_COMPONENT_NAME, REDUCER_COMPONENT_NAME, RESULTS_CACHE_COMPONENT_NAME, + StorageType, WEBUI_COMPONENT_NAME, ) from job_orchestration.scheduler.constants import QueueName @@ -42,6 +43,7 @@ DockerMount, DockerMountType, generate_container_config, + generate_worker_config, get_clp_home, is_container_exited, is_container_running, @@ -626,6 +628,7 @@ def start_compression_worker( ): celery_method = "job_orchestration.executor.compress" celery_route = f"{QueueName.COMPRESSION}" + compression_worker_mounts = [mounts.archives_output_dir] generic_start_worker( COMPRESSION_WORKER_COMPONENT_NAME, instance_id, @@ -637,8 +640,7 @@ def start_compression_worker( clp_config.redis.compression_backend_database, num_cpus, mounts, - None, - None, + compression_worker_mounts, ) @@ -652,11 +654,9 @@ def start_query_worker( celery_method = "job_orchestration.executor.query" celery_route = f"{QueueName.QUERY}" - query_worker_mount = [mounts.stream_output_dir] - query_worker_env = { - "CLP_STREAM_OUTPUT_DIR": container_clp_config.stream_output.directory, - "CLP_STREAM_COLLECTION_NAME": clp_config.results_cache.stream_collection_name, - } + query_worker_mounts = [mounts.stream_output_dir] + if clp_config.archive_output.storage.type == StorageType.FS: + query_worker_mounts.append(mounts.archives_output_dir) generic_start_worker( QUERY_WORKER_COMPONENT_NAME, @@ -669,8 +669,7 @@ def start_query_worker( clp_config.redis.query_backend_database, num_cpus, mounts, - query_worker_env, - query_worker_mount, + query_worker_mounts, ) @@ -685,8 +684,7 @@ def generic_start_worker( redis_database: int, num_cpus: int, mounts: CLPDockerMounts, - worker_specific_env: Dict[str, Any], - worker_specific_mount: List[Optional[DockerMount]], + worker_specific_mount: Optional[List[Optional[DockerMount]]], ): logger.info(f"Starting 
{component_name}...") @@ -694,14 +692,18 @@ def generic_start_worker( if container_exists(container_name): return - validate_worker_config(clp_config) + container_config_filename = f"{container_name}.yml" + container_config_file_path = clp_config.logs_directory / container_config_filename + container_worker_config = generate_worker_config(container_clp_config) + with open(container_config_file_path, "w") as f: + yaml.safe_dump(container_worker_config.dump_to_primitive_dict(), f) logs_dir = clp_config.logs_directory / component_name logs_dir.mkdir(parents=True, exist_ok=True) container_logs_dir = container_clp_config.logs_directory / component_name # Create necessary directories - clp_config.archive_output.directory.mkdir(parents=True, exist_ok=True) + clp_config.archive_output.get_directory().mkdir(parents=True, exist_ok=True) clp_config.stream_output.directory.mkdir(parents=True, exist_ok=True) clp_site_packages_dir = CONTAINER_CLP_HOME / "lib" / "python3" / "site-packages" @@ -724,24 +726,17 @@ def generic_start_worker( f"{container_clp_config.redis.host}:{container_clp_config.redis.port}/{redis_database}" ), "-e", f"CLP_HOME={CONTAINER_CLP_HOME}", - "-e", f"CLP_DATA_DIR={container_clp_config.data_directory}", - "-e", f"CLP_ARCHIVE_OUTPUT_DIR={container_clp_config.archive_output.directory}", + "-e", f"CLP_CONFIG_PATH={container_clp_config.logs_directory / container_config_filename}", "-e", f"CLP_LOGS_DIR={container_logs_dir}", "-e", f"CLP_LOGGING_LEVEL={worker_config.logging_level}", - "-e", f"CLP_STORAGE_ENGINE={clp_config.package.storage_engine}", "-u", f"{os.getuid()}:{os.getgid()}", ] - if worker_specific_env: - for env_name, env_value in worker_specific_env.items(): - container_start_cmd.append("-e") - container_start_cmd.append(f"{env_name}={env_value}") - # fmt: on + necessary_mounts = [ mounts.clp_home, mounts.data_dir, mounts.logs_dir, - mounts.archives_output_dir, mounts.input_logs_dir, ] if worker_specific_mount: @@ -1125,6 +1120,12 @@ def main(argv): 
QUERY_WORKER_COMPONENT_NAME, ): validate_and_load_redis_credentials_file(clp_config, clp_home, True) + if target in ( + ALL_TARGET_NAME, + COMPRESSION_WORKER_COMPONENT_NAME, + QUERY_WORKER_COMPONENT_NAME, + ): + validate_worker_config(clp_config) clp_config.validate_data_dir() clp_config.validate_logs_dir() diff --git a/components/clp-py-utils/clp_py_utils/clp_config.py b/components/clp-py-utils/clp_py_utils/clp_config.py index 79a94505d..f59de7647 100644 --- a/components/clp-py-utils/clp_py_utils/clp_config.py +++ b/components/clp-py-utils/clp_py_utils/clp_config.py @@ -1,10 +1,10 @@ import pathlib -import typing from enum import auto +from typing import Literal, Optional, Union from dotenv import dotenv_values from pydantic import BaseModel, PrivateAttr, validator -from strenum import KebabCaseStrEnum +from strenum import KebabCaseStrEnum, LowercaseStrEnum from .clp_logging import get_valid_logging_level, is_valid_logging_level from .core import ( @@ -48,6 +48,11 @@ class StorageEngine(KebabCaseStrEnum): CLP_S = auto() +class StorageType(LowercaseStrEnum): + FS = auto() + S3 = auto() + + VALID_STORAGE_ENGINES = [storage_engine.value for storage_engine in StorageEngine] @@ -69,12 +74,12 @@ class Database(BaseModel): host: str = "localhost" port: int = 3306 name: str = "clp-db" - ssl_cert: typing.Optional[str] = None + ssl_cert: Optional[str] = None auto_commit: bool = False compress: bool = True - username: typing.Optional[str] = None - password: typing.Optional[str] = None + username: Optional[str] = None + password: Optional[str] = None @validator("type") def validate_database_type(cls, field): @@ -227,7 +232,7 @@ class Redis(BaseModel): query_backend_database: int = 0 compression_backend_database: int = 1 # redis can perform authentication without a username - password: typing.Optional[str] + password: Optional[str] @validator("host") def validate_host(cls, field): @@ -300,12 +305,80 @@ class Queue(BaseModel): host: str = "localhost" port: int = 5672 - 
username: typing.Optional[str] - password: typing.Optional[str] + username: Optional[str] + password: Optional[str] -class ArchiveOutput(BaseModel): +class S3Config(BaseModel): + region_code: str + bucket: str + key_prefix: str + + access_key_id: Optional[str] = None + secret_access_key: Optional[str] = None + + @validator("region_code") + def validate_region_code(cls, field): + if field == "": + raise ValueError("region_code cannot be empty") + return field + + @validator("bucket") + def validate_bucket(cls, field): + if field == "": + raise ValueError("bucket cannot be empty") + return field + + @validator("key_prefix") + def validate_key_prefix(cls, field): + if field == "": + raise ValueError("key_prefix cannot be empty") + if not field.endswith("/"): + raise ValueError('key_prefix must end with "/"') + return field + + +class FsStorage(BaseModel): + type: Literal[StorageType.FS.value] = StorageType.FS.value directory: pathlib.Path = pathlib.Path("var") / "data" / "archives" + + @validator("directory") + def validate_directory(cls, field): + if "" == field: + raise ValueError("directory cannot be empty") + return field + + def make_config_paths_absolute(self, clp_home: pathlib.Path): + self.directory = make_config_path_absolute(clp_home, self.directory) + + def dump_to_primitive_dict(self): + d = self.dict() + d["directory"] = str(d["directory"]) + return d + + +class S3Storage(BaseModel): + type: Literal[StorageType.S3.value] = StorageType.S3.value + staging_directory: pathlib.Path = pathlib.Path("var") / "data" / "staged_archives" + s3_config: S3Config + + @validator("staging_directory") + def validate_staging_directory(cls, field): + if "" == field: + raise ValueError("staging_directory cannot be empty") + return field + + def make_config_paths_absolute(self, clp_home: pathlib.Path): + self.staging_directory = make_config_path_absolute(clp_home, self.staging_directory) + + def dump_to_primitive_dict(self): + d = self.dict() + d["staging_directory"] = 
str(d["staging_directory"]) + return d + + +class ArchiveOutput(BaseModel): + storage: Union[FsStorage, S3Storage] = FsStorage() target_archive_size: int = 256 * 1024 * 1024 # 256 MB target_dictionaries_size: int = 32 * 1024 * 1024 # 32 MB target_encoded_file_size: int = 256 * 1024 * 1024 # 256 MB @@ -335,13 +408,30 @@ def validate_target_segment_size(cls, field): raise ValueError("target_segment_size must be greater than 0") return field - def make_config_paths_absolute(self, clp_home: pathlib.Path): - self.directory = make_config_path_absolute(clp_home, self.directory) + def set_directory(self, directory: pathlib.Path): + storage_config = self.storage + storage_type = storage_config.type + if StorageType.FS == storage_type: + storage_config.directory = directory + elif StorageType.S3 == storage_type: + storage_config.staging_directory = directory + else: + raise NotImplementedError(f"storage.type {storage_type} is not supported") + + def get_directory(self) -> pathlib.Path: + storage_config = self.storage + storage_type = storage_config.type + if StorageType.FS == storage_config.type: + return storage_config.directory + elif StorageType.S3 == storage_type: + return storage_config.staging_directory + else: + raise NotImplementedError(f"storage.type {storage_type} is not supported") def dump_to_primitive_dict(self): d = self.dict() # Turn directory (pathlib.Path) into a primitive string - d["directory"] = str(d["directory"]) + d["storage"] = self.storage.dump_to_primitive_dict() return d @@ -352,7 +442,7 @@ class StreamOutput(BaseModel): @validator("directory") def validate_directory(cls, field): if "" == field: - raise ValueError("directory can not be empty") + raise ValueError("directory cannot be empty") return field @validator("target_uncompressed_size") @@ -408,7 +498,7 @@ def validate_port(cls, field): class CLPConfig(BaseModel): - execution_container: typing.Optional[str] + execution_container: Optional[str] = None input_logs_directory: pathlib.Path = 
pathlib.Path("/") @@ -436,7 +526,7 @@ class CLPConfig(BaseModel): def make_config_paths_absolute(self, clp_home: pathlib.Path): self.input_logs_directory = make_config_path_absolute(clp_home, self.input_logs_directory) self.credentials_file_path = make_config_path_absolute(clp_home, self.credentials_file_path) - self.archive_output.make_config_paths_absolute(clp_home) + self.archive_output.storage.make_config_paths_absolute(clp_home) self.stream_output.make_config_paths_absolute(clp_home) self.data_directory = make_config_path_absolute(clp_home, self.data_directory) self.logs_directory = make_config_path_absolute(clp_home, self.logs_directory) @@ -451,11 +541,19 @@ def validate_input_logs_dir(self): if not input_logs_dir.is_dir(): raise ValueError(f"input_logs_directory '{input_logs_dir}' is not a directory.") - def validate_archive_output_dir(self): + def validate_archive_output_config(self): + if ( + StorageType.S3 == self.archive_output.storage.type + and StorageEngine.CLP_S != self.package.storage_engine + ): + raise ValueError( + f"archive_output.storage.type = 's3' is only supported with package.storage_engine" + f" = '{StorageEngine.CLP_S}'" + ) try: - validate_path_could_be_dir(self.archive_output.directory) + validate_path_could_be_dir(self.archive_output.get_directory()) except ValueError as ex: - raise ValueError(f"archive_output.directory is invalid: {ex}") + raise ValueError(f"archive_output.storage's directory is invalid: {ex}") def validate_stream_output_dir(self): try: @@ -537,3 +635,23 @@ def dump_to_primitive_dict(self): d["data_directory"] = str(self.data_directory) d["logs_directory"] = str(self.logs_directory) return d + + +class WorkerConfig(BaseModel): + package: Package = Package() + archive_output: ArchiveOutput = ArchiveOutput() + data_directory: pathlib.Path = CLPConfig().data_directory + + # Only needed by query workers. 
+ stream_output_dir: pathlib.Path = StreamOutput().directory + stream_collection_name: str = ResultsCache().stream_collection_name + + def dump_to_primitive_dict(self): + d = self.dict() + d["archive_output"] = self.archive_output.dump_to_primitive_dict() + + # Turn paths into primitive strings + d["data_directory"] = str(self.data_directory) + d["stream_output_dir"] = str(self.stream_output_dir) + + return d diff --git a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py index 1ed727367..2c8133e8a 100644 --- a/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py +++ b/components/clp-py-utils/clp_py_utils/initialize-orchestration-db.py @@ -52,7 +52,7 @@ def main(argv): CREATE TABLE IF NOT EXISTS `{COMPRESSION_JOBS_TABLE_NAME}` ( `id` INT NOT NULL AUTO_INCREMENT, `status` INT NOT NULL DEFAULT '{CompressionJobStatus.PENDING}', - `status_msg` VARCHAR(255) NOT NULL DEFAULT '', + `status_msg` VARCHAR(512) NOT NULL DEFAULT '', `creation_time` DATETIME(3) NOT NULL DEFAULT CURRENT_TIMESTAMP(3), `start_time` DATETIME(3) NULL DEFAULT NULL, `duration` FLOAT NULL DEFAULT NULL, diff --git a/components/clp-py-utils/clp_py_utils/s3_utils.py b/components/clp-py-utils/clp_py_utils/s3_utils.py new file mode 100644 index 000000000..03717a445 --- /dev/null +++ b/components/clp-py-utils/clp_py_utils/s3_utils.py @@ -0,0 +1,51 @@ +from pathlib import Path + +import boto3 +from botocore.config import Config +from botocore.exceptions import ClientError +from result import Err, Ok, Result + +from clp_py_utils.clp_config import S3Config + + +def s3_put( + s3_config: S3Config, src_file: Path, dest_file_name: str, total_max_attempts: int = 3 +) -> Result[bool, str]: + """ + Uploads a local file to an S3 bucket using AWS's PutObject operation. + :param s3_config: S3 configuration specifying the upload destination and credentials. + :param src_file: Local file to upload. 
+ :param dest_file_name: The name for the uploaded file in the S3 bucket. + :param total_max_attempts: Maximum number of retry attempts for the upload. + :return: Result.OK(bool) on success, or Result.Err(str) with the error message otherwise. + """ + if not src_file.exists(): + return Err(f"{src_file} doesn't exist") + if not src_file.is_file(): + return Err(f"{src_file} is not a file") + if src_file.stat().st_size > 5 * 1024 * 1024 * 1024: + return Err(f"{src_file} is larger than the limit (5GiB) for a single PutObject operation.") + + config = Config(retries=dict(total_max_attempts=total_max_attempts, mode="adaptive")) + + my_s3_client = boto3.client( + "s3", + region_name=s3_config.region_code, + aws_access_key_id=s3_config.access_key_id, + aws_secret_access_key=s3_config.secret_access_key, + config=config, + ) + + with open(src_file, "rb") as file_data: + try: + my_s3_client.put_object( + Bucket=s3_config.bucket, Body=file_data, Key=s3_config.key_prefix + dest_file_name + ) + except ClientError as e: + error_code = e.response["Error"]["Code"] + error_message = e.response["Error"]["Message"] + return Err(f"ClientError: {error_code} - {error_message}") + except Exception as e: + return Err(f"An unexpected error occurred: {e}") + + return Ok(True) diff --git a/components/clp-py-utils/pyproject.toml b/components/clp-py-utils/pyproject.toml index 4e827b926..6d68ceebe 100644 --- a/components/clp-py-utils/pyproject.toml +++ b/components/clp-py-utils/pyproject.toml @@ -10,6 +10,7 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.8 || ^3.10" +boto3 = "^1.35.81" # mariadb version must be compatible with libmariadev installed in runtime env. 
# See https://mariadb.com/docs/server/connect/programming-languages/python/install/#Dependencies mariadb = "~1.0.11" @@ -19,6 +20,7 @@ python-dotenv = "^1.0.1" python-Levenshtein = "~0.22" sqlalchemy = "~2.0" PyYAML = "^6.0.1" +result = "^0.17.0" StrEnum = "^0.4.15" [build-system] diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index ce88ad185..a5dbc0e35 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -4,6 +4,7 @@ import pathlib import subprocess from contextlib import closing +from typing import Any, Dict, Optional import yaml from celery.app.task import Task @@ -12,9 +13,14 @@ COMPRESSION_JOBS_TABLE_NAME, COMPRESSION_TASKS_TABLE_NAME, Database, + S3Config, StorageEngine, + StorageType, + WorkerConfig, ) from clp_py_utils.clp_logging import set_logging_level +from clp_py_utils.core import read_yaml_config_file +from clp_py_utils.s3_utils import s3_put from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.compress.celery import app from job_orchestration.scheduler.constants import CompressionTaskStatus @@ -108,6 +114,7 @@ def make_clp_s_command( archive_output_dir: pathlib.Path, clp_config: ClpIoConfig, db_config_file_path: pathlib.Path, + enable_s3_write: bool, ): # fmt: off compression_cmd = [ @@ -120,6 +127,9 @@ def make_clp_s_command( ] # fmt: on + if enable_s3_write: + compression_cmd.append("--single-file-archive") + if clp_config.input.timestamp_key is not None: compression_cmd.append("--timestamp-key") compression_cmd.append(clp_config.input.timestamp_key) @@ -128,10 +138,9 @@ def make_clp_s_command( def run_clp( + worker_config: WorkerConfig, clp_config: ClpIoConfig, clp_home: pathlib.Path, - data_dir: pathlib.Path, - archive_output_dir: 
pathlib.Path, logs_dir: pathlib.Path, job_id: int, task_id: int, @@ -143,10 +152,9 @@ def run_clp( """ Compresses files from an FS into archives on an FS + :param worker_config: WorkerConfig :param clp_config: ClpIoConfig :param clp_home: - :param data_dir: - :param archive_output_dir: :param logs_dir: :param job_id: :param task_id: @@ -156,16 +164,31 @@ def run_clp( :param clp_metadata_db_connection_config :return: tuple -- (whether compression was successful, output messages) """ - clp_storage_engine = str(os.getenv("CLP_STORAGE_ENGINE")) - instance_id_str = f"compression-job-{job_id}-task-{task_id}" + clp_storage_engine = worker_config.package.storage_engine + data_dir = worker_config.data_directory + archive_output_dir = worker_config.archive_output.get_directory() + # Generate database config file for clp db_config_file_path = data_dir / f"{instance_id_str}-db-config.yml" db_config_file = open(db_config_file_path, "w") yaml.safe_dump(clp_metadata_db_connection_config, db_config_file) db_config_file.close() + # Get s3 config + s3_config: S3Config + enable_s3_write = False + storage_type = worker_config.archive_output.storage.type + if StorageType.S3 == storage_type: + if StorageEngine.CLP_S != clp_storage_engine: + error_msg = f"S3 storage is not supported for storage engine: {clp_storage_engine}." 
+ logger.error(error_msg) + return False, {"error_message": error_msg} + + s3_config = worker_config.archive_output.storage.s3_config + enable_s3_write = True + if StorageEngine.CLP == clp_storage_engine: compression_cmd = make_clp_command( clp_home=clp_home, @@ -179,6 +202,7 @@ def run_clp( archive_output_dir=archive_output_dir, clp_config=clp_config, db_config_file_path=db_config_file_path, + enable_s3_write=enable_s3_write, ) else: logger.error(f"Unsupported storage engine {clp_storage_engine}") @@ -212,48 +236,65 @@ def run_clp( # Compute the total amount of data compressed last_archive_stats = None + last_line_decoded = False total_uncompressed_size = 0 total_compressed_size = 0 - while True: + + # Handle job metadata update and s3 write if enabled + s3_error = None + while not last_line_decoded: line = proc.stdout.readline() - if not line: - break - stats = json.loads(line.decode("ascii")) - if last_archive_stats is not None and stats["id"] != last_archive_stats["id"]: - # We've started a new archive so add the previous archive's last - # reported size to the total - total_uncompressed_size += last_archive_stats["uncompressed_size"] - total_compressed_size += last_archive_stats["size"] - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - update_job_metadata_and_tags( - db_cursor, - job_id, - clp_metadata_db_connection_config["table_prefix"], - tag_ids, - last_archive_stats, - ) - db_conn.commit() + stats: Optional[Dict[str, Any]] = None + if "" == line: + # Skip empty lines that could be caused by potential errors in printing archive stats + continue + + if line is not None: + stats = json.loads(line.decode("ascii")) + else: + last_line_decoded = True + + if last_archive_stats is not None and ( + None is stats or stats["id"] != last_archive_stats["id"] + ): + if enable_s3_write: + archive_id = last_archive_stats["id"] + archive_path = archive_output_dir / archive_id + + if s3_error is 
None: + logger.info(f"Uploading archive {archive_id} to S3...") + result = s3_put(s3_config, archive_path, archive_id) + + if result.is_err(): + logger.error(f"Failed to upload archive {archive_id}: {result.err_value}") + s3_error = result.err_value + # NOTE: It's possible `proc` finishes before we call `terminate` on it, in + # which case the process will still return success. + proc.terminate() + else: + logger.info(f"Finished uploading archive {archive_id} to S3.") + + archive_path.unlink() + + if s3_error is None: + # We've started a new archive so add the previous archive's last reported size to + # the total + total_uncompressed_size += last_archive_stats["uncompressed_size"] + total_compressed_size += last_archive_stats["size"] + with closing(sql_adapter.create_connection(True)) as db_conn, closing( + db_conn.cursor(dictionary=True) + ) as db_cursor: + update_job_metadata_and_tags( + db_cursor, + job_id, + clp_metadata_db_connection_config["table_prefix"], + tag_ids, + last_archive_stats, + ) + db_conn.commit() last_archive_stats = stats - if last_archive_stats is not None: - # Add the last archive's last reported size - total_uncompressed_size += last_archive_stats["uncompressed_size"] - total_compressed_size += last_archive_stats["size"] - with closing(sql_adapter.create_connection(True)) as db_conn, closing( - db_conn.cursor(dictionary=True) - ) as db_cursor: - update_job_metadata_and_tags( - db_cursor, - job_id, - clp_metadata_db_connection_config["table_prefix"], - tag_ids, - last_archive_stats, - ) - db_conn.commit() - # Wait for compression to finish return_code = proc.wait() if 0 != return_code: @@ -274,10 +315,16 @@ def run_clp( "total_uncompressed_size": total_uncompressed_size, "total_compressed_size": total_compressed_size, } - if compression_successful: + + if compression_successful and s3_error is None: return CompressionTaskStatus.SUCCEEDED, worker_output else: - worker_output["error_message"] = f"See logs {stderr_log_path}" + error_msgs = [] 
+ if compression_successful is False: + error_msgs.append(f"See logs {stderr_log_path}") + if s3_error is not None: + error_msgs.append(s3_error) + worker_output["error_message"] = "\n".join(error_msgs) return CompressionTaskStatus.FAILED, worker_output @@ -291,15 +338,28 @@ def compress( paths_to_compress_json: str, clp_metadata_db_connection_config, ): - clp_home_str = os.getenv("CLP_HOME") - data_dir_str = os.getenv("CLP_DATA_DIR") - archive_output_dir_str = os.getenv("CLP_ARCHIVE_OUTPUT_DIR") - logs_dir_str = os.getenv("CLP_LOGS_DIR") + clp_home = pathlib.Path(os.getenv("CLP_HOME")) # Set logging level + logs_dir = pathlib.Path(os.getenv("CLP_LOGS_DIR")) clp_logging_level = str(os.getenv("CLP_LOGGING_LEVEL")) set_logging_level(logger, clp_logging_level) + # Load configuration + try: + worker_config = WorkerConfig.parse_obj( + read_yaml_config_file(pathlib.Path(os.getenv("CLP_CONFIG_PATH"))) + ) + except Exception as ex: + error_msg = "Failed to load worker config" + logger.exception(error_msg) + return CompressionTaskResult( + task_id=task_id, + status=CompressionTaskStatus.FAILED, + duration=0, + error_message=error_msg, + ) + clp_io_config = ClpIoConfig.parse_raw(clp_io_config_json) paths_to_compress = PathsToCompress.parse_raw(paths_to_compress_json) @@ -308,11 +368,10 @@ def compress( start_time = datetime.datetime.now() logger.info(f"[job_id={job_id} task_id={task_id}] COMPRESSION STARTED.") compression_task_status, worker_output = run_clp( + worker_config, clp_io_config, - pathlib.Path(clp_home_str), - pathlib.Path(data_dir_str), - pathlib.Path(archive_output_dir_str), - pathlib.Path(logs_dir_str), + clp_home, + logs_dir, job_id, task_id, tag_ids, diff --git a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py index 423ebb757..58ae43450 100644 --- a/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py +++ 
b/components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py @@ -5,14 +5,15 @@ from celery.app.task import Task from celery.utils.log import get_task_logger -from clp_py_utils.clp_config import Database, StorageEngine +from clp_py_utils.clp_config import Database, StorageEngine, StorageType, WorkerConfig from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( - report_command_creation_failure, + report_task_failure, run_query_task, ) +from job_orchestration.executor.utils import load_worker_config from job_orchestration.scheduler.job_config import ExtractIrJobConfig, ExtractJsonJobConfig from job_orchestration.scheduler.scheduler_data import QueryTaskStatus @@ -21,15 +22,17 @@ def make_command( - storage_engine: str, clp_home: Path, - archives_dir: Path, + worker_config: WorkerConfig, archive_id: str, - stream_output_dir: Path, job_config: dict, results_cache_uri: str, - stream_collection_name: str, ) -> Optional[List[str]]: + storage_engine = worker_config.package.storage_engine + archives_dir = worker_config.archive_output.get_directory() + stream_output_dir = worker_config.stream_output_dir + stream_collection_name = worker_config.stream_collection_name + if StorageEngine.CLP == storage_engine: logger.info("Starting IR extraction") extract_ir_config = ExtractIrJobConfig.parse_obj(job_config) @@ -97,28 +100,38 @@ def extract_stream( task_status: QueryTaskStatus sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params)) + # Load configuration + clp_config_path = Path(os.getenv("CLP_CONFIG_PATH")) + worker_config = load_worker_config(clp_config_path, logger) + if worker_config is None: + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + + if worker_config.archive_output.storage.type == StorageType.S3: + 
logger.error(f"Stream extraction is not supported for the S3 storage type") + return report_task_failure( + sql_adapter=sql_adapter, + task_id=task_id, + start_time=start_time, + ) + # Make task_command clp_home = Path(os.getenv("CLP_HOME")) - archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR")) - clp_storage_engine = os.getenv("CLP_STORAGE_ENGINE") - stream_output_dir = Path(os.getenv("CLP_STREAM_OUTPUT_DIR")) - stream_collection_name = os.getenv("CLP_STREAM_COLLECTION_NAME") task_command = make_command( - storage_engine=clp_storage_engine, clp_home=clp_home, - archives_dir=archive_directory, + worker_config=worker_config, archive_id=archive_id, - stream_output_dir=stream_output_dir, job_config=job_config, results_cache_uri=results_cache_uri, - stream_collection_name=stream_collection_name, ) if not task_command: - return report_command_creation_failure( + logger.error(f"Error creating {task_name} command") + return report_task_failure( sql_adapter=sql_adapter, - logger=logger, - task_name=task_name, task_id=task_id, start_time=start_time, ) diff --git a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py index 598bfdcfc..7cf7b330f 100644 --- a/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py +++ b/components/job-orchestration/job_orchestration/executor/query/fs_search_task.py @@ -5,14 +5,15 @@ from celery.app.task import Task from celery.utils.log import get_task_logger -from clp_py_utils.clp_config import Database, StorageEngine +from clp_py_utils.clp_config import Database, StorageEngine, StorageType, WorkerConfig from clp_py_utils.clp_logging import set_logging_level from clp_py_utils.sql_adapter import SQL_Adapter from job_orchestration.executor.query.celery import app from job_orchestration.executor.query.utils import ( - report_command_creation_failure, + report_task_failure, run_query_task, ) +from 
job_orchestration.executor.utils import load_worker_config
 from job_orchestration.scheduler.job_config import SearchJobConfig
 from job_orchestration.scheduler.scheduler_data import QueryTaskStatus
@@ -21,14 +22,16 @@
 def make_command(
-    storage_engine: str,
     clp_home: Path,
-    archives_dir: Path,
+    worker_config: WorkerConfig,
     archive_id: str,
     search_config: SearchJobConfig,
     results_cache_uri: str,
     results_collection: str,
 ) -> Optional[List[str]]:
+    storage_engine = worker_config.package.storage_engine
+    archives_dir = worker_config.archive_output.get_directory()
+
     if StorageEngine.CLP == storage_engine:
         command = [str(clp_home / "bin" / "clo"), "s", str(archives_dir / archive_id)]
         if search_config.path_filter is not None:
@@ -116,26 +119,40 @@ def search(
     task_status: QueryTaskStatus
     sql_adapter = SQL_Adapter(Database.parse_obj(clp_metadata_db_conn_params))
+    # Load configuration
+    clp_config_path = Path(os.getenv("CLP_CONFIG_PATH"))
+    worker_config = load_worker_config(clp_config_path, logger)
+    if worker_config is None:
+        return report_task_failure(
+            sql_adapter=sql_adapter,
+            task_id=task_id,
+            start_time=start_time,
+        )
+
+    if worker_config.archive_output.storage.type == StorageType.S3:
+        logger.error("Search is not supported for the S3 storage type")
+        return report_task_failure(
+            sql_adapter=sql_adapter,
+            task_id=task_id,
+            start_time=start_time,
+        )
+
     # Make task_command
     clp_home = Path(os.getenv("CLP_HOME"))
-    archive_directory = Path(os.getenv("CLP_ARCHIVE_OUTPUT_DIR"))
-    clp_storage_engine = os.getenv("CLP_STORAGE_ENGINE")
     search_config = SearchJobConfig.parse_obj(job_config)
     task_command = make_command(
-        storage_engine=clp_storage_engine,
         clp_home=clp_home,
-        archives_dir=archive_directory,
+        worker_config=worker_config,
         archive_id=archive_id,
         search_config=search_config,
         results_cache_uri=results_cache_uri,
         results_collection=job_id,
     )
     if not task_command:
-        return report_command_creation_failure(
+        logger.error(f"Error creating {task_name} 
command") + return report_task_failure( sql_adapter=sql_adapter, - logger=logger, - task_name=task_name, task_id=task_id, start_time=start_time, ) diff --git a/components/job-orchestration/job_orchestration/executor/query/utils.py b/components/job-orchestration/job_orchestration/executor/query/utils.py index 69d22398e..523abbe00 100644 --- a/components/job-orchestration/job_orchestration/executor/query/utils.py +++ b/components/job-orchestration/job_orchestration/executor/query/utils.py @@ -19,14 +19,11 @@ def get_task_log_file_path(clp_logs_dir: Path, job_id: str, task_id: int) -> Pat return worker_logs_dir / f"{task_id}-clo.log" -def report_command_creation_failure( +def report_task_failure( sql_adapter: SQL_Adapter, - logger: Logger, - task_name: str, task_id: int, start_time: datetime.datetime, ): - logger.error(f"Error creating {task_name} command") task_status = QueryTaskStatus.FAILED update_query_task_metadata( sql_adapter, diff --git a/components/job-orchestration/job_orchestration/executor/utils.py b/components/job-orchestration/job_orchestration/executor/utils.py new file mode 100644 index 000000000..47ea702ae --- /dev/null +++ b/components/job-orchestration/job_orchestration/executor/utils.py @@ -0,0 +1,23 @@ +from logging import Logger +from pathlib import Path +from typing import Optional + +from clp_py_utils.clp_config import WorkerConfig +from clp_py_utils.core import read_yaml_config_file + + +def load_worker_config( + config_path: Path, + logger: Logger, +) -> Optional[WorkerConfig]: + """ + Loads a WorkerConfig object from the specified configuration file. + :param config_path: Path to the configuration file. + :param logger: Logger instance for reporting errors if loading fails. + :return: The loaded WorkerConfig object on success, None otherwise. 
+ """ + try: + return WorkerConfig.parse_obj(read_yaml_config_file(config_path)) + except Exception: + logger.exception("Failed to load worker config") + return None diff --git a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py index 62b7a27fc..bd793686b 100644 --- a/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py +++ b/components/job-orchestration/job_orchestration/scheduler/compress/compression_scheduler.py @@ -53,13 +53,14 @@ def update_compression_task_metadata(db_cursor, task_id, kv): logger.error("Must specify at least one field to update") raise ValueError - field_set_expressions = [f'{k}="{v}"' for k, v in kv.items()] + field_set_expressions = [f"{k} = %s" for k in kv.keys()] query = f""" - UPDATE {COMPRESSION_TASKS_TABLE_NAME} - SET {", ".join(field_set_expressions)} - WHERE id={task_id} + UPDATE {COMPRESSION_TASKS_TABLE_NAME} + SET {", ".join(field_set_expressions)} + WHERE id = %s """ - db_cursor.execute(query) + values = list(kv.values()) + [task_id] + db_cursor.execute(query, values) def update_compression_job_metadata(db_cursor, job_id, kv): @@ -67,13 +68,14 @@ def update_compression_job_metadata(db_cursor, job_id, kv): logger.error("Must specify at least one field to update") raise ValueError - field_set_expressions = [f'{k}="{v}"' for k, v in kv.items()] + field_set_expressions = [f"{k} = %s" for k in kv.keys()] query = f""" - UPDATE {COMPRESSION_JOBS_TABLE_NAME} - SET {", ".join(field_set_expressions)} - WHERE id={job_id} + UPDATE {COMPRESSION_JOBS_TABLE_NAME} + SET {", ".join(field_set_expressions)} + WHERE id = %s """ - db_cursor.execute(query) + values = list(kv.values()) + [job_id] + db_cursor.execute(query, values) def search_and_schedule_new_tasks(db_conn, db_cursor, clp_metadata_db_connection_config): diff --git a/components/package-template/src/etc/clp-config.yml 
b/components/package-template/src/etc/clp-config.yml index f19b93463..22b03b889 100644 --- a/components/package-template/src/etc/clp-config.yml +++ b/components/package-template/src/etc/clp-config.yml @@ -66,7 +66,9 @@ # ## Where archives should be output to #archive_output: -# directory: "var/data/archives" +# storage: +# type: "fs" +# directory: "var/data/archives" # # # How much data CLP should try to compress into each archive # target_archive_size: 268435456 # 256 MB From e4c9dd3c08dac002a50fa670e911a6379c8c9976 Mon Sep 17 00:00:00 2001 From: haiqi96 <14502009+haiqi96@users.noreply.github.com> Date: Wed, 18 Dec 2024 22:54:19 -0500 Subject: [PATCH 56/65] fix(clp-package): Remove faulty error handling for parsing archive compression stats. (#640) --- .../executor/compress/fs_compression_task.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py index a5dbc0e35..593c07bd7 100644 --- a/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py +++ b/components/job-orchestration/job_orchestration/executor/compress/fs_compression_task.py @@ -243,16 +243,13 @@ def run_clp( # Handle job metadata update and s3 write if enabled s3_error = None while not last_line_decoded: - line = proc.stdout.readline() stats: Optional[Dict[str, Any]] = None - if "" == line: - # Skip empty lines that could be caused by potential errors in printing archive stats - continue - if line is not None: - stats = json.loads(line.decode("ascii")) - else: + line = proc.stdout.readline() + if not line: last_line_decoded = True + else: + stats = json.loads(line.decode("ascii")) if last_archive_stats is not None and ( None is stats or stats["id"] != last_archive_stats["id"] From 32dc98901129eb0cc514265d2d82acc76a260065 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Thu, 
19 Dec 2024 00:11:55 -0500 Subject: [PATCH 57/65] fix(core): Add missing `../` to fix relative header file includes. (#627) Co-authored-by: Bingran Hu --- components/core/src/clp/clo/CommandLineArguments.cpp | 2 +- components/core/src/clp/clp/FileDecompressor.hpp | 6 +++--- components/core/src/clp/clp/decompression.cpp | 2 +- components/core/src/clp/clp/utils.cpp | 4 ++-- components/core/src/clp/clp/utils.hpp | 4 ++-- components/core/src/clp/ir/EncodedTextAst.cpp | 2 +- components/core/src/clp/ir/LogEvent.hpp | 2 +- components/core/src/clp_s/CommandLineArguments.cpp | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/components/core/src/clp/clo/CommandLineArguments.cpp b/components/core/src/clp/clo/CommandLineArguments.cpp index 4e187f985..f0a7f7ecc 100644 --- a/components/core/src/clp/clo/CommandLineArguments.cpp +++ b/components/core/src/clp/clo/CommandLineArguments.cpp @@ -8,8 +8,8 @@ #include +#include "../../reducer/types.hpp" #include "../cli_utils.hpp" -#include "../reducer/types.hpp" #include "../spdlog_with_specializations.hpp" #include "../version.hpp" diff --git a/components/core/src/clp/clp/FileDecompressor.hpp b/components/core/src/clp/clp/FileDecompressor.hpp index b08a21eb4..17a8b8e43 100644 --- a/components/core/src/clp/clp/FileDecompressor.hpp +++ b/components/core/src/clp/clp/FileDecompressor.hpp @@ -6,17 +6,17 @@ #include #include +#include "../ErrorCode.hpp" #include "../FileWriter.hpp" #include "../ir/constants.hpp" #include "../ir/LogEventSerializer.hpp" +#include "../ir/types.hpp" #include "../spdlog_with_specializations.hpp" #include "../streaming_archive/MetadataDB.hpp" #include "../streaming_archive/reader/Archive.hpp" #include "../streaming_archive/reader/File.hpp" #include "../streaming_archive/reader/Message.hpp" -#include "ErrorCode.hpp" -#include "ir/types.hpp" -#include "Utils.hpp" +#include "../Utils.hpp" namespace clp::clp { /** diff --git a/components/core/src/clp/clp/decompression.cpp 
b/components/core/src/clp/clp/decompression.cpp index c42357334..ce7cbd5c7 100644 --- a/components/core/src/clp/clp/decompression.cpp +++ b/components/core/src/clp/clp/decompression.cpp @@ -7,12 +7,12 @@ #include "../FileWriter.hpp" #include "../GlobalMySQLMetadataDB.hpp" #include "../GlobalSQLiteMetadataDB.hpp" +#include "../ir/constants.hpp" #include "../spdlog_with_specializations.hpp" #include "../streaming_archive/reader/Archive.hpp" #include "../TraceableException.hpp" #include "../Utils.hpp" #include "FileDecompressor.hpp" -#include "ir/constants.hpp" #include "utils.hpp" using std::cerr; diff --git a/components/core/src/clp/clp/utils.cpp b/components/core/src/clp/clp/utils.cpp index 0f05d75ac..123f9a836 100644 --- a/components/core/src/clp/clp/utils.cpp +++ b/components/core/src/clp/clp/utils.cpp @@ -9,9 +9,9 @@ #include "../GlobalMySQLMetadataDB.hpp" #include "../GlobalSQLiteMetadataDB.hpp" #include "../spdlog_with_specializations.hpp" +#include "../streaming_archive/Constants.hpp" +#include "../TraceableException.hpp" #include "../Utils.hpp" -#include "streaming_archive/Constants.hpp" -#include "TraceableException.hpp" using std::string; using std::vector; diff --git a/components/core/src/clp/clp/utils.hpp b/components/core/src/clp/clp/utils.hpp index 0a6918445..47adc50f2 100644 --- a/components/core/src/clp/clp/utils.hpp +++ b/components/core/src/clp/clp/utils.hpp @@ -7,11 +7,11 @@ #include +#include "../ErrorCode.hpp" #include "../GlobalMetadataDB.hpp" #include "../GlobalMetadataDBConfig.hpp" -#include "ErrorCode.hpp" +#include "../TraceableException.hpp" #include "FileToCompress.hpp" -#include "TraceableException.hpp" namespace clp::clp { // Types diff --git a/components/core/src/clp/ir/EncodedTextAst.cpp b/components/core/src/clp/ir/EncodedTextAst.cpp index f0ee4d493..72a8f2729 100644 --- a/components/core/src/clp/ir/EncodedTextAst.cpp +++ b/components/core/src/clp/ir/EncodedTextAst.cpp @@ -5,7 +5,7 @@ #include #include "../ffi/encoding_methods.hpp" 
-#include "ffi/ir_stream/decoding_methods.hpp" +#include "../ffi/ir_stream/decoding_methods.hpp" using clp::ffi::decode_float_var; using clp::ffi::decode_integer_var; diff --git a/components/core/src/clp/ir/LogEvent.hpp b/components/core/src/clp/ir/LogEvent.hpp index 4a3ef7567..e2d4b310e 100644 --- a/components/core/src/clp/ir/LogEvent.hpp +++ b/components/core/src/clp/ir/LogEvent.hpp @@ -5,8 +5,8 @@ #include #include +#include "../time_types.hpp" #include "EncodedTextAst.hpp" -#include "time_types.hpp" #include "types.hpp" namespace clp::ir { diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index c7fb9487e..fc7427f11 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -6,9 +6,9 @@ #include #include "../clp/cli_utils.hpp" +#include "../clp/type_utils.hpp" #include "../reducer/types.hpp" #include "FileReader.hpp" -#include "type_utils.hpp" namespace po = boost::program_options; From 02f0b8f1597c810072bbcafb202b5b29bf48b68c Mon Sep 17 00:00:00 2001 From: davidlion Date: Fri, 20 Dec 2024 00:20:21 -0500 Subject: [PATCH 58/65] Refactor lzma stream and add some doc strings. 
--- .../streaming_compression/lzma/Compressor.cpp | 108 +++++++-------- .../streaming_compression/lzma/Compressor.hpp | 126 ++++++++++++------ .../core/tests/test-StreamingCompression.cpp | 8 +- 3 files changed, 140 insertions(+), 102 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 3e6bb0254..52febe232 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -12,21 +12,22 @@ #include "../../FileWriter.hpp" #include "../../TraceableException.hpp" #include "../../type_utils.hpp" -#include "Constants.hpp" namespace clp::streaming_compression::lzma { -auto Compressor::open(FileWriter& file_writer, int compression_level) -> void { +auto Compressor::open(FileWriter& file_writer) -> void { if (nullptr != m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); } - if (compression_level < cMinCompressionLevel || compression_level > cMaxCompressionLevel) { - throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); - } - m_compression_level = compression_level; - m_lzma_ops.init_lzma_encoder(); - m_lzma_ops.detach_input_src(); - m_lzma_ops.attach_output_buffer(); + m_lzma_stream.detach_input(); + if (false + == m_lzma_stream.attach_output( + m_compressed_stream_block_buffer.data(), + m_compressed_stream_block_buffer.size() + )) + { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } m_compressed_stream_file_writer = &file_writer; m_uncompressed_stream_pos = 0; } @@ -36,16 +37,14 @@ auto Compressor::close() -> void { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - if (m_compression_stream.avail_in > 0) { + if (m_lzma_stream.avail_in() > 0) { SPDLOG_WARN("Trying to close LZMA compressor with unprocessed input data. 
Processing and " "flushing remaining data."); flush_lzma(LZMA_FULL_FLUSH); } flush_lzma(LZMA_FINISH); - // Deallocates LZMA stream's internal data structures - lzma_end(&m_compression_stream); - m_lzma_ops.detach_output_buffer(); + m_lzma_stream.end_and_detach_output(); m_compressed_stream_file_writer = nullptr; } @@ -62,10 +61,14 @@ auto Compressor::write(char const* data, size_t data_length) -> void { throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - m_compression_stream.next_in = clp::size_checked_pointer_cast(data); - m_compression_stream.avail_in = data_length; + if (false + == m_lzma_stream + .attach_input(clp::size_checked_pointer_cast(data), data_length)) + { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } encode_lzma(); - m_lzma_ops.detach_input_src(); + m_lzma_stream.detach_input(); m_uncompressed_stream_pos += data_length; } @@ -85,18 +88,17 @@ auto Compressor::try_get_pos(size_t& pos) const -> ErrorCode { } auto Compressor::encode_lzma() -> void { - while (m_compression_stream.avail_in > 0) { - if (0 == m_compression_stream.avail_out) { + while (m_lzma_stream.avail_in() > 0) { + if (0 == m_lzma_stream.avail_out()) { flush_stream_output_block_buffer(); } - auto const rc = lzma_code(&m_compression_stream, LZMA_RUN); + auto const rc = m_lzma_stream.lzma_code(LZMA_RUN); switch (rc) { case LZMA_OK: break; case LZMA_BUF_ERROR: - SPDLOG_ERROR( - "LZMA compressor input stream is corrupt. No encoding progress can be made." - ); + SPDLOG_ERROR("LZMA compressor input stream is corrupt. 
No encoding " + "progress can be made."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: SPDLOG_ERROR( @@ -109,7 +111,7 @@ auto Compressor::encode_lzma() -> void { } auto Compressor::flush_lzma(lzma_action flush_action) -> void { - if (false == LzmaStreamOperations::is_flush_action(flush_action)) { + if (false == LzmaStream::is_flush_action(flush_action)) { SPDLOG_ERROR( "lzma_code() supplied with invalid flush action - {}.", static_cast(flush_action) @@ -119,24 +121,24 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { bool flushed{false}; while (false == flushed) { - if (0 == m_compression_stream.avail_out) { + if (0 == m_lzma_stream.avail_out()) { flush_stream_output_block_buffer(); } - auto const rc = lzma_code(&m_compression_stream, flush_action); + auto const rc = m_lzma_stream.lzma_code(flush_action); switch (rc) { case LZMA_OK: break; case LZMA_STREAM_END: - // NOTE: flush may not have completed if a multithreaded encoder is using action - // LZMA_FULL_BARRIER. For now, we skip this check. + // NOTE: flush may not have completed if a multithreaded encoder is using + // action LZMA_FULL_BARRIER. For now, we skip this check. flushed = true; break; case LZMA_BUF_ERROR: - // NOTE: this can happen if we are using LZMA_FULL_FLUSH or LZMA_FULL_BARRIER. These - // two actions keeps encoding input data alongside flushing buffered encoded data. - SPDLOG_ERROR( - "LZMA compressor input stream is corrupt. No encoding progress can be made." - ); + // NOTE: this can happen if we are using LZMA_FULL_FLUSH or + // LZMA_FULL_BARRIER. These two actions keeps encoding input data + // alongside flushing buffered encoded data. + SPDLOG_ERROR("LZMA compressor input stream is corrupt. 
No encoding " + "progress can be made."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); default: SPDLOG_ERROR( @@ -150,50 +152,36 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { } auto Compressor::flush_stream_output_block_buffer() -> void { - if (cCompressedStreamBlockBufferSize == m_compression_stream.avail_out) { + if (cCompressedStreamBlockBufferSize == m_lzma_stream.avail_out()) { return; } m_compressed_stream_file_writer->write( clp::size_checked_pointer_cast(m_compressed_stream_block_buffer.data()), - cCompressedStreamBlockBufferSize - m_compression_stream.avail_out + cCompressedStreamBlockBufferSize - m_lzma_stream.avail_out() ); - m_lzma_ops.attach_output_buffer(); -} - -auto Compressor::LzmaStreamOperations::is_flush_action(lzma_action action) -> bool { - return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action || LZMA_FULL_BARRIER == action - || LZMA_FINISH == action; -} - -auto Compressor::LzmaStreamOperations::attach_output_buffer() -> void { - m_p->m_compression_stream.next_out = m_p->m_compressed_stream_block_buffer.data(); - m_p->m_compression_stream.avail_out = m_p->m_compressed_stream_block_buffer.size(); -} - -auto Compressor::LzmaStreamOperations::detach_input_src() -> void { - m_p->m_compression_stream.next_in = nullptr; - m_p->m_compression_stream.avail_in = 0; -} - -auto Compressor::LzmaStreamOperations::detach_output_buffer() -> void { - m_p->m_compression_stream.next_out = nullptr; - m_p->m_compression_stream.avail_out = 0; + if (false + == m_lzma_stream.attach_output( + m_compressed_stream_block_buffer.data(), + m_compressed_stream_block_buffer.size() + )) + { + throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + } } -auto Compressor::LzmaStreamOperations::init_lzma_encoder(lzma_check check) -> void { +Compressor::LzmaStream::LzmaStream(int compression_level, size_t dict_size, lzma_check check) { lzma_options_lzma options; - if (0 != lzma_lzma_preset(&options, 
m_p->m_compression_level)) { + if (0 != lzma_lzma_preset(&options, compression_level)) { SPDLOG_ERROR("Failed to initialize LZMA options' compression level."); throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } - options.dict_size = m_p->m_dict_size; + options.dict_size = dict_size; std::array filters{{ {.id = LZMA_FILTER_LZMA2, .options = &options}, {.id = LZMA_VLI_UNKNOWN, .options = nullptr}, }}; - m_p->m_compression_stream = LZMA_STREAM_INIT; - auto const rc = lzma_stream_encoder(&m_p->m_compression_stream, filters.data(), check); + auto const rc = lzma_stream_encoder(&m_stream, filters.data(), check); if (LZMA_OK == rc) { return; } diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 3e7af18ff..49a3e079a 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -11,9 +11,11 @@ #include "../../FileWriter.hpp" #include "../../TraceableException.hpp" #include "../Compressor.hpp" -#include "Constants.hpp" namespace clp::streaming_compression::lzma { +/** + * Implements a LZMA compressor that compresses byte input data to a file. 
+ */ class Compressor : public ::clp::streaming_compression::Compressor { public: // Types @@ -30,7 +32,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { }; // Constructor - Compressor() = default; + Compressor(int compression_level, size_t dict_size, lzma_check check) + : m_lzma_stream{compression_level, dict_size, check} {} // Destructor ~Compressor() override = default; @@ -43,14 +46,6 @@ class Compressor : public ::clp::streaming_compression::Compressor { Compressor(Compressor&&) noexcept = default; auto operator=(Compressor&&) noexcept -> Compressor& = default; - /** - * Initializes the compression stream with the given compression level - * - * @param file_writer - * @param compression_level - */ - auto open(FileWriter& file_writer, int compression_level) -> void; - // Methods implementing the WriterInterface /** * Writes the given data to the compressor @@ -74,61 +69,114 @@ class Compressor : public ::clp::streaming_compression::Compressor { */ auto try_get_pos(size_t& pos) const -> ErrorCode override; + // Methods implementing the Compressor interface /** * Closes the compressor */ auto close() -> void override; - // Methods implementing the Compressor interface /** - * Initializes the compression stream with the default compression level + * Open the compression stream for encoding to the file_writer. + * * @param file_writer */ - auto open(FileWriter& file_writer) -> void override { - this->open(file_writer, cDefaultCompressionLevel); - } + auto open(FileWriter& file_writer) -> void override; private: - class LzmaStreamOperations { + /** + * Wrapper class around lzma_stream providing easier usage. + */ + class LzmaStream { public: - // Constructor - LzmaStreamOperations(Compressor* parent) : m_p(parent) {} + /** + * Initializes an LZMA compression encoder and its streams. 
+ * + * @param compression_level Compression preset level in the range [0-9] where the higher + * numbers use increasingly more memory for greater compression ratios. + * @param dict_size Max amount of recently processed uncompressed bytes to keep in the + * memory. + * @param check Type of check to verify the integrity of the uncompressed data. + * LZMA_CHECK_CRC64 is the default in the xz command line tool. If the .xz file needs to be + * decompressed with XZ-Embedded, use LZMA_CHECK_CRC32 instead. + * + * @throw `OperationFailed` `ErrorCode_BadParam` if the LZMA options are invalid or the + * encoder fails to initialize. + */ + LzmaStream(int compression_level, size_t dict_size, lzma_check check); // Destructor - ~LzmaStreamOperations() = default; + ~LzmaStream() = default; // Delete copy constructor and assignment operator - LzmaStreamOperations(LzmaStreamOperations const&) = delete; - auto operator=(LzmaStreamOperations const&) -> LzmaStreamOperations& = delete; + LzmaStream(LzmaStream const&) = delete; + auto operator=(LzmaStream const&) -> LzmaStream& = delete; // Default move constructor and assignment operator - LzmaStreamOperations(LzmaStreamOperations&&) noexcept = default; - auto operator=(LzmaStreamOperations&&) noexcept -> LzmaStreamOperations& = default; + LzmaStream(LzmaStream&&) noexcept = default; + auto operator=(LzmaStream&&) noexcept -> LzmaStream& = default; - [[nodiscard]] static auto is_flush_action(lzma_action action) -> bool; + /** + * Attaches a pre-allocated block buffer to the encoder's input stream. + * + * @return false if the data buffer is null or empty. + * @return true on success. 
+ */ + [[nodiscard]] auto attach_input(uint8_t const* data_ptr, size_t data_length) -> bool { + if (nullptr == data_ptr || 0 == data_length) { + return false; + } + m_stream.next_in = data_ptr; + m_stream.avail_in = data_length; + return true; + } /** - * Attaches a pre-allocated block buffer to the encoder's output stream + * Attaches a pre-allocated block buffer to the encoder's output stream. * - * Subsequent calls to this function resets the output buffer to its initial state. + * @return false if the data buffer is null or empty. + * @return true on success. */ - auto attach_output_buffer() -> void; + [[nodiscard]] auto attach_output(uint8_t* data_ptr, size_t data_length) -> bool { + if (nullptr == data_ptr || 0 == data_length) { + return false; + } + m_stream.next_out = data_ptr; + m_stream.avail_out = data_length; + return true; + } - auto detach_input_src() -> void; + [[nodiscard]] auto avail_in() const -> size_t { return m_stream.avail_in; } - auto detach_output_buffer() -> void; + [[nodiscard]] auto avail_out() const -> size_t { return m_stream.avail_out; } /** - * Initializes an LZMA compression encoder and its streams - * - * @param check Type of integrity check calculated from the uncompressed data. - * LZMA_CHECK_CRC64 is the default in the xz command line tool. If the .xz file needs to be - * decompressed with XZ-Embedded, use LZMA_CHECK_CRC32 instead. + * Unset the internal fields of the encoder's input stream. + */ + auto detach_input() -> void { + m_stream.next_in = nullptr; + m_stream.avail_in = 0; + } + + /** + * End the LZMA stream and unset the internal fields of the encoder's output stream. 
*/ - auto init_lzma_encoder(lzma_check check = LZMA_CHECK_CRC64) -> void; + auto end_and_detach_output() -> void { + lzma_end(&m_stream); + m_stream.next_out = nullptr; + m_stream.avail_out = 0; + } + + [[nodiscard]] static auto is_flush_action(lzma_action action) -> bool { + return LZMA_SYNC_FLUSH == action || LZMA_FULL_FLUSH == action + || LZMA_FULL_BARRIER == action || LZMA_FINISH == action; + } + + [[nodiscard]] auto lzma_code(lzma_action action) -> lzma_ret { + return ::lzma_code(&m_stream, action); + } private: - Compressor* m_p; + lzma_stream m_stream = LZMA_STREAM_INIT; }; static constexpr size_t cCompressedStreamBlockBufferSize{4096}; // 4KiB @@ -170,12 +218,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { FileWriter* m_compressed_stream_file_writer{nullptr}; // Compressed stream variables - LzmaStreamOperations m_lzma_ops{this}; Array m_compressed_stream_block_buffer{cCompressedStreamBlockBufferSize}; - int m_compression_level{cDefaultCompressionLevel}; - lzma_stream m_compression_stream = LZMA_STREAM_INIT; - // Specifies how many bytes of the recently processed uncompressed data to keep in the memory - size_t m_dict_size{cDefaultDictionarySize}; + LzmaStream m_lzma_stream; size_t m_uncompressed_stream_pos{0}; }; } // namespace clp::streaming_compression::lzma diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 4076eb88f..8fc7f4286 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include "../src/clp/Array.hpp" @@ -18,6 +19,7 @@ #include "../src/clp/streaming_compression/Compressor.hpp" #include "../src/clp/streaming_compression/Decompressor.hpp" #include "../src/clp/streaming_compression/lzma/Compressor.hpp" +#include "../src/clp/streaming_compression/lzma/Constants.hpp" #include 
"../src/clp/streaming_compression/passthrough/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Decompressor.hpp" #include "../src/clp/streaming_compression/zstd/Compressor.hpp" @@ -130,7 +132,11 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { } SECTION("LZMA compression") { - compressor = std::make_unique(); + compressor = std::make_unique( + clp::streaming_compression::lzma::cDefaultCompressionLevel, + clp::streaming_compression::lzma::cDefaultDictionarySize, + LZMA_CHECK_CRC64 + ); compress(std::move(compressor), uncompressed_buffer.data()); } From fcfc73ae3337f412afe8605a7b56c432f074f9b0 Mon Sep 17 00:00:00 2001 From: davidlion Date: Fri, 20 Dec 2024 00:35:00 -0500 Subject: [PATCH 59/65] Fix accidental comment reflow. --- .../src/clp/streaming_compression/lzma/Compressor.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 52febe232..67e03f871 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -129,14 +129,13 @@ auto Compressor::flush_lzma(lzma_action flush_action) -> void { case LZMA_OK: break; case LZMA_STREAM_END: - // NOTE: flush may not have completed if a multithreaded encoder is using - // action LZMA_FULL_BARRIER. For now, we skip this check. + // NOTE: flush may not have completed if a multithreaded encoder is using action + // LZMA_FULL_BARRIER. For now, we skip this check. flushed = true; break; case LZMA_BUF_ERROR: - // NOTE: this can happen if we are using LZMA_FULL_FLUSH or - // LZMA_FULL_BARRIER. These two actions keeps encoding input data - // alongside flushing buffered encoded data. + // NOTE: this can happen if we are using LZMA_FULL_FLUSH or LZMA_FULL_BARRIER. 
These + // two actions keeps encoding input data alongside flushing buffered encoded data. SPDLOG_ERROR("LZMA compressor input stream is corrupt. No encoding " "progress can be made."); throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); From d139bd593e6585b66a3350645a8ec1c85b079f0a Mon Sep 17 00:00:00 2001 From: davidlion Date: Fri, 20 Dec 2024 13:01:27 -0500 Subject: [PATCH 60/65] Apply suggestions from code review Co-authored-by: Bingran Hu --- .../clp/streaming_compression/lzma/Compressor.cpp | 15 +++------------ .../clp/streaming_compression/lzma/Compressor.hpp | 3 ++- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 67e03f871..a6c5e197a 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -26,7 +26,7 @@ auto Compressor::open(FileWriter& file_writer) -> void { m_compressed_stream_block_buffer.size() )) { - throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } m_compressed_stream_file_writer = &file_writer; m_uncompressed_stream_pos = 0; @@ -52,20 +52,11 @@ auto Compressor::write(char const* data, size_t data_length) -> void { if (nullptr == m_compressed_stream_file_writer) { throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } - - if (0 == data_length) { - return; - } - - if (nullptr == data) { - throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); - } - if (false == m_lzma_stream .attach_input(clp::size_checked_pointer_cast(data), data_length)) { - throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_BadParam, __FILENAME__, __LINE__); } encode_lzma(); m_lzma_stream.detach_input(); @@ -164,7 +155,7 @@ auto 
Compressor::flush_stream_output_block_buffer() -> void { m_compressed_stream_block_buffer.size() )) { - throw OperationFailed(ErrorCode_NotReady, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); } } diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 49a3e079a..5d35eb28e 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -31,7 +31,8 @@ class Compressor : public ::clp::streaming_compression::Compressor { } }; - // Constructor + // Constructors + Compressor() : Compressor{cDefaultCompressionLevel, cDefaultDictionarySize, LZMA_CHECK_CRC64} {} Compressor(int compression_level, size_t dict_size, lzma_check check) : m_lzma_stream{compression_level, dict_size, check} {} From 68c4c3677446ab9bdf3c4c7c10be8f6834839441 Mon Sep 17 00:00:00 2001 From: davidlion Date: Fri, 20 Dec 2024 13:08:48 -0500 Subject: [PATCH 61/65] Add missing fixes for PR suggestion. 
--- .../core/src/clp/streaming_compression/lzma/Compressor.hpp | 2 ++ components/core/tests/test-StreamingCompression.cpp | 7 +------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index 5d35eb28e..f46a7a58d 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -11,6 +11,7 @@ #include "../../FileWriter.hpp" #include "../../TraceableException.hpp" #include "../Compressor.hpp" +#include "Constants.hpp" namespace clp::streaming_compression::lzma { /** @@ -33,6 +34,7 @@ class Compressor : public ::clp::streaming_compression::Compressor { // Constructors Compressor() : Compressor{cDefaultCompressionLevel, cDefaultDictionarySize, LZMA_CHECK_CRC64} {} + Compressor(int compression_level, size_t dict_size, lzma_check check) : m_lzma_stream{compression_level, dict_size, check} {} diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 8fc7f4286..5ae5532a0 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -19,7 +19,6 @@ #include "../src/clp/streaming_compression/Compressor.hpp" #include "../src/clp/streaming_compression/Decompressor.hpp" #include "../src/clp/streaming_compression/lzma/Compressor.hpp" -#include "../src/clp/streaming_compression/lzma/Constants.hpp" #include "../src/clp/streaming_compression/passthrough/Compressor.hpp" #include "../src/clp/streaming_compression/passthrough/Decompressor.hpp" #include "../src/clp/streaming_compression/zstd/Compressor.hpp" @@ -132,11 +131,7 @@ TEST_CASE("StreamingCompression", "[StreamingCompression]") { } SECTION("LZMA compression") { - compressor = std::make_unique( - clp::streaming_compression::lzma::cDefaultCompressionLevel, - 
clp::streaming_compression::lzma::cDefaultDictionarySize, - LZMA_CHECK_CRC64 - ); + compressor = std::make_unique(); compress(std::move(compressor), uncompressed_buffer.data()); } From dcb843e60f1f20a2d80c3964f98e50876de8cbd8 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 20 Dec 2024 17:35:01 -0500 Subject: [PATCH 62/65] Address review concern --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 4 +--- .../core/src/clp/streaming_compression/lzma/Compressor.hpp | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index a6c5e197a..8061807da 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -38,9 +38,7 @@ auto Compressor::close() -> void { } if (m_lzma_stream.avail_in() > 0) { - SPDLOG_WARN("Trying to close LZMA compressor with unprocessed input data. Processing and " - "flushing remaining data."); - flush_lzma(LZMA_FULL_FLUSH); + throw OperationFailed(ErrorCode_Corrupt, __FILENAME__, __LINE__); } flush_lzma(LZMA_FINISH); diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp index f46a7a58d..de665eaf6 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.hpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.hpp @@ -121,11 +121,11 @@ class Compressor : public ::clp::streaming_compression::Compressor { /** * Attaches a pre-allocated block buffer to the encoder's input stream. * - * @return false if the data buffer is null or empty. + * @return false if the data buffer is null. * @return true on success. 
*/ [[nodiscard]] auto attach_input(uint8_t const* data_ptr, size_t data_length) -> bool { - if (nullptr == data_ptr || 0 == data_length) { + if (nullptr == data_ptr) { return false; } m_stream.next_in = data_ptr; From b20162f4d721ac43b1c5df3b590078e49816f2ea Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 20 Dec 2024 18:19:41 -0500 Subject: [PATCH 63/65] Address review concern --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 4 ++-- components/core/tests/test-StreamingCompression.cpp | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index 8061807da..a9fa0d5a0 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -26,7 +26,7 @@ auto Compressor::open(FileWriter& file_writer) -> void { m_compressed_stream_block_buffer.size() )) { - throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_NoMem, __FILENAME__, __LINE__); } m_compressed_stream_file_writer = &file_writer; m_uncompressed_stream_pos = 0; @@ -153,7 +153,7 @@ auto Compressor::flush_stream_output_block_buffer() -> void { m_compressed_stream_block_buffer.size() )) { - throw OperationFailed(ErrorCode_NotInit, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_NoMem, __FILENAME__, __LINE__); } } diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 5ae5532a0..c3e981562 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -35,13 +35,20 @@ using std::string_view; namespace { constexpr string_view cCompressedFilePath{"test_streaming_compressed_file.bin"}; constexpr size_t cBufferSize{128L * 1024 * 1024}; // 128MB +// Interleave no-ops to ensure the 
integrity of the compressor states at all times. constexpr auto cCompressionChunkSizes = std::to_array( {cBufferSize / 100, + 0, cBufferSize / 50, + 0, cBufferSize / 25, + 0, cBufferSize / 10, + 0, cBufferSize / 5, + 0, cBufferSize / 2, + 0, cBufferSize} ); From df41b227e94fe7de2f9978ff3772ae97a8681443 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Fri, 20 Dec 2024 18:37:52 -0500 Subject: [PATCH 64/65] nit fix --- components/core/tests/test-StreamingCompression.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index c3e981562..22582cff2 100644 --- a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -9,7 +9,6 @@ #include #include -#include #include #include "../src/clp/Array.hpp" @@ -35,20 +34,14 @@ using std::string_view; namespace { constexpr string_view cCompressedFilePath{"test_streaming_compressed_file.bin"}; constexpr size_t cBufferSize{128L * 1024 * 1024}; // 128MB -// Interleave no-ops to ensure the integrity of the compressor states at all times. 
constexpr auto cCompressionChunkSizes = std::to_array( - {cBufferSize / 100, - 0, + {0, // no-op + cBufferSize / 100, cBufferSize / 50, - 0, cBufferSize / 25, - 0, cBufferSize / 10, - 0, cBufferSize / 5, - 0, cBufferSize / 2, - 0, cBufferSize} ); From 524fe1d1cbd30d4b9839dfe58428b2590c1a102c Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Sun, 22 Dec 2024 17:21:00 -0500 Subject: [PATCH 65/65] Change all instances of programming-error-induced error codes to ErrorCode_Failure --- .../core/src/clp/streaming_compression/lzma/Compressor.cpp | 6 +++--- components/core/tests/test-StreamingCompression.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp index a9fa0d5a0..34c1a0e2b 100644 --- a/components/core/src/clp/streaming_compression/lzma/Compressor.cpp +++ b/components/core/src/clp/streaming_compression/lzma/Compressor.cpp @@ -26,7 +26,7 @@ auto Compressor::open(FileWriter& file_writer) -> void { m_compressed_stream_block_buffer.size() )) { - throw OperationFailed(ErrorCode_NoMem, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } m_compressed_stream_file_writer = &file_writer; m_uncompressed_stream_pos = 0; @@ -38,7 +38,7 @@ auto Compressor::close() -> void { } if (m_lzma_stream.avail_in() > 0) { - throw OperationFailed(ErrorCode_Corrupt, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } flush_lzma(LZMA_FINISH); @@ -153,7 +153,7 @@ auto Compressor::flush_stream_output_block_buffer() -> void { m_compressed_stream_block_buffer.size() )) { - throw OperationFailed(ErrorCode_NoMem, __FILENAME__, __LINE__); + throw OperationFailed(ErrorCode_Failure, __FILENAME__, __LINE__); } } diff --git a/components/core/tests/test-StreamingCompression.cpp b/components/core/tests/test-StreamingCompression.cpp index 22582cff2..9f0df9306 100644 --- 
a/components/core/tests/test-StreamingCompression.cpp +++ b/components/core/tests/test-StreamingCompression.cpp @@ -35,7 +35,7 @@ namespace { constexpr string_view cCompressedFilePath{"test_streaming_compressed_file.bin"}; constexpr size_t cBufferSize{128L * 1024 * 1024}; // 128MB constexpr auto cCompressionChunkSizes = std::to_array( - {0, // no-op + {0, cBufferSize / 100, cBufferSize / 50, cBufferSize / 25,