diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 7f6c698..0fbae57 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,17 +1,19 @@ if (NOT DEFINED ENV{ALP_DATASET_DIR_PATH}) message(WARNING "Set ALP_DATASET_DIR_PATH environment variable") + message(WARNING "Set HURRICANE_ISABEL_DATASET_DIR_PATH environment variable") else () endif () add_subdirectory(bench_speed) - -add_executable(bench_alp_compression_ratio bench_compression_ratio.cpp) -target_link_libraries(bench_alp_compression_ratio PUBLIC ALP gtest_main) -gtest_discover_tests(bench_alp_compression_ratio) - +add_executable(test_compression_ratio test_compression_ratio.cpp) +target_link_libraries(test_compression_ratio PUBLIC ALP gtest_main) +gtest_discover_tests(test_compression_ratio) +add_executable(bench_your_dataset bench_your_dataset.cpp) +target_link_libraries(bench_your_dataset PUBLIC ALP gtest_main) +gtest_discover_tests(bench_your_dataset) diff --git a/benchmarks/bench_compression_ratio.cpp b/benchmarks/bench_compression_ratio.cpp deleted file mode 100644 index 9df63a9..0000000 --- a/benchmarks/bench_compression_ratio.cpp +++ /dev/null @@ -1,274 +0,0 @@ -#include "alp.hpp" -#include "alp_result.hpp" -#include "data.hpp" -#include "test/mapper.hpp" -#include "gtest/gtest.h" -#include - -// ALP overhead per vector : bit_width + factor-idx + exponent-idx + ffor base; -template -PT get_overhead_per_vector() { - return static_cast(8 + // bit_width - 8 + // factor-idx - 8 + // exponent-idx - (sizeof(PT) * 8)) // ffor base - / alp::config::VECTOR_SIZE; -}; - -std::string get_alp_scheme_string(alp::Scheme& scheme) { - switch (scheme) { - case alp::Scheme::ALP: - return "ALP_PDE"; - case alp::Scheme::ALP_RD: - return "ALP_RD"; - default: - return "INVALID"; - } -} - -class ALPBench : public ::testing::Test { -public: - ~ALPBench() override = default; - - template - void bench_alp_compression_ratio(std::array columns, - const std::string& result_file_path) { - - // 
Internal Type - using UT = typename alp::inner_t::ut; - using ST = typename alp::inner_t::st; - - // init - PT* sample_buf {}; - PT* intput_buf {}; - PT* exc_arr {}; - uint16_t* rd_exc_arr {}; - uint16_t* pos_arr {}; - uint16_t* exc_c_arr {}; - ST* ffor_buf {}; - ST* unffor_arr {}; - ST* base_buf {}; - ST* encoded_buf {}; - PT* decoded_buf {}; - UT* ffor_right_buf {}; - uint16_t* ffor_left_arr {}; - UT* right_buf {}; - uint16_t* left_arr {}; - UT* unffor_right_buf {}; - uint16_t* unffor_left_arr {}; - PT* glue_buf {}; - uint8_t bit_width {}; - - // allocate - intput_buf = new PT[alp::config::VECTOR_SIZE]; - exc_arr = new PT[alp::config::VECTOR_SIZE]; - rd_exc_arr = new uint16_t[alp::config::VECTOR_SIZE]; - pos_arr = new uint16_t[alp::config::VECTOR_SIZE]; - encoded_buf = new ST[alp::config::VECTOR_SIZE]; - decoded_buf = new PT[alp::config::VECTOR_SIZE]; - exc_c_arr = new uint16_t[alp::config::VECTOR_SIZE]; - ffor_buf = new ST[alp::config::VECTOR_SIZE]; - unffor_arr = new ST[alp::config::VECTOR_SIZE]; - base_buf = new ST[alp::config::VECTOR_SIZE]; - sample_buf = new PT[alp::config::VECTOR_SIZE]; - right_buf = new UT[alp::config::VECTOR_SIZE]; - left_arr = new uint16_t[alp::config::VECTOR_SIZE]; - ffor_right_buf = new UT[alp::config::VECTOR_SIZE]; - ffor_left_arr = new uint16_t[alp::config::VECTOR_SIZE]; - unffor_right_buf = new UT[alp::config::VECTOR_SIZE]; - unffor_left_arr = new uint16_t[alp::config::VECTOR_SIZE]; - glue_buf = new PT[alp::config::VECTOR_SIZE]; - - std::ofstream ofile(result_file_path, std::ios::out); - ofile << "idx,dataset,size,alp_scheme,rowgroups_count,vectors_count\n"; - - for (auto& dataset : columns) { - std::cout << dataset.name << std::endl; - - std::vector data; - alp_data::read_data(data, dataset.csv_file_path, dataset.binary_file_path); - PT* data_column = data.data(); - size_t n_tuples = data.size(); - - std::vector compression_metadata; - PT value_to_encode {0.0}; - size_t vector_idx {0}; - size_t rowgroup_counter {0}; - size_t 
rowgroup_offset {0}; - alp::state stt; - size_t rowgroups_count = std::ceil(static_cast(n_tuples) / alp::config::ROWGROUP_SIZE); - size_t vectors_count = n_tuples / alp::config::VECTOR_SIZE; - - /* Init */ - alp::encoder::init(data_column, rowgroup_offset, n_tuples, sample_buf, stt); - - double compression_ratio {0}; - switch (stt.scheme) { - case alp::Scheme::ALP_RD: { - alp::rd_encoder::init(data_column, rowgroup_offset, n_tuples, sample_buf, stt); - - /* Encode - Decode - Validate. */ - for (size_t i = 0; i < n_tuples; i++) { - value_to_encode = data_column[i]; - intput_buf[vector_idx] = value_to_encode; - vector_idx = vector_idx + 1; - rowgroup_offset = rowgroup_offset + 1; - rowgroup_counter = rowgroup_counter + 1; - - if (vector_idx != alp::config::VECTOR_SIZE) { continue; } - - if (rowgroup_counter == alp::config::ROWGROUP_SIZE) { - rowgroup_counter = 0; - alp::encoder::init(data_column, rowgroup_offset, n_tuples, sample_buf, stt); - } - - // Encode - alp::rd_encoder::encode(intput_buf, rd_exc_arr, pos_arr, exc_c_arr, right_buf, left_arr, stt); - ffor::ffor(right_buf, ffor_right_buf, stt.right_bit_width, &stt.right_for_base); - ffor::ffor(left_arr, ffor_left_arr, stt.left_bit_width, &stt.left_for_base); - - // Decode - unffor::unffor(ffor_right_buf, unffor_right_buf, stt.right_bit_width, &stt.right_for_base); - unffor::unffor(ffor_left_arr, unffor_left_arr, stt.left_bit_width, &stt.left_for_base); - alp::rd_encoder::decode( - glue_buf, unffor_right_buf, unffor_left_arr, rd_exc_arr, pos_arr, exc_c_arr, stt); - - auto* dbl_glue_arr = reinterpret_cast(glue_buf); - for (size_t j = 0; j < alp::config::VECTOR_SIZE; ++j) { - auto l = intput_buf[j]; - auto r = dbl_glue_arr[j]; - if (l != r) { std::cerr << j << ", " << dataset.name << "\n"; } - - ASSERT_EQ(intput_buf[j], dbl_glue_arr[j]); - } - - alp_bench::VectorMetadata vector_metadata; - vector_metadata.right_bit_width = stt.right_bit_width; - vector_metadata.left_bit_width = stt.left_bit_width; - 
vector_metadata.exceptions_count = stt.exceptions_count; - - compression_metadata.push_back(vector_metadata); - vector_idx = 0; - bit_width = 0; - - vectors_count = vectors_count + 1; - } - - compression_ratio = calculate_alprd_compression_size(compression_metadata); - - } break; - case alp::Scheme::ALP: { - /* Encode - Decode - Validate. */ - for (size_t i = 0; i < n_tuples; i++) { - value_to_encode = data_column[i]; - intput_buf[vector_idx] = value_to_encode; - vector_idx = vector_idx + 1; - rowgroup_offset = rowgroup_offset + 1; - rowgroup_counter = rowgroup_counter + 1; - - if (vector_idx != alp::config::VECTOR_SIZE) { continue; } - if (rowgroup_counter == alp::config::ROWGROUP_SIZE) { - rowgroup_counter = 0; - rowgroups_count = rowgroups_count + 1; - alp::encoder::init(data_column, rowgroup_offset, n_tuples, sample_buf, stt); - } - alp::encoder::encode(intput_buf, exc_arr, pos_arr, exc_c_arr, encoded_buf, stt); - alp::encoder::analyze_ffor(encoded_buf, bit_width, base_buf); - ffor::ffor(encoded_buf, ffor_buf, bit_width, base_buf); - - unffor::unffor(ffor_buf, unffor_arr, bit_width, base_buf); - alp::decoder::decode(unffor_arr, stt.fac, stt.exp, decoded_buf); - alp::decoder::patch_exceptions(decoded_buf, exc_arr, pos_arr, exc_c_arr); - - for (size_t j = 0; j < alp::config::VECTOR_SIZE; j++) { - auto l = intput_buf[j]; - auto r = decoded_buf[j]; - if (l != r) { std::cerr << j << ", " << rowgroup_offset << ", " << dataset.name << "\n"; } - ASSERT_EQ(intput_buf[j], decoded_buf[j]); - } - compression_metadata.push_back({bit_width, exc_c_arr[0]}); - vector_idx = 0; - bit_width = 0; - } - compression_ratio = calculate_alp_compression_size(compression_metadata); - } break; - default: - ASSERT_TRUE(false); - } - - ofile << std::fixed << std::setprecision(2) << dataset.id << "," << dataset.name << "," << compression_ratio - << "," << get_alp_scheme_string(stt.scheme) << "," << rowgroups_count << "," << vectors_count - << std::endl; - - if 
(alp_bench::results.find(dataset.name) != - alp_bench::results.end()) { // To avoid error when tested dataset is not found on results - ASSERT_EQ(alp_bench::to_str(compression_ratio), alp_bench::results.find(dataset.name)->second); - } - } - delete[] sample_buf; - delete[] intput_buf; - delete[] exc_arr; - delete[] rd_exc_arr; - delete[] pos_arr; - delete[] encoded_buf; - delete[] decoded_buf; - delete[] exc_c_arr; - delete[] ffor_buf; - delete[] unffor_arr; - delete[] base_buf; - delete[] right_buf; - delete[] left_arr; - delete[] unffor_right_buf; - delete[] unffor_left_arr; - } - - template - double calculate_alp_compression_size(std::vector& vector_metadata) { - double avg_bits_per_value {0}; - for (auto& metadata : vector_metadata) { - avg_bits_per_value = avg_bits_per_value + metadata.bit_width; - avg_bits_per_value = - avg_bits_per_value + - (static_cast(metadata.exceptions_count) * - (alp::Constants::EXCEPTION_SIZE + alp::EXCEPTION_POSITION_SIZE) / alp::config::VECTOR_SIZE); - } - - avg_bits_per_value = avg_bits_per_value / vector_metadata.size(); - avg_bits_per_value = avg_bits_per_value + get_overhead_per_vector(); - return avg_bits_per_value; - } - - double alprd_overhead_per_vector {static_cast(alp::config::MAX_RD_DICTIONARY_SIZE * 16) / - alp::config::ROWGROUP_SIZE}; - - double calculate_alprd_compression_size(std::vector& vector_metadata) { - double avg_bits_per_value {0}; - for (auto& metadata : vector_metadata) { - avg_bits_per_value = avg_bits_per_value + metadata.right_bit_width + metadata.left_bit_width + - static_cast(metadata.exceptions_count * - (alp::RD_EXCEPTION_SIZE + alp::RD_EXCEPTION_POSITION_SIZE)) / - alp::config::VECTOR_SIZE; - } - - avg_bits_per_value = avg_bits_per_value / vector_metadata.size(); - avg_bits_per_value = avg_bits_per_value + alprd_overhead_per_vector; - - return avg_bits_per_value; - } -}; - -TEST_F(ALPBench, bench_alp_on_alp_dataset) { - std::string result_path = alp_bench::get_paths().alp_result_dir_path + 
"compression_ratio/double/alp_dataset.csv"; - bench_alp_compression_ratio(alp_bench::get_alp_dataset(), result_path); -} - -TEST_F(ALPBench, bench_alp_on_sp_dataset) { - std::string result_path = alp_bench::get_paths().alp_result_dir_path + "compression_ratio/float/sp_dataset.csv"; - bench_alp_compression_ratio(alp_bench::get_sp_datasets(), result_path); -} - -TEST_F(ALPBench, bench_alp_on_hurricane_isabel) { - auto result_path = - alp_bench::get_paths().alp_result_dir_path + "compression_ratio/float/hurricane_isabel_dataset.csv"; - bench_alp_compression_ratio(alp_bench::get_hurricane_isabel_dataset(), result_path); -} diff --git a/benchmarks/bench_speed/bench_alp_cutter_decode.cpp b/benchmarks/bench_speed/bench_alp_cutter_decode.cpp index fe25141..1ee787b 100644 --- a/benchmarks/bench_speed/bench_alp_cutter_decode.cpp +++ b/benchmarks/bench_speed/bench_alp_cutter_decode.cpp @@ -16,7 +16,7 @@ static __attribute__((noinline)) benchmark::BenchmarkReporter::Run b_a_e(const d int64_t* ffor_arr, int64_t* base_arr, alp::state& stt, - alp_bench::Column& dataset, + alp_bench::ALPColumnDescriptor& dataset, uint64_t* ffor_right_arr, uint16_t* ffor_left_arr, uint64_t* right_arr, diff --git a/benchmarks/bench_speed/bench_alp_cutter_encode.cpp b/benchmarks/bench_speed/bench_alp_cutter_encode.cpp index 07f0d1b..d3341d6 100644 --- a/benchmarks/bench_speed/bench_alp_cutter_encode.cpp +++ b/benchmarks/bench_speed/bench_alp_cutter_encode.cpp @@ -15,7 +15,7 @@ static __attribute__((noinline)) benchmark::BenchmarkReporter::Run b_a_e(const d int64_t* ffor_arr, int64_t* base_arr, alp::state& stt, - alp_bench::Column& dataset, + alp_bench::ALPColumnDescriptor& dataset, uint64_t* ffor_right_arr, uint16_t* ffor_left_arr, uint64_t* right_arr, diff --git a/benchmarks/bench_speed/bench_alp_encode.cpp b/benchmarks/bench_speed/bench_alp_encode.cpp index fa7c507..afd53a7 100644 --- a/benchmarks/bench_speed/bench_alp_encode.cpp +++ b/benchmarks/bench_speed/bench_alp_encode.cpp @@ -12,7 +12,7 @@ 
static __attribute__((noinline)) benchmark::BenchmarkReporter::Run b_a_e(const d int64_t* ffor_arr, int64_t* base_arr, alp::state& stt, - alp_bench::Column& dataset) { + alp_bench::ALPColumnDescriptor& dataset) { int benchmark_number = dataset.id; diff --git a/benchmarks/bench_speed/bench_alp_without_sampling.cpp b/benchmarks/bench_speed/bench_alp_without_sampling.cpp index a8fcd11..eec9633 100644 --- a/benchmarks/bench_speed/bench_alp_without_sampling.cpp +++ b/benchmarks/bench_speed/bench_alp_without_sampling.cpp @@ -16,7 +16,7 @@ static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_enc int64_t* ffor_arr, int64_t* base_arr, alp::state& stt, - alp_bench::Column& dataset) { + alp_bench::ALPColumnDescriptor& dataset) { int benchmark_number = dataset.id; @@ -48,7 +48,7 @@ static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_enc int64_t* encoded_arr, uint8_t fac, uint8_t exp, - alp_bench::Column& dataset, + alp_bench::ALPColumnDescriptor& dataset, uint8_t& bw, int64_t* base_arr, int64_t* ffor_arr) { diff --git a/benchmarks/bench_your_dataset.cpp b/benchmarks/bench_your_dataset.cpp new file mode 100644 index 0000000..d6f01c3 --- /dev/null +++ b/benchmarks/bench_your_dataset.cpp @@ -0,0 +1,3 @@ +#include "benchmark.hpp" + +TEST_F(ALPBench, bench_alp_on_your_dataset) { bench_dataset(); } \ No newline at end of file diff --git a/benchmarks/include/alp_result.hpp b/benchmarks/include/alp_result.hpp index 90083d2..9d188d3 100644 --- a/benchmarks/include/alp_result.hpp +++ b/benchmarks/include/alp_result.hpp @@ -19,6 +19,7 @@ struct VectorMetadata { uint64_t right_bit_width {0}; uint64_t left_bit_width {0}; std::vector> repetition_vec; + alp::Scheme scheme; }; inline std::string to_str(double val) { diff --git a/benchmarks/include/benchmark.hpp b/benchmarks/include/benchmark.hpp new file mode 100644 index 0000000..bcdd126 --- /dev/null +++ b/benchmarks/include/benchmark.hpp @@ -0,0 +1,432 @@ +#include "alp.hpp" +#include 
"alp_result.hpp" +#include "data.hpp" +#include "gtest/gtest.h" +#include + +#ifndef ALP_BENCH_ALP_HPP +#define ALP_BENCH_ALP_HPP + +using namespace alp::config; +using namespace alp_bench; + +// ALP overhead per vector : bit_width + factor-idx + exponent-idx + ffor base; +template +PT get_overhead_per_vector() { + return static_cast(8 + // bit_width + 8 + // factor-idx + 8 + // exponent-idx + (sizeof(PT) * 8)) // ffor base + / VECTOR_SIZE; +}; + +std::string get_alp_scheme_string(alp::Scheme& scheme) { + switch (scheme) { + case alp::Scheme::ALP: + return "ALP_PDE"; + case alp::Scheme::ALP_RD: + return "ALP_RD"; + default: + return "INVALID"; + } +} + +template +PT* get_data(size_t rg_idx, PT* data_column, size_t vector_idx) { + size_t offset = (rg_idx * N_VECTORS_PER_ROWGROUP + vector_idx) * VECTOR_SIZE; + PT* data_p = data_column + offset; + return data_p; +} + +template +PT* get_data(size_t rg_idx, PT* data_column) { + size_t offset = (rg_idx * N_VECTORS_PER_ROWGROUP) * VECTOR_SIZE; + PT* data_p = data_column + offset; + return data_p; +} + +// Mapping Definitions +std::unordered_map data_type_map = { + {"invalid", DataType::INVALID}, + {"double", DataType::DOUBLE}, + {"float", DataType::FLOAT}, +}; + +std::unordered_map file_type_map = { + {"invalid", FileType::INVALID}, + {"binary", FileType::BINARY}, + {"csv", FileType::CSV}, +}; + +// Helper Function to Convert String to Lowercase +std::string to_lower(const std::string& input) { + std::string result = input; + std::transform(result.begin(), result.end(), result.begin(), + [](unsigned char c) { return std::tolower(c); }); + return result; +} + +// Parse Function +std::vector parse_column_records(const std::string& filename) { + std::vector records; + std::ifstream file(filename); + + if (!file.is_open()) { + throw std::runtime_error("Could not open file: " + filename); + } + + std::string line; + // Skip the header line + std::getline(file, line); + + while (std::getline(file, line)) { + std::istringstream 
stream(line); + std::string id, column_name, data_type, path, file_type; + + // Parse each column + std::getline(stream, id, ','); + std::getline(stream, column_name, ','); + std::getline(stream, data_type, ','); + std::getline(stream, path, ','); + std::getline(stream, file_type, ','); + + // Convert strings to lowercase and map to enums + DataType data_type_enum = data_type_map.count(to_lower(data_type)) ? data_type_map[to_lower(data_type)] + : DataType::INVALID; + FileType file_type_enum = file_type_map.count(to_lower(file_type)) ? file_type_map[to_lower(file_type)] + : FileType::INVALID; + + // Add to records + records.push_back({ + std::stoi(id), // Convert ID to integer + data_type_enum, // Data type enum + column_name, // Column name + path, // File path + file_type_enum // File type enum + }); + } + + file.close(); + return records; +} + +template +std::string get_type_string() { + if constexpr (std::is_same_v) { + return "double"; + } else if constexpr (std::is_same_v) { + return "float"; + } else { + throw std::runtime_error("not supported!"); + } +} + +void write_result_header(std::ofstream& ofile) { ofile << "idx,column,data_type,size,rowgroups_count,vectors_count\n"; } + +template +ColumnDescriptor extract_column_descriptor(const ALPColumnDescriptor& alp_column) { + if (alp_column.csv_file_path.empty() && alp_column.binary_file_path.empty()) { + throw std::invalid_argument("Both csv_file_path and binary_file_path cannot be empty."); + } + + // Determine the file path and file type + std::string path = !alp_column.binary_file_path.empty() ? alp_column.binary_file_path : alp_column.csv_file_path; + FileType file_type = !alp_column.binary_file_path.empty() ? 
FileType::BINARY : FileType::CSV; + + // Infer the data type from PT + DataType data_type; + if constexpr (std::is_same_v) { + data_type = DataType::FLOAT; + } else if constexpr (std::is_same_v) { + data_type = DataType::DOUBLE; + } else { + data_type = DataType::INVALID; + } + + // Create and return the ColumnDescriptor + return ColumnDescriptor { + static_cast(alp_column.id), // Convert ID to int + data_type, // Inferred data type + alp_column.name, // Column name + path, // File path + file_type // Inferred file type + }; +} + +class ALPBench : public ::testing::Test { +public: + uint64_t* sample_buf {}; + uint64_t* exc_buf {}; + uint16_t* rd_exc_buf {}; + uint16_t* pos_buf {}; + uint16_t* exc_c_buf {}; + uint64_t* ffor_buf {}; + uint64_t* unffor_buf {}; + uint64_t* base_buf {}; + uint64_t* encoded_buf {}; + uint64_t* decoded_buf {}; + uint64_t* ffor_right_buf {}; + uint16_t* ffor_left_buf {}; + uint64_t* right_buf {}; + uint16_t* left_buf {}; + uint64_t* unffor_right_buf {}; + uint16_t* unffor_left_buf {}; + uint64_t* glue_buf {}; + uint8_t bit_width {}; + +public: + ~ALPBench() override = default; + + void SetUp() override { + exc_buf = new uint64_t[VECTOR_SIZE]; + rd_exc_buf = new uint16_t[VECTOR_SIZE]; + pos_buf = new uint16_t[VECTOR_SIZE]; + encoded_buf = new uint64_t[VECTOR_SIZE]; + decoded_buf = new uint64_t[VECTOR_SIZE]; + exc_c_buf = new uint16_t[VECTOR_SIZE]; + ffor_buf = new uint64_t[VECTOR_SIZE]; + unffor_buf = new uint64_t[VECTOR_SIZE]; + base_buf = new uint64_t[VECTOR_SIZE]; + sample_buf = new uint64_t[VECTOR_SIZE]; + right_buf = new uint64_t[VECTOR_SIZE]; + left_buf = new uint16_t[VECTOR_SIZE]; + ffor_right_buf = new uint64_t[VECTOR_SIZE]; + ffor_left_buf = new uint16_t[VECTOR_SIZE]; + unffor_right_buf = new uint64_t[VECTOR_SIZE]; + unffor_left_buf = new uint16_t[VECTOR_SIZE]; + glue_buf = new uint64_t[VECTOR_SIZE]; + } + + void TearDown() override { + delete[] sample_buf; + delete[] exc_buf; + delete[] rd_exc_buf; + delete[] pos_buf; + delete[] 
encoded_buf; + delete[] decoded_buf; + delete[] exc_c_buf; + delete[] ffor_buf; + delete[] unffor_buf; + delete[] base_buf; + delete[] right_buf; + delete[] left_buf; + delete[] unffor_right_buf; + delete[] unffor_left_buf; + /* ffor_right_buf, ffor_left_buf and glue_buf are allocated in SetUp as well and must be released here. */ + delete[] ffor_right_buf; + delete[] ffor_left_buf; + delete[] glue_buf; + } + + template + void typed_bench_column(const ColumnDescriptor& column, std::ofstream& ofile) { + // Internal Type + using UT = typename alp::inner_t::ut; + using ST = typename alp::inner_t::st; + + PT* sample_arr = reinterpret_cast(sample_buf); + PT* exc_arr = reinterpret_cast(exc_buf); + uint16_t* rd_exc_arr = reinterpret_cast(rd_exc_buf); + uint16_t* pos_arr = reinterpret_cast(pos_buf); + uint16_t* exc_c_arr = reinterpret_cast(exc_c_buf); + ST* ffor_arr = reinterpret_cast(ffor_buf); + ST* unffor_arr = reinterpret_cast(unffor_buf); + ST* base_arr = reinterpret_cast(base_buf); + ST* encoded_arr = reinterpret_cast(encoded_buf); + PT* decoded_arr = reinterpret_cast(decoded_buf); + UT* ffor_right_arr = reinterpret_cast(ffor_right_buf); + uint16_t* ffor_left_arr = reinterpret_cast(ffor_left_buf); + UT* right_arr = reinterpret_cast(right_buf); + uint16_t* left_arr = reinterpret_cast(left_buf); + UT* unffor_right_arr = reinterpret_cast(unffor_right_buf); + uint16_t* unffor_left_arr = reinterpret_cast(unffor_left_buf); + PT* glue_arr = reinterpret_cast(glue_buf); + + std::cout << column.name << std::endl; + + // read data + std::vector data; + alp_data::read_data(data, column); + PT* data_column = data.data(); + size_t n_tuples = data.size(); + + size_t n_vecs = n_tuples / VECTOR_SIZE; + auto n_rowgroups = static_cast(std::ceil(static_cast(n_tuples) / ROWGROUP_SIZE)); + std::vector compression_metadata; + PT value_to_encode {0.0}; + size_t rowgroup_counter {0}; + alp::state stt; + + /* Encode - Decode - Validate. */ + double compression_ratio {0}; + for (size_t rg_idx = 0; rg_idx < n_rowgroups; rg_idx++) { + /* Init */ + PT* cur_rg_p = get_data(rg_idx, data_column); + + auto n_vec_per_current_rg = + (rg_idx == n_rowgroups - 1) ? 
n_vecs - rg_idx * N_VECTORS_PER_ROWGROUP : N_VECTORS_PER_ROWGROUP; + /* The last rowgroup holds the vectors left over after the preceding full rowgroups. A plain n_vecs % N_VECTORS_PER_ROWGROUP yields 0 when n_vecs is an exact multiple of N_VECTORS_PER_ROWGROUP and would silently skip that rowgroup. Assumes ROWGROUP_SIZE == N_VECTORS_PER_ROWGROUP * VECTOR_SIZE - TODO confirm. */ + if (n_vec_per_current_rg == 0) { continue; } /* only a trailing partial vector remains: nothing to encode */ + auto n_values_per_current_rg = n_vec_per_current_rg * VECTOR_SIZE; + alp::encoder::init(cur_rg_p, rg_idx, n_values_per_current_rg, sample_arr, stt); + + switch (stt.scheme) { + case alp::Scheme::ALP_RD: { + alp::rd_encoder::init(cur_rg_p, 0, n_values_per_current_rg, sample_arr, stt); + for (size_t vector_idx {0}; vector_idx < n_vec_per_current_rg; vector_idx++) { + PT* cur_vec_p = get_data(rg_idx, data_column, vector_idx); + + // Encode + alp::rd_encoder::encode(cur_vec_p, rd_exc_arr, pos_arr, exc_c_arr, right_arr, left_arr, stt); + ffor::ffor(right_arr, ffor_right_arr, stt.right_bit_width, &stt.right_for_base); + ffor::ffor(left_arr, ffor_left_arr, stt.left_bit_width, &stt.left_for_base); + + // Decode + unffor::unffor(ffor_right_arr, unffor_right_arr, stt.right_bit_width, &stt.right_for_base); + unffor::unffor(ffor_left_arr, unffor_left_arr, stt.left_bit_width, &stt.left_for_base); + alp::rd_encoder::decode( + glue_arr, unffor_right_arr, unffor_left_arr, rd_exc_arr, pos_arr, exc_c_arr, stt); + + auto* dbl_glue_arr = reinterpret_cast(glue_arr); + for (size_t j = 0; j < VECTOR_SIZE; ++j) { + auto l = cur_vec_p[j]; + auto r = dbl_glue_arr[j]; + if (l != r) { std::cerr << j << ", " << column.name << "\n"; } + + ASSERT_EQ(cur_vec_p[j], dbl_glue_arr[j]); + } + + alp_bench::VectorMetadata vector_metadata; + vector_metadata.right_bit_width = stt.right_bit_width; + vector_metadata.left_bit_width = stt.left_bit_width; + vector_metadata.exceptions_count = stt.exceptions_count; + vector_metadata.scheme = alp::Scheme::ALP_RD; + + compression_metadata.push_back(vector_metadata); + } + } break; + case alp::Scheme::ALP: { + /* Encode - Decode - Validate. 
*/ + for (size_t vector_idx {0}; vector_idx < n_vec_per_current_rg; vector_idx++) { + PT* data_p = get_data(rg_idx, data_column, vector_idx); + + alp::encoder::encode(data_p, exc_arr, pos_arr, exc_c_arr, encoded_arr, stt); + alp::encoder::analyze_ffor(encoded_arr, bit_width, base_arr); + ffor::ffor(encoded_arr, ffor_arr, bit_width, base_arr); + + unffor::unffor(ffor_arr, unffor_arr, bit_width, base_arr); + alp::decoder::decode(unffor_arr, stt.fac, stt.exp, decoded_arr); + alp::decoder::patch_exceptions(decoded_arr, exc_arr, pos_arr, exc_c_arr); + + for (size_t j = 0; j < VECTOR_SIZE; j++) { + auto l = data_p[j]; + auto r = decoded_arr[j]; + if (l != r) { std::cerr << j << ", " << rg_idx << ", " << column.name << "\n"; } + ASSERT_EQ(data_p[j], decoded_arr[j]); + } + + alp_bench::VectorMetadata vector_metadata; + vector_metadata.bit_width = bit_width; + vector_metadata.exceptions_count = exc_c_arr[0]; + vector_metadata.scheme = alp::Scheme::ALP; + + compression_metadata.push_back(vector_metadata); + bit_width = 0; + } + + } break; + default: + ASSERT_TRUE(false); + } + } + + compression_ratio = calculate_alp_compression_size(compression_metadata); + ofile << std::fixed << std::setprecision(2) << column.id << "," << column.name << "," << get_type_string() + << "," << compression_ratio << "," << n_rowgroups << "," << n_vecs << std::endl; + } + + template + void typed_bench_dataset(std::array columns, + const std::string& result_file_path) { + + std::ofstream ofile(result_file_path, std::ios::out); + write_result_header(ofile); + + for (auto& alp_column_descriptor : columns) { + auto column_descriptor = extract_column_descriptor(alp_column_descriptor); + typed_bench_column(column_descriptor, ofile); + } + } + + template + double calculate_alp_compression_size(std::vector& vector_metadatas) { + double avg_bits_per_value {0}; + for (auto& vector_metadata : vector_metadatas) { + switch (vector_metadata.scheme) { + case alp::Scheme::ALP_RD: { + avg_bits_per_value += 
calculate_alprd_compression_size(vector_metadata); + break; + } + case alp::Scheme::ALP: { + + avg_bits_per_value += calculate_alp_pde_compression_size(vector_metadata); + break; + } + default: + throw std::runtime_error("not scheme is chosen"); + } + } + + avg_bits_per_value = avg_bits_per_value / vector_metadatas.size(); + return avg_bits_per_value; + } + + template + double calculate_alp_pde_compression_size(alp_bench::VectorMetadata& vector_metadata) { + double avg_bits_per_value {0}; + avg_bits_per_value = avg_bits_per_value + vector_metadata.bit_width; + avg_bits_per_value = avg_bits_per_value + + (static_cast(vector_metadata.exceptions_count) * + (alp::Constants::EXCEPTION_SIZE + alp::EXCEPTION_POSITION_SIZE) / VECTOR_SIZE); + + avg_bits_per_value = avg_bits_per_value + get_overhead_per_vector(); + return avg_bits_per_value; + } + + double alprd_overhead_per_vector {static_cast(MAX_RD_DICTIONARY_SIZE * 16) / ROWGROUP_SIZE}; + + double calculate_alprd_compression_size(alp_bench::VectorMetadata& vector_metadata) { + double avg_bits_per_value {0}; + avg_bits_per_value = avg_bits_per_value + vector_metadata.right_bit_width + vector_metadata.left_bit_width + + static_cast(vector_metadata.exceptions_count * + (alp::RD_EXCEPTION_SIZE + alp::RD_EXCEPTION_POSITION_SIZE)) / + VECTOR_SIZE; + + avg_bits_per_value = avg_bits_per_value + alprd_overhead_per_vector; + return avg_bits_per_value; + } + + void bench_dataset() { + const std::string dataset_description_file = + ALP_CMAKE_SOURCE_DIR "/benchmarks/your_own_dataset.csv"; // Replace with your CSV file path + + const std::string result_file_path = + ALP_CMAKE_SOURCE_DIR "/benchmarks/your_own_dataset_result.csv"; // Replace with your CSV file path + + std::ofstream ofile(result_file_path, std::ios::out); + write_result_header(ofile); + + std::vector columns = parse_column_records(dataset_description_file); + for (const auto& column : columns) { + switch (column.data_type) { + case DataType::DOUBLE: { + 
typed_bench_column(column, ofile); + break; + } + case DataType::FLOAT: { + typed_bench_column(column, ofile); + break; + } + case DataType::INVALID: + default: + throw std::runtime_error("NOT supported type."); + } + } + } +}; + +#endif // ALP_BENCH_ALP_HPP diff --git a/benchmarks/result/compression_ratio/double/alp_dataset.csv b/benchmarks/result/compression_ratio/double/alp_dataset.csv index 2ef92bf..d4d367d 100644 --- a/benchmarks/result/compression_ratio/double/alp_dataset.csv +++ b/benchmarks/result/compression_ratio/double/alp_dataset.csv @@ -1,31 +1,31 @@ -idx,dataset,size,alp_scheme,rowgroups_count,vectors_count -1,Air-Pressure,16.43,ALP_PDE,2689,134493 -2,Arade/4,24.94,ALP_PDE,193,9657 -3,Basel-Temp,30.72,ALP_PDE,3,120 -4,Basel-Wind,29.81,ALP_PDE,3,120 -5,Bird-Mig,20.14,ALP_PDE,1,17 -6,Btc-Price,26.37,ALP_PDE,1,2 -7,Blockchain,36.49,ALP_PDE,5,225 -8,City-Temp,10.74,ALP_PDE,57,2837 -9,CMS/1,35.65,ALP_PDE,363,18140 -10,CMS/9,11.67,ALP_PDE,363,18140 -11,CMS/25,41.11,ALP_PDE,363,18140 -12,Dew-Temp,13.40,ALP_PDE,105,5287 -13,Bio-Temp,10.75,ALP_PDE,7437,371892 -14,Food-prices,23.65,ALP_PDE,41,2002 -15,Gov/10,30.99,ALP_PDE,2757,137816 -16,Gov/26,0.41,ALP_PDE,2757,137816 -17,Gov/30,7.48,ALP_PDE,2757,137816 -18,Gov/31,3.05,ALP_PDE,2757,137816 -19,Gov/40,0.83,ALP_PDE,2757,137816 -20,Medicare/1,39.35,ALP_PDE,181,9070 -21,Medicare/9,12.26,ALP_PDE,181,9070 -22,PM10-dust,8.56,ALP_PDE,5,216 -23,NYC/29,40.38,ALP_PDE,341,17037 -24,POI-lat,55.74,ALP_RD,5,828 -25,POI-lon,56.56,ALP_RD,5,828 -26,SD-bench,16.21,ALP_PDE,1,8 -27,Stocks-DE,11.01,ALP_PDE,851,42544 -28,Stocks-UK,12.59,ALP_PDE,1159,57915 -29,Stocks-USA,7.90,ALP_PDE,5509,275465 -30,Wind-dir,15.89,ALP_PDE,3885,194237 +idx,column,data_type,size,rowgroups_count,vectors_count +1,Air-Pressure,double,16.43,1345,134493 +2,Arade/4,double,24.94,97,9657 +3,Basel-Temp,double,29.66,2,120 +4,Basel-Wind,double,29.81,2,120 +5,Bird-Mig,double,20.14,1,17 +6,Btc-Price,double,26.37,1,2 +7,Blockchain,double,36.22,3,225 
+8,City-Temp,double,10.74,29,2837 +9,CMS/1,double,35.66,182,18140 +10,CMS/9,double,11.68,182,18140 +11,CMS/25,double,41.11,182,18140 +12,Dew-Temp,double,13.39,53,5287 +13,Bio-Temp,double,10.75,3719,371892 +14,Food-prices,double,23.65,21,2002 +15,Gov/10,double,31.00,1379,137816 +16,Gov/26,double,0.41,1379,137816 +17,Gov/30,double,7.48,1379,137816 +18,Gov/31,double,3.06,1379,137816 +19,Gov/40,double,0.85,1379,137816 +20,Medicare/1,double,39.37,91,9070 +21,Medicare/9,double,12.27,91,9070 +22,PM10-dust,double,8.67,3,216 +23,NYC/29,double,40.38,171,17037 +24,POI-lat,double,55.54,5,414 +25,POI-lon,double,56.47,5,414 +26,SD-bench,double,16.21,1,8 +27,Stocks-DE,double,11.00,426,42544 +28,Stocks-UK,double,12.59,580,57915 +29,Stocks-USA,double,7.90,2755,275465 +30,Wind-dir,double,15.89,1943,194237 diff --git a/benchmarks/result/compression_ratio/float/hurricane_isabel_dataset.csv b/benchmarks/result/compression_ratio/float/hurricane_isabel_dataset.csv new file mode 100644 index 0000000..9bcc7f4 --- /dev/null +++ b/benchmarks/result/compression_ratio/float/hurricane_isabel_dataset.csv @@ -0,0 +1,21 @@ +idx,column,data_type,size,rowgroups_count,vectors_count +1,CLOUDf48,float,9.36,245,24414 +2,CLOUDf48-log10,float,22.39,245,24414 +3,PRECIPf48,float,29.91,245,24414 +4,PRECIPf48-log10,float,24.77,245,24414 +5,Pf48,float,26.21,245,24414 +6,QCLOUDf48,float,4.08,245,24414 +7,QCLOUDf48-log10,float,14.06,245,24414 +8,QGRAUPf48,float,30.59,245,24414 +9,QGRAUPf48-log10,float,25.04,245,24414 +10,QICEf48,float,7.54,245,24414 +11,QICEf48-log10,float,17.21,245,24414 +12,QRAINf48,float,30.46,245,24414 +13,QRAINf48-log10,float,25.07,245,24414 +14,QSNOWf48,float,29.97,245,24414 +15,QSNOWf48-log10,float,24.30,245,24414 +16,QVAPORf48,float,25.30,245,24414 +17,TCf48,float,22.86,245,24414 +18,Uf48,float,27.44,245,24414 +19,Vf48,float,27.26,245,24414 +20,Wf48,float,28.06,245,24414 diff --git a/benchmarks/result/compression_ratio/float/sp_dataset.csv 
b/benchmarks/result/compression_ratio/float/sp_dataset.csv index f0f2a52..245d42d 100644 --- a/benchmarks/result/compression_ratio/float/sp_dataset.csv +++ b/benchmarks/result/compression_ratio/float/sp_dataset.csv @@ -1,5 +1,5 @@ -idx,dataset,size,alp_scheme,rowgroups_count,vectors_count -1,Dino-Vitb16,28.78,ALP_RD,844,168728 -2,GPT2,28.01,ALP_RD,1216,243046 -3,Grammarly-lg,29.16,ALP_RD,7648,1529478 -4,W2V Tweets,28.86,ALP_RD,5,808 +idx,column,data_type,size,rowgroups_count,vectors_count +1,Dino-Vitb16,float,28.24,844,84364 +2,GPT2,float,27.68,1216,121523 +3,Grammarly-lg,float,27.73,7648,764739 +4,W2V Tweets,float,28.26,5,404 diff --git a/benchmarks/test_compression_ratio.cpp b/benchmarks/test_compression_ratio.cpp new file mode 100644 index 0000000..4557d4f --- /dev/null +++ b/benchmarks/test_compression_ratio.cpp @@ -0,0 +1,17 @@ +#include "benchmark.hpp" + +TEST_F(ALPBench, bench_alp_on_alp_dataset) { + std::string result_path = alp_bench::get_paths().alp_result_dir_path + "compression_ratio/double/alp_dataset.csv"; + typed_bench_dataset(alp_bench::get_alp_dataset(), result_path); +} + +TEST_F(ALPBench, bench_alp_on_sp_dataset) { + std::string result_path = alp_bench::get_paths().alp_result_dir_path + "compression_ratio/float/sp_dataset.csv"; + typed_bench_dataset(alp_bench::get_sp_datasets(), result_path); +} + +TEST_F(ALPBench, bench_alp_on_hurricane_isabel) { + auto result_path = + alp_bench::get_paths().alp_result_dir_path + "compression_ratio/float/hurricane_isabel_dataset.csv"; + typed_bench_dataset(alp_bench::get_hurricane_isabel_dataset(), result_path); +} diff --git a/benchmarks/your_own_dataset.csv b/benchmarks/your_own_dataset.csv new file mode 100644 index 0000000..87cbec1 --- /dev/null +++ b/benchmarks/your_own_dataset.csv @@ -0,0 +1,2 @@ +id,column_name,data_type,path,file_type +0,CLOUDf48.bin.f32,float,/Users/azim/CLionProjects/ALP/100x500x500/CLOUDf48.bin.f32,binary \ No newline at end of file diff --git a/benchmarks/your_own_dataset_result.csv 
b/benchmarks/your_own_dataset_result.csv new file mode 100644 index 0000000..349b8df --- /dev/null +++ b/benchmarks/your_own_dataset_result.csv @@ -0,0 +1,2 @@ +idx,column,data_type,size,rowgroups_count,vectors_count +0,CLOUDf48.bin.f32,float,9.36,245,24414 diff --git a/data/include/column.hpp b/data/include/column.hpp index c3b46ad..3521953 100644 --- a/data/include/column.hpp +++ b/data/include/column.hpp @@ -6,7 +6,28 @@ #include namespace alp_bench { -struct Column { + +enum class DataType : uint8_t { + INVALID = 0, + DOUBLE = 1, + FLOAT = 2, +}; + +enum class FileType : uint8_t { + INVALID = 0, + BINARY = 1, + CSV = 2, +}; + +struct ColumnDescriptor { + int id; + DataType data_type; + std::string name; + std::string path; + FileType file_type; +}; + +struct ALPColumnDescriptor { uint64_t id; std::string name; const std::string csv_file_path; diff --git a/data/include/data.hpp b/data/include/data.hpp index b1fd41b..6dd0d99 100644 --- a/data/include/data.hpp +++ b/data/include/data.hpp @@ -10,15 +10,17 @@ #include namespace alp_data { + // we prefer the binary_path over csv_path template -inline void read_data(std::vector& data, const std::string& csv_file_path, const std::string& bin_file_path) { - if (!bin_file_path.empty()) { +inline void read_data(std::vector& data, const alp_bench::ColumnDescriptor& column_descriptor) { + switch (column_descriptor.file_type) { + case alp_bench::FileType::BINARY: { // Open the binary file in input mode - std::ifstream file(bin_file_path, std::ios::binary | std::ios::in); + std::ifstream file(column_descriptor.path, std::ios::binary | std::ios::in); - if (!file) { throw std::runtime_error("Failed to open file: " + bin_file_path); } + if (!file) { throw std::runtime_error("Failed to open file: " + column_descriptor.path); } // Get the size of the file file.seekg(0, std::ios::end); @@ -38,10 +40,9 @@ inline void read_data(std::vector& data, const std::string& csv_file_path, c // Close the file file.close(); - return; - } - if 
(!csv_file_path.empty()) { - const auto& path = csv_file_path; + } break; + case alp_bench::FileType::CSV: { + const auto& path = column_descriptor.path; std::ifstream file(path); if (!file) { throw std::runtime_error("Failed to open file: " + path); } @@ -61,9 +62,12 @@ inline void read_data(std::vector& data, const std::string& csv_file_path, c } file.close(); - return; + } break; + case alp_bench::FileType::INVALID: + default: { + throw std::runtime_error("No bin or csv file specified"); + } } - throw std::runtime_error("No bin or csv file specified"); } } // namespace alp_data diff --git a/data/include/double/alp_dataset.hpp b/data/include/double/alp_dataset.hpp index 29e5b7d..97a5977 100644 --- a/data/include/double/alp_dataset.hpp +++ b/data/include/double/alp_dataset.hpp @@ -5,8 +5,8 @@ namespace alp_bench { -inline std::array get_alp_dataset() { - static std::array ALP_DATASET = {{ +inline std::array get_alp_dataset() { + static std::array ALP_DATASET = {{ {1, "Air-Pressure", @@ -286,8 +286,8 @@ inline std::array get_alp_dataset() { return ALP_DATASET; }; -inline std::array get_double_test_dataset() { - static std::array DOUBLE_TEST_DATASET = {{ +inline std::array get_double_test_dataset() { + static std::array DOUBLE_TEST_DATASET = {{ {0, "test_0", ALP_CMAKE_SOURCE_DIR "/data/double/test_0.csv", "", 0, 0, 0, 0}, }}; diff --git a/data/include/edge_case.hpp b/data/include/edge_case.hpp index 25d869a..79f03d8 100644 --- a/data/include/edge_case.hpp +++ b/data/include/edge_case.hpp @@ -6,7 +6,7 @@ namespace alp_bench { inline auto get_edge_case() { - static std::array EDGE_CASE = {{ + static std::array EDGE_CASE = {{ {1, "edge_case", get_paths().edge_dataset_csv_path + "edge_case.csv", "", 0, 0, 12, 0, true}, }}; diff --git a/data/include/evalimplsts.hpp b/data/include/evalimplsts.hpp index b5f4669..0c4e1ac 100644 --- a/data/include/evalimplsts.hpp +++ b/data/include/evalimplsts.hpp @@ -6,7 +6,7 @@ namespace alp_bench { inline auto get_evalimplsts() { - static 
std::array EVALIMPLSTS = {{ + static std::array EVALIMPLSTS = {{ // prev issue_8 {0, "active_power", get_paths().evalimplsts_csv_path + "active_power.csv", "", 0, 0, 0, 0, true}, diff --git a/data/include/float/hurricane_isabel.hpp b/data/include/float/hurricane_isabel.hpp index b8290e4..e62400b 100644 --- a/data/include/float/hurricane_isabel.hpp +++ b/data/include/float/hurricane_isabel.hpp @@ -5,8 +5,10 @@ namespace alp_bench { -inline std::array get_hurricane_isabel_dataset() { - static std::array HURRICANE_ISABEL = {{ +constexpr size_t N_HURRICANE_ISABEL_COLUMNS = 20; + +inline std::array get_hurricane_isabel_dataset() { + static std::array HURRICANE_ISABEL = {{ {1, "CLOUDf48", "", get_paths().hs + "CLOUDf48.bin.f32", 0, 0, 0, 0}, {2, "CLOUDf48-log10", "", get_paths().hs + "CLOUDf48.log10.bin.f32", 0, 0, 0, 0}, {3, "PRECIPf48", "", get_paths().hs + "PRECIPf48.bin.f32", 0, 0, 0, 0}, diff --git a/data/include/float/sp.hpp b/data/include/float/sp.hpp index 5053bf7..631097b 100644 --- a/data/include/float/sp.hpp +++ b/data/include/float/sp.hpp @@ -5,7 +5,7 @@ namespace alp_bench { inline auto get_sp_datasets() { - static std::array SP_DATASETS = {{ + static std::array SP_DATASETS = {{ {1, "Dino-Vitb16", "", get_paths().alp_dataset_binary_dir_path + "sp_dino_vitb16.bin", 0, 0, 0, 0, true}, {2, "GPT2", "", get_paths().alp_dataset_binary_dir_path + "sp_gpt2.bin", 0, 0, 0, 0, true}, {3, diff --git a/data/include/float/test.hpp b/data/include/float/test.hpp index 5128ffd..72060d9 100644 --- a/data/include/float/test.hpp +++ b/data/include/float/test.hpp @@ -5,7 +5,7 @@ namespace alp_bench { inline auto get_float_test_dataset() { - static std::array FLOAT_TEST_DATASET = {{ + static std::array FLOAT_TEST_DATASET = {{ {0, "Arade/4", get_paths().alp_dataset_csv_path + "arade4.csv", "", 0, 0, 0, 0}, {1, "test_0", ALP_CMAKE_SOURCE_DIR "/data/float/test_0.csv", "", 0, 0, 0, 4}, {2, "test_1", ALP_CMAKE_SOURCE_DIR "/data/float/test_1.csv", "", 0, 0, 0, 10}, diff --git 
a/data/include/generated_columns.hpp b/data/include/generated_columns.hpp index 9dbbfa9..678c5ce 100644 --- a/data/include/generated_columns.hpp +++ b/data/include/generated_columns.hpp @@ -5,7 +5,7 @@ namespace alp_bench { inline auto get_generated_cols() { - static std::array GENERATED_COLS = { + static std::array GENERATED_COLS = { { // {0, "bw0", get_paths().generated_columns_csv_path + "generated_doubles_bw0.csv", "", 0, 0, 0, 0}, diff --git a/include/alp/config.hpp b/include/alp/config.hpp index 7282440..a54369c 100644 --- a/include/alp/config.hpp +++ b/include/alp/config.hpp @@ -9,8 +9,10 @@ namespace alp::config { /// ALP Vector size (We recommend against changing this; it should be constant) inline constexpr size_t VECTOR_SIZE = 1024; +/// number of vectors per rowgroup +inline constexpr size_t N_VECTORS_PER_ROWGROUP = 100UL; /// Rowgroup size -inline constexpr size_t ROWGROUP_SIZE = 100UL * VECTOR_SIZE; +inline constexpr size_t ROWGROUP_SIZE = N_VECTORS_PER_ROWGROUP * VECTOR_SIZE; /// Vectors from the rowgroup from which to take samples; this will be used to then calculate the jumps inline constexpr size_t ROWGROUP_VECTOR_SAMPLES = 8; /// We calculate how many equidistant vector we must jump within a rowgroup diff --git a/publication/source_code/bench_compression_ratio/alp.cpp b/publication/source_code/bench_compression_ratio/alp.cpp index de31e10..f93aa28 100644 --- a/publication/source_code/bench_compression_ratio/alp.cpp +++ b/publication/source_code/bench_compression_ratio/alp.cpp @@ -175,7 +175,7 @@ class alp_test : public ::testing::Test { delete[] unffor_left_arr; } - void bench_alp_compression_ratio(const alp_bench::Column& dataset, std::ofstream& ofile) { + void bench_alp_compression_ratio(const alp_bench::ALPColumnDescriptor& dataset, std::ofstream& ofile) { if (dataset.suitable_for_cutting) { return; } std::cout << dataset.name << std::endl; @@ -238,7 +238,7 @@ class alp_test : public ::testing::Test { } } - void 
bench_alp_rd_compression_ratio(const alp_bench::Column& dataset, std::ofstream& ofile) { + void bench_alp_rd_compression_ratio(const alp_bench::ALPColumnDescriptor& dataset, std::ofstream& ofile) { if (!dataset.suitable_for_cutting) { return; } std::cout << dataset.name << std::endl; diff --git a/publication/source_code/bench_compression_ratio/chimp.cpp b/publication/source_code/bench_compression_ratio/chimp.cpp index 1795147..762347e 100644 --- a/publication/source_code/bench_compression_ratio/chimp.cpp +++ b/publication/source_code/bench_compression_ratio/chimp.cpp @@ -41,7 +41,7 @@ class chimp_test : public ::testing::Test { } template - void bench_compression_ratio(const std::array& datasets, const std::string& path) { + void bench_compression_ratio(const std::array& datasets, const std::string& path) { if (const auto v = std::getenv("ALP_DATASET_DIR_PATH"); v != nullptr) { alp_bench::get_paths().alp_dataset_binary_dir_path = *v; } diff --git a/publication/source_code/bench_compression_ratio/chimp128.cpp b/publication/source_code/bench_compression_ratio/chimp128.cpp index 6898d2f..712a55a 100644 --- a/publication/source_code/bench_compression_ratio/chimp128.cpp +++ b/publication/source_code/bench_compression_ratio/chimp128.cpp @@ -47,7 +47,7 @@ class chimp128_test : public ::testing::Test { } template - void bench_compression_ratio(const std::array& datasets, const std::string& path) { + void bench_compression_ratio(const std::array& datasets, const std::string& path) { using INNERTYPE = typename std::conditional_t, uint64_t, diff --git a/publication/source_code/bench_compression_ratio/gorillas.cpp b/publication/source_code/bench_compression_ratio/gorillas.cpp index 4b98938..3676aea 100644 --- a/publication/source_code/bench_compression_ratio/gorillas.cpp +++ b/publication/source_code/bench_compression_ratio/gorillas.cpp @@ -31,7 +31,7 @@ class gorillas_test : public ::testing::Test { double gorillas_overhead_per_vector {static_cast(16 + 16 + 16)}; template - 
void bench_compression_ratio(const std::array& datasets, const std::string& path) { + void bench_compression_ratio(const std::array& datasets, const std::string& path) { if (const auto v = std::getenv("ALP_DATASET_DIR_PATH"); v != nullptr) { alp_bench::get_paths().alp_dataset_binary_dir_path = *v; } diff --git a/publication/source_code/bench_compression_ratio/patas.cpp b/publication/source_code/bench_compression_ratio/patas.cpp index c0cde86..b0e2744 100644 --- a/publication/source_code/bench_compression_ratio/patas.cpp +++ b/publication/source_code/bench_compression_ratio/patas.cpp @@ -39,7 +39,7 @@ class patas_test : public ::testing::Test { double patas_overhead_per_vector {static_cast(16)}; template - void bench_compression_ratio(const std::array& datasets, const std::string& path) { + void bench_compression_ratio(const std::array& datasets, const std::string& path) { using INNERTYPE = typename std::conditional_t, uint64_t, diff --git a/publication/source_code/bench_compression_ratio/zstd.cpp b/publication/source_code/bench_compression_ratio/zstd.cpp index f243399..d84b76e 100644 --- a/publication/source_code/bench_compression_ratio/zstd.cpp +++ b/publication/source_code/bench_compression_ratio/zstd.cpp @@ -16,7 +16,7 @@ class zstd_test : public ::testing::Test { ~zstd_test() override {} template - void bench_compression_ratio(const std::array& datasets, const std::string& path) { + void bench_compression_ratio(const std::array& datasets, const std::string& path) { if (const auto v = std::getenv("ALP_DATASET_DIR_PATH"); v != nullptr) { alp_bench::get_paths().alp_dataset_binary_dir_path = *v; } diff --git a/publication/source_code/bench_speed/bench_alp_cutter_decode.cpp b/publication/source_code/bench_speed/bench_alp_cutter_decode.cpp index 998e464..83a6d22 100644 --- a/publication/source_code/bench_speed/bench_alp_cutter_decode.cpp +++ b/publication/source_code/bench_speed/bench_alp_cutter_decode.cpp @@ -16,7 +16,7 @@ static __attribute__((noinline)) 
benchmark::BenchmarkReporter::Run b_a_e(const d int64_t* ffor_arr, int64_t* base_arr, alp::state& stt, - alp_bench::Column& dataset, + alp_bench::ALPColumnDescriptor& dataset, uint64_t* ffor_right_arr, uint16_t* ffor_left_arr, uint64_t* right_arr, diff --git a/publication/source_code/bench_speed/bench_alp_cutter_encode.cpp b/publication/source_code/bench_speed/bench_alp_cutter_encode.cpp index bfe0353..868b89d 100644 --- a/publication/source_code/bench_speed/bench_alp_cutter_encode.cpp +++ b/publication/source_code/bench_speed/bench_alp_cutter_encode.cpp @@ -15,7 +15,7 @@ static __attribute__((noinline)) benchmark::BenchmarkReporter::Run b_a_e(const d int64_t* ffor_arr, int64_t* base_arr, alp::state& stt, - alp_bench::Column& dataset, + alp_bench::ALPColumnDescriptor& dataset, uint64_t* ffor_right_arr, uint16_t* ffor_left_arr, uint64_t* right_arr, diff --git a/publication/source_code/bench_speed/bench_alp_encode.cpp b/publication/source_code/bench_speed/bench_alp_encode.cpp index 7dd0066..7cbfa52 100644 --- a/publication/source_code/bench_speed/bench_alp_encode.cpp +++ b/publication/source_code/bench_speed/bench_alp_encode.cpp @@ -12,7 +12,7 @@ static __attribute__((noinline)) benchmark::BenchmarkReporter::Run b_a_e(const d int64_t* ffor_arr, int64_t* base_arr, alp::state& stt, - alp_bench::Column& dataset) { + alp_bench::ALPColumnDescriptor& dataset) { int benchmark_number = dataset.id; diff --git a/publication/source_code/bench_speed/bench_alp_without_sampling.cpp b/publication/source_code/bench_speed/bench_alp_without_sampling.cpp index c46ecee..2f7f402 100644 --- a/publication/source_code/bench_speed/bench_alp_without_sampling.cpp +++ b/publication/source_code/bench_speed/bench_alp_without_sampling.cpp @@ -16,7 +16,7 @@ static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_enc int64_t* ffor_arr, int64_t* base_arr, alp::state& stt, - alp_bench::Column& dataset) { + alp_bench::ALPColumnDescriptor& dataset) { int benchmark_number = 
dataset.id; @@ -48,7 +48,7 @@ static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_enc int64_t* encoded_arr, uint8_t fac, uint8_t exp, - alp_bench::Column& dataset, + alp_bench::ALPColumnDescriptor& dataset, uint8_t& bw, int64_t* base_arr, int64_t* ffor_arr) { diff --git a/publication/source_code/bench_speed/bench_chimp.cpp b/publication/source_code/bench_speed/bench_chimp.cpp index 751fe97..c3065ed 100644 --- a/publication/source_code/bench_speed/bench_chimp.cpp +++ b/publication/source_code/bench_speed/bench_chimp.cpp @@ -3,7 +3,7 @@ #include "data.hpp" static __attribute__((noinline)) benchmark::BenchmarkReporter::Run -bench_decode_chimp(alp_bench::Column& dataset, +bench_decode_chimp(alp_bench::ALPColumnDescriptor& dataset, idx_t leading_zero_block_size, uint32_t leading_zero_index, alp_bench::ChimpDecompressionState chimp_de_state, @@ -60,7 +60,7 @@ bench_decode_chimp(alp_bench::Column& dataset, } static __attribute__((noinline)) benchmark::BenchmarkReporter::Run -bench_encode_chimp(alp_bench::Column& dataset, +bench_encode_chimp(alp_bench::ALPColumnDescriptor& dataset, alp_bench::ChimpCompressionState state, uint8_t* data_arr, uint8_t* flags_arr, diff --git a/publication/source_code/bench_speed/bench_chimp128.cpp b/publication/source_code/bench_speed/bench_chimp128.cpp index 08b70a9..1310554 100644 --- a/publication/source_code/bench_speed/bench_chimp128.cpp +++ b/publication/source_code/bench_speed/bench_chimp128.cpp @@ -4,7 +4,7 @@ static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_decode_chimp128(const double* dbl_arr, - alp_bench::Column& dataset, + alp_bench::ALPColumnDescriptor& dataset, uint8_t leading_zero_block_count, alp_bench::Chimp128CompressionState com_stt, idx_t leading_zero_block_size, @@ -93,7 +93,7 @@ bench_decode_chimp128(const double* dbl_a } static __attribute__((noinline)) benchmark::BenchmarkReporter::Run -bench_encode_chimp128(alp_bench::Column& dataset, 
+bench_encode_chimp128(alp_bench::ALPColumnDescriptor& dataset, alp_bench::Chimp128CompressionState com_stt, uint8_t leading_zero_block_count, idx_t leading_zero_block_size, diff --git a/publication/source_code/bench_speed/bench_gorillas.cpp b/publication/source_code/bench_speed/bench_gorillas.cpp index 1f4de28..1b5ab2f 100644 --- a/publication/source_code/bench_speed/bench_gorillas.cpp +++ b/publication/source_code/bench_speed/bench_gorillas.cpp @@ -3,7 +3,7 @@ #include "gorillas/gorillas.hpp" static __attribute__((noinline)) benchmark::BenchmarkReporter::Run -bench_decode_gorillas(alp_bench::Column& dataset, +bench_decode_gorillas(alp_bench::ALPColumnDescriptor& dataset, alp_bench::GorillasDecompressionState gorillas_de_state, alp_bench::FlagBuffer flag_buffer, alp_bench::GorillasConstants::Flags* flags, @@ -44,7 +44,7 @@ bench_decode_gorillas(alp_bench::Column& datas } static __attribute__((noinline)) benchmark::BenchmarkReporter::Run -bench_encode_gorillas(alp_bench::Column& dataset, +bench_encode_gorillas(alp_bench::ALPColumnDescriptor& dataset, alp_bench::GorillasCompressionState state, uint8_t* data_arr, uint8_t* flags_arr, diff --git a/publication/source_code/bench_speed/bench_patas.cpp b/publication/source_code/bench_speed/bench_patas.cpp index bec0652..0c3166a 100644 --- a/publication/source_code/bench_speed/bench_patas.cpp +++ b/publication/source_code/bench_speed/bench_patas.cpp @@ -3,7 +3,7 @@ #include "patas/patas.hpp" static __attribute__((noinline)) benchmark::BenchmarkReporter::Run -bench_decoding_patas(alp_bench::Column& dataset, +bench_decoding_patas(alp_bench::ALPColumnDescriptor& dataset, uint16_t* packed_metadata, uint8_t* data_arr, uint64_t* dec_arr, @@ -46,7 +46,7 @@ bench_decoding_patas(alp_bench::Column& dataset, } static __attribute__((noinline)) benchmark::BenchmarkReporter::Run -bench_encoding_patas(alp_bench::Column& dataset, +bench_encoding_patas(alp_bench::ALPColumnDescriptor& dataset, alp_bench::patas::PatasCompressionState 
patas_state, uint8_t* data_arr, uint16_t* packed_metadata, diff --git a/publication/source_code/bench_speed/bench_zstd.cpp b/publication/source_code/bench_speed/bench_zstd.cpp index 4591ae4..9f3acbe 100644 --- a/publication/source_code/bench_speed/bench_zstd.cpp +++ b/publication/source_code/bench_speed/bench_zstd.cpp @@ -6,7 +6,7 @@ // NOLINTBEGIN static __attribute__((noinline)) benchmark::BenchmarkReporter::Run -bench_decode_zstd(alp_bench::Column& dataset, void* enc_arr, size_t enc_size, void* dec_arr) { +bench_decode_zstd(alp_bench::ALPColumnDescriptor& dataset, void* enc_arr, size_t enc_size, void* dec_arr) { int benchmark_number = dataset.id; @@ -31,7 +31,7 @@ bench_decode_zstd(alp_bench::Column& dataset, void* enc_arr, size_t enc_size, vo } static __attribute__((noinline)) benchmark::BenchmarkReporter::Run -bench_encode_zstd(alp_bench::Column& dataset, double* dbl_arr, void* enc_arr) { +bench_encode_zstd(alp_bench::ALPColumnDescriptor& dataset, double* dbl_arr, void* enc_arr) { int benchmark_number = dataset.id; diff --git a/publication/source_code/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_bench.cpp b/publication/source_code/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_bench.cpp index 48c9750..64d5af0 100644 --- a/publication/source_code/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_bench.cpp +++ b/publication/source_code/generated/arm64v8/neon_intrinsic_uf1/arm64v8_neon_intrinsic_1024_uf1_falp_bench.cpp @@ -1,7 +1,7 @@ #include "arm64v8_neon_intrinsic_1024_uf1_falp_bench.hpp" #include "alp.hpp" #include "data.hpp" -static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_fused_decode(alp_bench::Column& dataset, +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_fused_decode(alp_bench::ALPColumnDescriptor& dataset, int64_t* ffor_arr, uint8_t bw, int64_t* base_arr, @@ -37,7 +37,7 @@ static __attribute__((noinline)) 
benchmark::BenchmarkReporter::Run bench_alp_fus return benchmark::BenchmarkReporter::Run( benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); } -static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_decode(alp_bench::Column& dataset, +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_decode(alp_bench::ALPColumnDescriptor& dataset, int64_t* ffor_arr, int64_t* unffor_arr, uint8_t bw, diff --git a/publication/source_code/generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_bench.cpp b/publication/source_code/generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_bench.cpp index 1827838..0135b87 100644 --- a/publication/source_code/generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_bench.cpp +++ b/publication/source_code/generated/fallback/scalar_aav_uf1/fallback_scalar_aav_1024_uf1_falp_bench.cpp @@ -2,7 +2,7 @@ #include "data.hpp" #include "fallback_scalar_aav_1024_uf1_falp_bench.hpp" -static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_fused_decode(alp_bench::Column& dataset, +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_fused_decode(alp_bench::ALPColumnDescriptor& dataset, int64_t* ffor_arr, uint8_t bw, int64_t* base_arr, @@ -38,7 +38,7 @@ static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_fus return benchmark::BenchmarkReporter::Run( benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); } -static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_decode(alp_bench::Column& dataset, +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_decode(alp_bench::ALPColumnDescriptor& dataset, int64_t* ffor_arr, int64_t* unffor_arr, uint8_t bw, diff --git a/publication/source_code/generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_bench.cpp 
b/publication/source_code/generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_bench.cpp index 4a61c98..fdb1df2 100644 --- a/publication/source_code/generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_bench.cpp +++ b/publication/source_code/generated/fallback/scalar_nav_uf1/fallback_scalar_nav_1024_uf1_falp_bench.cpp @@ -3,7 +3,7 @@ #include "alp.hpp" #include "data.hpp" -static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_fused_decode(alp_bench::Column& dataset, +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_fused_decode(alp_bench::ALPColumnDescriptor& dataset, int64_t* ffor_arr, uint8_t bw, int64_t* base_arr, @@ -39,7 +39,7 @@ static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_fus return benchmark::BenchmarkReporter::Run( benchmark_number, benchmark_name, iterations, double(cycles) / (double(iterations) * 1024)); } -static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_decode(alp_bench::Column& dataset, +static __attribute__((noinline)) benchmark::BenchmarkReporter::Run bench_alp_decode(alp_bench::ALPColumnDescriptor& dataset, int64_t* ffor_arr, int64_t* unffor_arr, uint8_t bw, diff --git a/test/test_alp_sample.cpp b/test/test_alp_sample.cpp index 3f2c2d8..ad1db6c 100644 --- a/test/test_alp_sample.cpp +++ b/test/test_alp_sample.cpp @@ -95,7 +95,7 @@ class alp_test : public ::testing::Test { } template - void test_column(const alp_bench::Column& column) { + void test_column(const alp_bench::ALPColumnDescriptor& column) { using UT = typename alp::inner_t::ut; using ST = typename alp::inner_t::st;