From 494675f3696571689e60916ea0074fbf4ae49437 Mon Sep 17 00:00:00 2001
From: azim
Date: Sat, 16 Nov 2024 21:58:08 +0100
Subject: [PATCH] add patas for float

---
 ...{patas_compression_ratio.csv => patas.csv} |   0
 .../compression_ratio_result/float/patas.csv  |   5 +
 .../bench_compression_ratio/patas.cpp         | 205 ++++++++++--------
 3 files changed, 115 insertions(+), 95 deletions(-)
 rename publication/compression_ratio_result/double/{patas_compression_ratio.csv => patas.csv} (100%)
 create mode 100644 publication/compression_ratio_result/float/patas.csv

diff --git a/publication/compression_ratio_result/double/patas_compression_ratio.csv b/publication/compression_ratio_result/double/patas.csv
similarity index 100%
rename from publication/compression_ratio_result/double/patas_compression_ratio.csv
rename to publication/compression_ratio_result/double/patas.csv
diff --git a/publication/compression_ratio_result/float/patas.csv b/publication/compression_ratio_result/float/patas.csv
new file mode 100644
index 0000000..d185cab
--- /dev/null
+++ b/publication/compression_ratio_result/float/patas.csv
@@ -0,0 +1,5 @@
+dataset,size,vectors_count
+Dino-Vitb16,35.25,84364
+GPT2,35.24,121523
+Grammarly-lg,35.23,764739
+WAV2VEC,26.28,92183
diff --git a/publication/source_code/bench_compression_ratio/patas.cpp b/publication/source_code/bench_compression_ratio/patas.cpp
index 46c4464..1a67df4 100644
--- a/publication/source_code/bench_compression_ratio/patas.cpp
+++ b/publication/source_code/bench_compression_ratio/patas.cpp
@@ -10,129 +10,144 @@ class patas_test : public ::testing::Test {
 public:
 	uint8_t*  data_arr;
 	uint16_t* packed_data_arr;
-	double*   dbl_arr;
-	double*   dec_dbl_p;
-	uint64_t* uint64_p;
-	uint64_t* dec_arr;
 
 	// Encode
-	uint16_t*                                     packed_metadata;
-	alp_bench::patas::PatasCompressionState<uint64_t, false> patas_state;
-	alp_bench::patas::PatasUnpackedValueStats*    unpacked_data;
+	uint16_t*                                  packed_metadata;
+	alp_bench::patas::PatasUnpackedValueStats* unpacked_data;
 
 	// Decode
 	alp_bench::ByteReader byte_reader;
 
 	void SetUp() override {
-		dbl_arr         = new double[1024];
 		data_arr        = new uint8_t[8192 + 2048]; // We leave some overhead room in case of negative compression
 		packed_data_arr = new uint16_t[1024];
 		packed_metadata = new uint16_t[1024];
-		dec_arr         = new uint64_t[1024];
 		unpacked_data   = new alp_bench::patas::PatasUnpackedValueStats[1024];
 	}
 
 	~patas_test() override {
-		delete[] dbl_arr;
-		delete[] dec_arr;
 		delete[] data_arr;
 		delete[] packed_data_arr;
 		delete[] packed_metadata;
 		delete[] unpacked_data;
 	}
-};
-
-/*
- * Patas overhead per vector in a hypothetic file format = next_block_offset;
- * Next block offset is needed to be able to skip blocks of data
- */
-double patas_overhead_per_vector {static_cast<double>(16)};
-
-TEST_F(patas_test, test_patas_on_whole_datasets) {
-
-	std::ofstream ofile(alp_bench::get_paths().result_dir_path + "patas_compression_ratio.csv", std::ios::out);
-	ofile << "dataset,size,vectors_count\n";
-
-	for (auto& dataset : alp_bench::get_alp_dataset()) {
-
-		std::cout << dataset.name << std::endl;
-
-		size_t compressed_data_size = 0;
-
-		size_t tuples_count;
-		auto*  data_column = mapper::mmap_file<double>(tuples_count, dataset.binary_file_path);
-		double value_to_encode {0.0};
-		size_t vector_idx {0};
-		size_t rowgroup_offset {0};
-		size_t vectors_count = {0};
-		/* Encode - Decode - Validate. */
-		for (size_t i = 0; i < tuples_count; i++) {
-			value_to_encode     = data_column[i];
-			dbl_arr[vector_idx] = value_to_encode;
-			vector_idx          = vector_idx + 1;
-			rowgroup_offset     = rowgroup_offset + 1;
-
-			if (vector_idx != alp::config::VECTOR_SIZE) { continue; }
-
-			// Init Encoding
-			patas_state.SetOutputBuffer(data_arr);
-			patas_state.packed_data_buffer.SetBuffer(packed_metadata);
-			patas_state.Reset();
-
-			/*
-			 *
-			 * Encode
-			 *
-			 */
-			uint64_p = reinterpret_cast<uint64_t*>(dbl_arr);
-			for (size_t i {0}; i < alp::config::VECTOR_SIZE; ++i) {
-				alp_bench::patas::PatasCompression<uint64_t, false>::Store(uint64_p[i], patas_state);
-			}
-
-			// SUM COMPRESSION SIZE
-			size_t bytes_used_by_data = patas_state.byte_writer.BytesWritten();
-			size_t packed_data_size   = patas_state.packed_data_buffer.index * sizeof(uint16_t);
-			compressed_data_size += (alp_bench::AlignValue(bytes_used_by_data) + packed_data_size) * 8;
-			compressed_data_size += patas_overhead_per_vector;
-
-			// Init decoding
-			byte_reader.SetStream(data_arr);
-
-			/*
-			 *
-			 * DECODE
-			 *
-			 */
-			// UNPACKING METADATA (16 bits)
-			for (size_t i = 0; i < alp::config::VECTOR_SIZE; i++) {
-				alp_bench::PackedDataUtils<uint64_t>::Unpack(packed_metadata[i],
-				                                             (alp_bench::UnpackedData&)unpacked_data[i]);
-			}
-
-			// USING UNPACKED METADATA AND DATA BUFFER WE LOAD THE DOUBLE VALUES
-			dec_arr[0] = (uint64_t)0;
-			for (size_t i = 0; i < alp::config::VECTOR_SIZE; i++) {
-				dec_arr[i] = alp_bench::patas::PatasDecompression<uint64_t>::DecompressValue(
-				    byte_reader,
-				    unpacked_data[i].significant_bytes,
-				    unpacked_data[i].trailing_zeros,
-				    dec_arr[i - unpacked_data[i].index_diff]);
-			}
-
-			for (size_t j = 0; j < alp::config::VECTOR_SIZE; j++) {
-				if (uint64_p[j] != dec_arr[j]) {
-					std::cout << j << ", " << rowgroup_offset << ", " << dataset.name << std::endl;
-				}
-				ASSERT_EQ(uint64_p[j], dec_arr[j]);
-			}
-			vector_idx = 0;
-			vectors_count += 1;
-		}
-		auto processed_tuples  = vectors_count * alp::config::VECTOR_SIZE;
-		auto compression_ratio = (double)compressed_data_size / processed_tuples;
-
-		ofile << std::fixed << std::setprecision(2) << dataset.name << "," << compression_ratio << "," << vectors_count
-		      << std::endl;
-	}
+
+	/*
+	 * Patas overhead per vector in a hypothetic file format = next_block_offset;
+	 * Next block offset is needed to be able to skip blocks of data
+	 */
+	double patas_overhead_per_vector {static_cast<double>(16)};
+
+	template <typename T, size_t N_DATASETS>
+	void bench_compression_ratio(const std::array<alp_bench::Column, N_DATASETS>& datasets, const std::string& path) {
+		using INNERTYPE =
+		    typename std::conditional_t<std::is_same_v<T, double>,
+		                                uint64_t,
+		                                typename std::conditional_t<std::is_same_v<T, float>, uint32_t, void>>;
+
+		alp_bench::patas::PatasCompressionState<INNERTYPE, false> patas_state;
+		INNERTYPE*                                                 uint64_p;
+		auto*                                                      dbl_arr = new double[1024];
+		T*                                                         dec_dbl_p;
+		auto*                                                      dec_arr = new INNERTYPE[1024];
+
+		std::ofstream ofile(path, std::ios::out);
+		ofile << "dataset,size,vectors_count\n";
+
+		for (auto& dataset : datasets) {
+			std::cout << dataset.name << std::endl;
+
+			size_t compressed_data_size = 0;
+
+			size_t tuples_count;
+			auto*  data_column = mapper::mmap_file<T>(tuples_count, dataset.binary_file_path);
+			double value_to_encode {0.0};
+			size_t vector_idx {0};
+			size_t rowgroup_offset {0};
+			size_t vectors_count = {0};
+			/* Encode - Decode - Validate. */
+			for (size_t i = 0; i < tuples_count; i++) {
+				value_to_encode     = data_column[i];
+				dbl_arr[vector_idx] = value_to_encode;
+				vector_idx          = vector_idx + 1;
+				rowgroup_offset     = rowgroup_offset + 1;
+
+				if (vector_idx != alp::config::VECTOR_SIZE) { continue; }
+
+				// Init Encoding
+				patas_state.SetOutputBuffer(data_arr);
+				patas_state.packed_data_buffer.SetBuffer(packed_metadata);
+				patas_state.Reset();
+
+				/*
+				 *
+				 * Encode
+				 *
+				 */
+				uint64_p = reinterpret_cast<INNERTYPE*>(dbl_arr);
+				for (size_t i {0}; i < alp::config::VECTOR_SIZE; ++i) {
+					alp_bench::patas::PatasCompression<INNERTYPE, false>::Store(uint64_p[i], patas_state);
+				}
+
+				// SUM COMPRESSION SIZE
+				size_t bytes_used_by_data = patas_state.byte_writer.BytesWritten();
+				size_t packed_data_size   = patas_state.packed_data_buffer.index * sizeof(uint16_t);
+				compressed_data_size += (alp_bench::AlignValue(bytes_used_by_data) + packed_data_size) * 8;
+				compressed_data_size += patas_overhead_per_vector;
+
+				// Init decoding
+				byte_reader.SetStream(data_arr);
+
+				/*
+				 *
+				 * DECODE
+				 *
+				 */
+				// UNPACKING METADATA (16 bits)
+				for (size_t i = 0; i < alp::config::VECTOR_SIZE; i++) {
+					alp_bench::PackedDataUtils<INNERTYPE>::Unpack(packed_metadata[i],
+					                                              (alp_bench::UnpackedData&)unpacked_data[i]);
+				}
+
+				// USING UNPACKED METADATA AND DATA BUFFER WE LOAD THE DOUBLE VALUES
+				dec_arr[0] = (uint64_t)0;
+				for (size_t i = 0; i < alp::config::VECTOR_SIZE; i++) {
+					dec_arr[i] = alp_bench::patas::PatasDecompression<INNERTYPE>::DecompressValue(
+					    byte_reader,
+					    unpacked_data[i].significant_bytes,
+					    unpacked_data[i].trailing_zeros,
+					    dec_arr[i - unpacked_data[i].index_diff]);
+				}
+
+				for (size_t j = 0; j < alp::config::VECTOR_SIZE; j++) {
+					if (uint64_p[j] != dec_arr[j]) {
+						std::cout << j << ", " << rowgroup_offset << ", " << dataset.name << std::endl;
+					}
+					ASSERT_EQ(uint64_p[j], dec_arr[j]);
+				}
+				vector_idx = 0;
+				vectors_count += 1;
+			}
+			auto processed_tuples  = vectors_count * alp::config::VECTOR_SIZE;
+			auto compression_ratio = (double)compressed_data_size / processed_tuples;
+
+			ofile << std::fixed << std::setprecision(2) << dataset.name << "," << compression_ratio << ","
+			      << vectors_count << std::endl;
+		}
+
+		delete[] dbl_arr;
+		delete[] dec_arr;
+	}
+};
+
+TEST_F(patas_test, test_patas128_on_whole_datasets) {
+	auto result_path = alp_bench::get_paths().result_dir_path + "compression_ratio_result/double/patas.csv";
+	bench_compression_ratio<double>(alp_bench::get_alp_dataset(), result_path);
+}
+
+TEST_F(patas_test, test_patas128_on_float_datasets) {
+	auto result_path = alp_bench::get_paths().result_dir_path + "compression_ratio_result/float/patas.csv";
+	bench_compression_ratio<float>(alp_bench::get_sp_datasets(), result_path);
 }
 // NOLINTEND
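
Note (not part of the patch): a minimal, self-contained sketch of the type mapping the new templated bench_compression_ratio relies on. Patas operates on raw bit patterns, so double inputs are handled as uint64_t words and float inputs as uint32_t words (the INNERTYPE alias in the diff), and decompression must reproduce those bit patterns exactly. The helper names inner_type_t and roundtrip_bits below are hypothetical, invented only for this illustration; they are not part of alp_bench.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <type_traits>

// Hypothetical mirror of the INNERTYPE alias used in the patch:
// double -> uint64_t, float -> uint32_t.
template <typename T>
using inner_type_t = std::conditional_t<std::is_same_v<T, double>,
                                        uint64_t,
                                        std::conditional_t<std::is_same_v<T, float>, uint32_t, void>>;

// Copy a value's bits into the matching unsigned integer and back,
// mimicking how the benchmark feeds bit patterns to Patas and then
// checks that decompression returns them unchanged.
template <typename T>
T roundtrip_bits(T value) {
	inner_type_t<T> bits;
	static_assert(sizeof(bits) == sizeof(value), "integer width must match the float width");
	std::memcpy(&bits, &value, sizeof(bits)); // well-defined alternative to reinterpret_cast
	T decoded;
	std::memcpy(&decoded, &bits, sizeof(decoded));
	return decoded;
}

int main() {
	assert(roundtrip_bits(3.1415926535) == 3.1415926535); // double path, 64-bit words
	assert(roundtrip_bits(2.71828f) == 2.71828f);         // float path, 32-bit words
	return 0;
}

The sketch uses std::memcpy to stay free of strict-aliasing concerns; the benchmark itself casts the vector buffer with reinterpret_cast because it processes 1024 values in place per vector.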