
Commit

add zstd for float
azimafroozeh committed Nov 16, 2024
1 parent 98ccea6 commit 716f3d7
Showing 3 changed files with 66 additions and 46 deletions.
5 changes: 5 additions & 0 deletions publication/compression_ratio_result/float/zstd.csv
@@ -0,0 +1,5 @@
+dataset,size
+Dino-Vitb16,39.57
+GPT2,39.76
+Grammarly-lg,39.37
+WAV2VEC,43.15
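Note that `size` here is not a file size: in the benchmark below, `compressed_data_size` accumulates `ENC_SIZE * 8` (compressed bytes converted to bits) and is divided by `processed_tuples`, so the column reports average compressed bits per value. A worked example with made-up numbers: a rowgroup of 100,000 values that zstd compresses to 494,625 bytes would be reported as 494,625 * 8 / 100,000 ≈ 39.57.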
107 changes: 61 additions & 46 deletions publication/source_code/bench_compression_ratio/zstd.cpp
@@ -6,81 +6,96 @@

 class zstd_test : public ::testing::Test {
 public:
-    double* dbl_arr;
-    void*   enc_dbl_arr;
-    void*   dec_dbl_arr;
-    size_t  zstd_vector_size =
+    void*  enc_dbl_arr;
+    void*  dec_dbl_arr;
+    size_t zstd_vector_size =
         alp::config::ROWGROUP_SIZE; // For Zstd we compress rowgroups since it would be unfair to compress small vectors
     size_t enc_size_upper_bound = zstd_vector_size * 8;
     size_t input_size           = zstd_vector_size * 8;
     size_t dec_size             = input_size;
 
     void SetUp() override {
-        dbl_arr     = new double[zstd_vector_size];
         enc_dbl_arr = malloc(input_size);
         dec_dbl_arr = malloc(input_size);
     }
 
     ~zstd_test() override {
-        delete[] dbl_arr;
         free(enc_dbl_arr);
         free(dec_dbl_arr);
     }
-};
 
-TEST_F(zstd_test, test_zstd_on_whole_datasets) {
-    std::ofstream ofile(alp_bench::get_paths().result_dir_path + "zstd_compression_ratio.csv", std::ios::out);
-    ofile << "dataset,size\n";
+    template <typename T, int N_DATASETS>
+    void bench_compression_ratio(const std::array<alp_bench::Column, N_DATASETS>& datasets, const std::string& path) {
+        if (const auto v = std::getenv("ALP_DATASET_DIR_PATH"); v != nullptr) {
+            alp_bench::get_paths().alp_dataset_binary_dir_path = *v;
+        }
 
-    for (auto& dataset : alp_bench::get_alp_dataset()) {
-        if (dataset.name.find("bw") != std::string::npos) { continue; }
+        std::ofstream ofile(path, std::ios::out);
+        ofile << "dataset,size\n";
 
-        size_t tuples_count;
-        const auto* data_column = mapper::mmap_file<double>(tuples_count, dataset.binary_file_path);
-        double value_to_encode = 0.0;
-        size_t vector_idx {0};
-        size_t processed_tuples = 0;
+        auto* dbl_arr = new T[zstd_vector_size];
 
-        size_t compressed_data_size = 0;
+        for (auto& dataset : datasets) {
+            if (dataset.name.find("bw") != std::string::npos) { continue; }
+            std::cout << dataset.name << std::endl;
 
-        std::cout << dataset.name << "\n";
+            size_t      tuples_count;
+            const auto* data_column = mapper::mmap_file<T>(tuples_count, dataset.binary_file_path);
+            T           value_to_encode = 0.0;
+            size_t      vector_idx {0};
+            size_t      processed_tuples     = 0;
+            size_t      compressed_data_size = 0;
 
-        if (tuples_count < zstd_vector_size) {
-            zstd_vector_size     = tuples_count;
-            input_size           = zstd_vector_size * 8;
-            enc_size_upper_bound = zstd_vector_size * 8;
-        }
+            if (tuples_count < zstd_vector_size) {
+                zstd_vector_size     = tuples_count;
+                input_size           = zstd_vector_size * 8;
+                enc_size_upper_bound = zstd_vector_size * 8;
+            }
 
-        /* Encode - Decode - Validate. */
-        for (size_t i = 0; i < tuples_count; i++) {
-            value_to_encode     = data_column[i];
-            dbl_arr[vector_idx] = value_to_encode;
-            vector_idx          = vector_idx + 1;
+            /* Encode - Decode - Validate. */
+            for (size_t i = 0; i < tuples_count; i++) {
+                value_to_encode     = data_column[i];
+                dbl_arr[vector_idx] = value_to_encode;
+                vector_idx          = vector_idx + 1;
 
-            if (vector_idx != zstd_vector_size) { continue; }
+                if (vector_idx != zstd_vector_size) { continue; }
 
-            processed_tuples += zstd_vector_size;
+                processed_tuples += zstd_vector_size;
 
-            // Encode
-            size_t const ENC_SIZE = ZSTD_compress(enc_dbl_arr, enc_size_upper_bound, dbl_arr, input_size, 3); // Level 3
+                // Encode
+                size_t const ENC_SIZE =
+                    ZSTD_compress(enc_dbl_arr, enc_size_upper_bound, dbl_arr, input_size, 3); // Level 3
 
-            // SUM COMPRESSED SIZE
-            compressed_data_size += ENC_SIZE * 8;
+                // SUM COMPRESSED SIZE
+                compressed_data_size += ENC_SIZE * 8;
 
-            // Decode
-            ZSTD_decompress(dec_dbl_arr, dec_size, enc_dbl_arr, ENC_SIZE);
+                // Decode
+                ZSTD_decompress(dec_dbl_arr, dec_size, enc_dbl_arr, ENC_SIZE);
 
-            const auto* dec_dbl_arr_tmp = static_cast<double*>(dec_dbl_arr);
-            for (size_t j = 0; j < zstd_vector_size; ++j) {
-                const auto l = dbl_arr[j];
-                if (const auto r = dec_dbl_arr_tmp[j]; l != r) { std::cerr << j << ", " << dataset.name << "\n"; }
-                ASSERT_EQ(dbl_arr[j], dec_dbl_arr_tmp[j]);
+                const auto* dec_dbl_arr_tmp = static_cast<T*>(dec_dbl_arr);
+                for (size_t j = 0; j < zstd_vector_size; ++j) {
+                    const auto l = dbl_arr[j];
+                    if (const auto r = dec_dbl_arr_tmp[j]; l != r) { std::cerr << j << ", " << dataset.name << "\n"; }
+                    ASSERT_EQ(dbl_arr[j], dec_dbl_arr_tmp[j]);
+                }
+                vector_idx = 0;
             }
-            vector_idx = 0;
-        }
 
-        auto compression_ratio = (double)compressed_data_size / processed_tuples;
+            auto compression_ratio = (double)compressed_data_size / processed_tuples;
 
-        ofile << std::fixed << std::setprecision(2) << dataset.name << "," << compression_ratio << "\n";
-    }
-}
+            ofile << std::fixed << std::setprecision(2) << dataset.name << "," << compression_ratio << "\n";
+        }
+
+        delete[] dbl_arr;
+    }
+};
+
+TEST_F(zstd_test, test_zstd128_on_whole_datasets) {
+    auto result_path = alp_bench::get_paths().result_dir_path + "compression_ratio_result/double/zstd.csv";
+    bench_compression_ratio<double, 30>(alp_bench::get_alp_dataset(), result_path);
+}
+
+TEST_F(zstd_test, test_zstd128_on_float_datasets) {
+    auto result_path = alp_bench::get_paths().result_dir_path + "compression_ratio_result/float/zstd.csv";
+    bench_compression_ratio<float, 4>(alp_bench::get_sp_datasets(), result_path);
+}
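One caveat worth flagging in the templated version: `input_size` and `enc_size_upper_bound` are still computed as `zstd_vector_size * 8`, i.e. 8 bytes per element, which only matches `double`; with `T = float`, `ZSTD_compress` is asked to read twice the byte size of the `dbl_arr` allocation. A minimal sketch, not the committed code, of how the sizing could instead be derived from `sizeof(T)`, using only the public zstd API (`ZSTD_compressBound`, `ZSTD_isError`, `ZSTD_getErrorName`); the helper name is hypothetical:

#include <zstd.h>

#include <cstdio>
#include <cstdlib>

// Hypothetical helper, for illustration only: round-trip one rowgroup of
// n_values elements of type T, sizing every buffer via sizeof(T).
template <typename T>
size_t roundtrip_rowgroup(const T* src, size_t n_values) {
    const size_t input_size = n_values * sizeof(T);           // exact byte size of the input
    const size_t bound      = ZSTD_compressBound(input_size); // worst-case compressed size
    void*        enc_buf    = malloc(bound);
    void*        dec_buf    = malloc(input_size);

    // Encode at level 3, matching the benchmark above.
    const size_t enc_size = ZSTD_compress(enc_buf, bound, src, input_size, 3);
    if (ZSTD_isError(enc_size)) { std::fprintf(stderr, "%s\n", ZSTD_getErrorName(enc_size)); }

    // Decode back into a buffer of the original byte size.
    const size_t dec_size = ZSTD_decompress(dec_buf, input_size, enc_buf, enc_size);
    if (ZSTD_isError(dec_size)) { std::fprintf(stderr, "%s\n", ZSTD_getErrorName(dec_size)); }

    free(enc_buf);
    free(dec_buf);
    return enc_size; // compressed bytes; multiply by 8 for the bits-per-value metric
}

With `sizeof(T)`-based sizing, the float path would feed zstd 4 bytes per value instead of 8, which would presumably also change the bits-per-value numbers written to the CSV above.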
