From a432fca2f4ca66fd12b799740153ee5895af57f9 Mon Sep 17 00:00:00 2001
From: azim <afroozeh3@gmail.com>
Date: Sun, 1 Dec 2024 22:42:25 +0100
Subject: [PATCH] add a guide to how to benchmark your own data

---
 BENCHMARKING.md                        |  30 +------
 benchmarks/CMakeLists.txt              |  10 +--
 benchmarks/your_own_dataset.csv        |   1 -
 benchmarks/your_own_dataset_result.csv |   1 -
 how_to_benchmark_your_dataset.md       | 109 +++++++++++++++++++++++++
 5 files changed, 113 insertions(+), 38 deletions(-)
 create mode 100644 how_to_benchmark_your_dataset.md

diff --git a/BENCHMARKING.md b/BENCHMARKING.md
index f014768..ddd17b5 100644
--- a/BENCHMARKING.md
+++ b/BENCHMARKING.md
@@ -148,32 +148,4 @@ We benchmarked PseudoDecimals within BtrBlocks. Results are located on `publicat
 
 ### ELF Speed Test
 
-We benchmarked Elf using their Java implementation.
-
-
-
-## How to benchmark with your own data
-
-## Build
-
-```shell
-cmake [OPTIONS] .
-make
-```
-
-### Setup Data
-
-Inside `data/include/double_columns.hpp` you can find an array containing information regarding the datasets used to
-benchmark ALP. Datasets information includes a path to a sample of one vector (1024 values) in CSV format (inside
-`/data/samples/`) and a path to the entire file in binary format.
-
-The binary file is used to benchmark ALP compression ratios, while the CSV sample is used to benchmark ALP speed. To
-ensure the correctness of the speed tests we also keep extra variables from each dataset, which include the number of
-exceptions and the bitwidth resulting after compression (unless the algorithm changes, these should remain consistent),
-and the factor/exponent indexes used to encode/decode the doubles into integers.
-
-To set up the data you want to run the test on, add or remove entries in the array found
-in [double_columns.hpp](/data/include/double/double_dataset.hpp) and `make` again. The data needed for each entry is detailed
-in [column.hpp](/data/include/column.hpp). To replicate the compression ratio tests you only need to set the dataset id,
-name, and binary_file_path.
-
+We benchmarked Elf using their Java implementation.
\ No newline at end of file
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 0fbae57..61be9d8 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -2,18 +2,14 @@ if (NOT DEFINED ENV{ALP_DATASET_DIR_PATH})
     message(WARNING "Set ALP_DATASET_DIR_PATH environment variable")
     message(WARNING "Set HURRICANE_ISABEL_DATASET_DIR_PATH" environment variable)
 else ()
-
+    add_executable(test_compression_ratio test_compression_ratio.cpp)
+    target_link_libraries(test_compression_ratio PUBLIC ALP gtest_main)
+    gtest_discover_tests(test_compression_ratio)
 endif ()
 
 add_subdirectory(bench_speed)
 
-add_executable(test_compression_ratio test_compression_ratio.cpp)
-target_link_libraries(test_compression_ratio PUBLIC ALP gtest_main)
-gtest_discover_tests(test_compression_ratio)
-
-
 add_executable(bench_your_dataset bench_your_dataset.cpp)
 target_link_libraries(bench_your_dataset PUBLIC ALP gtest_main)
 gtest_discover_tests(bench_your_dataset)
 
-
diff --git a/benchmarks/your_own_dataset.csv b/benchmarks/your_own_dataset.csv
index 87cbec1..0f85fa1 100644
--- a/benchmarks/your_own_dataset.csv
+++ b/benchmarks/your_own_dataset.csv
@@ -1,2 +1 @@
 id,column_name,data_type,path,file_type
-0,CLOUDf48.bin.f32,float,/Users/azim/CLionProjects/ALP/100x500x500/CLOUDf48.bin.f32,binary
\ No newline at end of file
diff --git a/benchmarks/your_own_dataset_result.csv b/benchmarks/your_own_dataset_result.csv
index 349b8df..c22b484 100644
--- a/benchmarks/your_own_dataset_result.csv
+++ b/benchmarks/your_own_dataset_result.csv
@@ -1,2 +1 @@
 idx,column,data_type,size,rowgroups_count,vectors_count
-0,CLOUDf48.bin.f32,float,9.36,245,24414
diff --git a/how_to_benchmark_your_dataset.md b/how_to_benchmark_your_dataset.md
new file mode 100644
index 0000000..e06d38c
--- /dev/null
+++ b/how_to_benchmark_your_dataset.md
@@ -0,0 +1,109 @@
+# Using Your Own Data in ALP Benchmarks
+
+This guide explains how to set up and benchmark your own dataset using ALP.
+
+---
+
+## Step 1: Understand the Dataset Configuration Format
+
+The dataset configuration is provided in a [CSV file](benchmarks/your_own_dataset.csv), where each row describes a column in your dataset.
+
+Below is an example entry, followed by an explanation of each parameter:
+
+### Example:
+```csv
+id,column_name,data_type,path,file_type
+0,CLOUDf48.bin.f32,float,/Users/azim/CLionProjects/ALP/100x500x500/CLOUDf48.bin.f32,binary
+```
+
+### Parameters:
+1. **`id`**:
+   - A unique integer identifier for the column.
+   - Example: `0`
+
+2. **`column_name`**:
+   - A descriptive name for the column.
+   - Example: `CLOUDf48.bin.f32`
+
+3. **`data_type`**:
+   - The type of data in the column.
+   - Allowed values: `float`, `double`
+   - Example: `float`
+
+4. **`path`**:
+   - The absolute path to the data file for the column.
+   - Example: `/Users/azim/CLionProjects/ALP/100x500x500/CLOUDf48.bin.f32`
+
+5. **`file_type`**:
+   - The format of the data file.
+   - Allowed values: `binary`, `csv`
+   - Example: `binary`
+
+---
+
+## Step 2: Create Your Dataset Configuration File
+
+Edit the [CSV file](benchmarks/your_own_dataset.csv) to define your dataset using the format described above.
+
+### Example:
+```csv
+id,column_name,data_type,path,file_type
+0,AnotherDoubleColumn,double,/Users/azim/CLionProjects/ALP/another_double_column.csv,csv
+1,AnotherFloatColumn,float,/Users/azim/CLionProjects/ALP/another_float_column.bin,binary
+```
+
+---
+
+## Step 3: Build ALP with Benchmarking Enabled
+
+To enable benchmarking in ALP:
+
+1. Configure the build using CMake with the `ALP_BUILD_BENCHMARKING` option set to `ON`:
+   ```bash
+   cmake -DALP_BUILD_BENCHMARKING=ON -S . -B build
+   ```
+
+2. Build the project:
+   ```bash
+   cmake --build build
+   ```
+
+---
+
+## Step 4: Run the Benchmark
+
+Run the benchmark executable:
+
+```bash
+cd build
+./benchmarks/bench_your_dataset
+```
+
+---
+
+## Step 5: Analyze the Results
+
+The benchmark tool saves the results to [`benchmarks/your_own_dataset_result.csv`](benchmarks/your_own_dataset_result.csv).
+
+The results include the following columns:
+- **`idx`**: Column index.
+- **`column`**: Column name.
+- **`data_type`**: Data type (`float`, `double`).
+- **`size`**: Number of bits per value used to encode this column.
+- **`rowgroups_count`**: Number of row groups. A row group is composed of 100 vectors.
+- **`vectors_count`**: Number of vectors. A vector always has 1024 values.
+
+---
+
+### Notes
+
+1. Ensure all file paths in your dataset configuration are valid and accessible.
+2. Verify that the `data_type` and `file_type` values in the CSV match the format of your data files.
+3. If benchmarking fails, check the logs for errors such as missing files or unsupported formats.
+
+---
+
+By following these steps, you can configure and benchmark your own datasets in ALP, allowing you to evaluate ALP's performance with your data.
+
+We would love to hear about your data and results, so please share them with us. Your feedback can help improve ALP further.
+