Based on: Delta Lake: High-Performance ACID Table Storage over Cloud Object Stores
git submodule init
git submodule update
sudo apt-get install unzip zip gcc make flex bison byacc git build-essential -y
curl -s "https://get.sdkman.io" | bash
source "$HOME/.sdkman/bin/sdkman-init.sh"
sdk install java 8.0.302-open
sdk install sbt 0.13.18
wget https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
mkdir spark3
tar -xzf spark-3.2.0-bin-hadoop3.2.tgz --strip 1 -C spark3
NOTE: Deploy Spark in standalone cluster mode.
wget -P /tmp https://github.com/sbt/sbt/releases/download/v0.13.18/sbt-0.13.18.tgz
tar -xf /tmp/sbt-0.13.18.tgz -C /tmp
cd spark-sql-perf
cp /tmp/sbt/bin/sbt-launch.jar build/sbt-launch-0.13.18.jar
bin/run
sbt +package
cd ../tpcds-kit/tools
make OS=LINUX
cd ../../tpch-dbgen
git checkout 0469309147b42abac8857fa61b4cf69a6d3128a8 -- bm_utils.c
make
NOTE: This should be installed at the same location on all cluster nodes, and tpcds-kit and tpch-dbgen must be built on each of them.
NOTE: Change master-ip, executor-memory, num-executors, and executor-cores in the .sh files according to your machine specifications.
cd ../tpch
#For generating ~100GB parquet data
./gendata_parquet.sh
# For running all 22 TPC-H Queries
./runtpch_parquet.sh
cd ../tpch
#For generating ~100GB orc data
./gendata_orc.sh
# For running all 22 TPC-H Queries
./runtpch_orc.sh
cd ../tpch
#For generating ~100GB csv data
./gendata_csv.sh
# For running all 22 TPC-H Queries
./runtpch_csv.sh
cd ../tpch
# For generating ~100GB json data
./gendata_json.sh
# For running all 22 TPC-H Queries
./runtpch_json.sh
cd ../tpcds
#For generating ~100GB parquet data
./gendata_parquet.sh
# For running all 99 TPC-DS Queries
./runtpch_parquet.sh
cd ../tpcds
#For generating ~100GB orc data
./gendata_orc.sh
# For running all 99 TPC-DS Queries
./runtpch_orc.sh
cd ../tpcds
#For generating ~100GB csv data
./gendata_csv.sh
# For running all 99 TPC-DS Queries
./runtpch_csv.sh
cd ../tpcds
# For generating ~100GB json data
./gendata_json.sh
# For running all 99 TPC-DS Queries
./runtpch_json.sh
## 5. Reports
### 5.1 TPCH reports
```bash
cd tpch/tpch_<parquet,orc,csv>_reports
# result will be present in part*.csv file
```
### 5.2 TPCDS reports
```bash
cd tpcds/tpcds_<parquet,orc,csv,json>_reports
# result will be present in part*.csv file
```