diff --git a/settings/init.sql b/settings/init.sql
index 382e6fe..f8adb47 100644
--- a/settings/init.sql
+++ b/settings/init.sql
@@ -5,10 +5,11 @@ set hive.optimize.reducededuplication.min.reducer=1;
 set hive.optimize.mapjoin.mapreduce=true;
 set hive.stats.autogather=true;
 
-set mapred.reduce.parallel.copies=30;
-set mapred.job.shuffle.input.buffer.percent=0.5;
-set mapred.job.reduce.input.buffer.percent=0.2;
-set mapred.map.child.java.opts=-server -Xmx2800m -Djava.net.preferIPv4Stack=true;
-set mapred.reduce.child.java.opts=-server -Xmx3800m -Djava.net.preferIPv4Stack=true;
-set mapreduce.map.memory.mb=3072;
-set mapreduce.reduce.memory.mb=4096;
+-- Commented out because we are using the Tez engine; uncomment when running on MapReduce.
+-- set mapred.reduce.parallel.copies=30;
+-- set mapred.job.shuffle.input.buffer.percent=0.5;
+-- set mapred.job.reduce.input.buffer.percent=0.2;
+-- set mapred.map.child.java.opts=-server -Xmx2800m -Djava.net.preferIPv4Stack=true;
+-- set mapred.reduce.child.java.opts=-server -Xmx3800m -Djava.net.preferIPv4Stack=true;
+-- set mapreduce.map.memory.mb=3072;
+-- set mapreduce.reduce.memory.mb=4096;
diff --git a/tpch-setup.sh b/tpch-setup.sh
index 021b253..addee92 100755
--- a/tpch-setup.sh
+++ b/tpch-setup.sh
@@ -1,10 +1,50 @@
 #!/bin/bash
 
 function usage {
-	echo "Usage: tpch-setup.sh scale_factor [temp_directory]"
-	exit 1
+    # Print the command-line help, then exit with status 1.
+    echo " Usage: tpch-setup.sh [--cli CLI] [--jdbc URL] [--tempdir DIR] scale_factor"
+    echo " This script will generate and optimize data for Hive server benchmark testing."
+    echo " "
+    echo -e " --cli\t\tCLI to use for Hive. Options are 'beeline' or 'hive'. Default is 'hive'."
+    echo " "
+    echo -e " --jdbc\tRequired when using the beeline CLI. This is the server for the\n\t\tdatabase connection string."
+    echo " "
+    echo -e " --tempdir\tOptional parameter for data generation path."
+    echo " "
+    echo -e " scale_factor\tScale factor for data generation in GB."
+    exit 1
+
 }
 
+# Get options. Flags may appear in any order; any other argument is taken as the scale factor (last one wins).
+while test $# -gt 0; do
+    case "$1" in
+        -h|--help)
+            (usage)  # usage exits 1; the subshell contains that so --help can exit 0
+            exit 0
+            ;;
+        --cli)
+            shift
+            CLITYPE="$1"
+            shift
+            ;;
+        --jdbc)
+            shift
+            URL="$1"
+            shift
+            ;;
+        --tempdir)
+            shift
+            DIR="$1"
+            shift
+            ;;
+        *)
+            SCALE="$1"
+            shift
+            ;;
+    esac
+done
+
 function runcommand {
 	if [ "X$DEBUG_SCRIPT" != "X" ]; then
 		$1
@@ -17,6 +57,20 @@ if [ ! -f tpch-gen/target/tpch-gen-1.0-SNAPSHOT.jar ]; then
 	echo "Please build the data generator with ./tpch-build.sh first"
 	exit 1
 fi
+
+# if no CLI is supplied, default to hive
+if [ "X$CLITYPE" == "X" ]; then
+    CLITYPE="hive"
+fi
+# The beeline commands below all pass -u URL, so --jdbc is mandatory in that mode.
+if [ "$CLITYPE" == "beeline" ]; then
+    if [ "X$URL" == "X" ]; then
+        echo "Server URL must be supplied if attempting to run beeline CLI"
+        usage
+        exit 1
+    fi
+fi
+
 which hive > /dev/null 2>&1
 if [ $? -ne 0 ]; then
 	echo "Script must be run where Hive is installed"
@@ -25,10 +79,6 @@ fi
 
 # Tables in the TPC-H schema.
 TABLES="part partsupp supplier customer orders lineitem nation region"
-
-# Get the parameters.
-SCALE=$1
-DIR=$2
 BUCKETS=13
 if [ "X$DEBUG_SCRIPT" != "X" ]; then
 	set -x
@@ -62,8 +112,12 @@ echo "TPC-H text data generation complete."
 
 # Create the text/flat tables as external tables. These will be later be converted to ORCFile.
 echo "Loading text data into external tables."
-runcommand "hive -i settings/load-flat.sql -f ddl-tpch/bin_flat/alltables.sql -d DB=tpch_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"
+if [ "$CLITYPE" == "beeline" ]; then
+    runcommand "beeline -u ${URL} -i settings/load-flat.sql --silent=true --hivevar DB=tpch_text_${SCALE} --hivevar LOCATION=${DIR}/${SCALE} -f ddl-tpch/bin_flat/alltables.sql"
+else
+    runcommand "hive -i settings/load-flat.sql -f ddl-tpch/bin_flat/alltables.sql -d DB=tpch_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"
+fi
 
 # Create the optimized tables.
 i=1
 total=8
@@ -77,23 +131,32 @@ fi
 DATABASE=tpch_${SCHEMA_TYPE}_orc_${SCALE}
 MAX_REDUCERS=2600 # ~7 years of data
 REDUCERS=$((test ${SCALE} -gt ${MAX_REDUCERS} && echo ${MAX_REDUCERS}) || echo ${SCALE})
-
 for t in ${TABLES}
 do
-	echo "Optimizing table $t ($i/$total)."
-	COMMAND="hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql \
-	    -d DB=${DATABASE} \
-	    -d SOURCE=tpch_text_${SCALE} -d BUCKETS=${BUCKETS} \
-	    -d SCALE=${SCALE} -d REDUCERS=${REDUCERS} \
-	    -d FILE=orc"
-	runcommand "$COMMAND"
-	if [ $? -ne 0 ]; then
-		echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running"
-		exit 1
-	fi
-	i=`expr $i + 1`
+    echo "Optimizing table $t ($i/$total)."
+    if [ "$CLITYPE" == "beeline" ]; then
+        COMMAND="beeline -u ${URL} -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql \
+            --silent=true --hivevar DB=${DATABASE} \
+            --hivevar SOURCE=tpch_text_${SCALE} --hivevar BUCKETS=${BUCKETS} \
+            --hivevar SCALE=${SCALE} --hivevar REDUCERS=${REDUCERS} \
+            --hivevar FILE=orc"
+    else
+        COMMAND="hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql \
+            -d DB=${DATABASE} \
+            -d SOURCE=tpch_text_${SCALE} -d BUCKETS=${BUCKETS} \
+            -d SCALE=${SCALE} -d REDUCERS=${REDUCERS} \
+            -d FILE=orc"
+    fi
+    runcommand "$COMMAND"
+    if [ $? -ne 0 ]; then
+        echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running"
+        exit 1
+    fi
+    i=$((i + 1))
 done
-
-hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql --database ${DATABASE};
-
+if [ "$CLITYPE" == "beeline" ]; then
+    beeline -u "${URL}" -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql; # NOTE(review): unlike the hive branch, no database is selected here - confirm analyze.sql runs against ${DATABASE}
+else
+    hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql --database ${DATABASE};
+fi
 echo "Data loaded into database ${DATABASE}."