Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hdp26 #13

Open
wants to merge 3 commits into
base: hdp26
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions settings/init.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ set hive.optimize.reducededuplication.min.reducer=1;
set hive.optimize.mapjoin.mapreduce=true;
set hive.stats.autogather=true;

set mapred.reduce.parallel.copies=30;
set mapred.job.shuffle.input.buffer.percent=0.5;
set mapred.job.reduce.input.buffer.percent=0.2;
set mapred.map.child.java.opts=-server -Xmx2800m -Djava.net.preferIPv4Stack=true;
set mapred.reduce.child.java.opts=-server -Xmx3800m -Djava.net.preferIPv4Stack=true;
set mapreduce.map.memory.mb=3072;
set mapreduce.reduce.memory.mb=4096;
-- Commented out since we are using the Tez engine.
-- set mapred.reduce.parallel.copies=30;
-- set mapred.job.shuffle.input.buffer.percent=0.5;
-- set mapred.job.reduce.input.buffer.percent=0.2;
-- set mapred.map.child.java.opts=-server -Xmx2800m -Djava.net.preferIPv4Stack=true;
-- set mapred.reduce.child.java.opts=-server -Xmx3800m -Djava.net.preferIPv4Stack=true;
-- set mapreduce.map.memory.mb=3072;
-- set mapreduce.reduce.memory.mb=4096;
109 changes: 86 additions & 23 deletions tpch-setup.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,50 @@
#!/bin/bash

#######################################
# Print the help text for tpch-setup.sh and terminate the script.
# Arguments: $1 - optional exit status; defaults to 1 so existing
#                 callers that pass nothing keep their behavior.
# Outputs:   help text on stdout
# Returns:   never returns; exits the shell.
#######################################
function usage {
  echo " Usage: tpch-setup.sh [--cli hive|beeline] [--jdbc url] [--tempdir dir] scale_factor"
  echo " This script will generate and optimize data for Hive server benchmark testing."
  echo " "
  echo -e " --cli\t\tCLI to use for Hive. Options are 'beeline' or 'hive'. Default is 'hive'."
  echo " "
  echo -e " --jdbc\tRequired when using the beeline CLI. This is the server for the\n\t\tdatabase connection string."
  echo " "
  echo -e " --tempdir\tOptional parameter for data generation path."
  echo " "
  echo -e " scale_factor\tScale factor for data generation in GB."
  exit "${1:-1}"
}

# Walk the command-line arguments and record every recognised option.
# Flags that take a value consume two arguments; anything that is not a
# known flag is stored as the scale factor.
until [ "$#" -eq 0 ]; do
  case "$1" in
    --cli)
      CLITYPE="$2"
      shift
      shift
      ;;
    --jdbc)
      URL="$2"
      shift
      shift
      ;;
    --tempdir)
      DIR="$2"
      shift
      shift
      ;;
    --help|-h)
      usage
      exit 0
      ;;
    *)
      SCALE="$1"
      shift
      ;;
  esac
done

function runcommand {
if [ "X$DEBUG_SCRIPT" != "X" ]; then
$1
Expand All @@ -17,6 +57,20 @@ if [ ! -f tpch-gen/target/tpch-gen-1.0-SNAPSHOT.jar ]; then
echo "Please build the data generator with ./tpch-build.sh first"
exit 1
fi

# If no CLI is supplied, default to hive.
# (A plain assignment: writing `$CLITYPE=...` would be executed as a command.)
CLITYPE="${CLITYPE:-hive}"

# beeline cannot connect without the server URL given via --jdbc, so stop
# with the usage text when it is missing.
if [ "$CLITYPE" == "beeline" ] && [ -z "$URL" ]; then
  echo "Server URL must be supplied if attempting to run beeline CLI"
  usage
  exit 1
fi

which hive > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "Script must be run where Hive is installed"
Expand All @@ -25,10 +79,6 @@ fi

# Tables in the TPC-H schema.
TABLES="part partsupp supplier customer orders lineitem nation region"

# SCALE and DIR come from the option-parsing loop at the top of the script.
# They must not be re-read from $1/$2 here: every argument has already been
# shifted away, so doing so would blank out the parsed values.

# Bucket count handed to the table DDL as the BUCKETS variable.
BUCKETS=13
if [ "X$DEBUG_SCRIPT" != "X" ]; then
set -x
Expand Down Expand Up @@ -62,8 +112,12 @@ echo "TPC-H text data generation complete."

# Register the generated text files as external tables; they are converted
# to ORCFile in a later step. The DDL script is the same for both CLIs, only
# the way variables are passed differs.
echo "Loading text data into external tables."

case "$CLITYPE" in
  beeline)
    runcommand "beeline -u ${URL} -i settings/load-flat.sql --silent=true --hivevar DB=tpch_text_${SCALE} --hivevar LOCATION=${DIR}/${SCALE} -f ddl-tpch/bin_flat/alltables.sql"
    ;;
  *)
    runcommand "hive -i settings/load-flat.sql -f ddl-tpch/bin_flat/alltables.sql -d DB=tpch_text_${SCALE} -d LOCATION=${DIR}/${SCALE}"
    ;;
esac
# Create the optimized tables.
i=1
total=8
Expand All @@ -77,23 +131,32 @@ fi
# Name of the target database for the optimized tables.
DATABASE=tpch_${SCHEMA_TYPE}_orc_${SCALE}

# Upper bound on the reducer count.
MAX_REDUCERS=2600 # ~7 years of data

# Use the scale factor as the reducer count, capped at MAX_REDUCERS.
# If the comparison itself fails, the scale value is used as-is.
REDUCERS=$(
  if test ${SCALE} -gt ${MAX_REDUCERS}; then
    echo ${MAX_REDUCERS}
  else
    echo ${SCALE}
  fi
)

# Build the optimized copy of every TPC-H table, one table per iteration,
# using whichever CLI was selected. Each table is processed exactly once and
# the progress counter advances once per table.
for t in ${TABLES}
do
  echo "Optimizing table $t ($i/$total)."
  if [ "$CLITYPE" == "beeline" ]; then
    # beeline passes substitution variables with --hivevar.
    COMMAND="beeline -u ${URL} -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql \
      --silent=true --hivevar DB=${DATABASE} \
      --hivevar SOURCE=tpch_text_${SCALE} --hivevar BUCKETS=${BUCKETS} \
      --hivevar SCALE=${SCALE} --hivevar REDUCERS=${REDUCERS} \
      --hivevar FILE=orc"
  else
    # The hive CLI passes the same variables with -d.
    COMMAND="hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql \
      -d DB=${DATABASE} \
      -d SOURCE=tpch_text_${SCALE} -d BUCKETS=${BUCKETS} \
      -d SCALE=${SCALE} -d REDUCERS=${REDUCERS} \
      -d FILE=orc"
  fi
  # Stop at the first table that fails rather than continuing with a partial load.
  if ! runcommand "$COMMAND"; then
    echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running"
    exit 1
  fi
  i=$((i + 1))
done

# Run the analyze script once, through whichever CLI was selected.
if [ "$CLITYPE" == "beeline" ]; then
  # NOTE(review): unlike the hive call below, no database is passed here;
  # confirm that the JDBC URL (or analyze.sql itself) selects ${DATABASE}.
  beeline -u ${URL} -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql;
else
  hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql --database ${DATABASE};
fi
echo "Data loaded into database ${DATABASE}."