-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
changes to use hs2 interface and to run suite in a loop #5
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,26 +1,33 @@ | ||
#!/bin/bash | ||
#Script Usage : ./RunQueriesAndCollectPATData.sh SCALE_FACTOR CLUSTER_SSH_PASSWORD | ||
if [ $# -ne 2 ] | ||
if [ $# -lt 2 ] | ||
then | ||
echo "usage:./RunQueriesAndCollectPATData.sh SCALE_FACTOR CLUSTER_SSH_PASSWORD" | ||
echo "usage:./RunQueriesAndCollectPATData.sh SCALE_FACTOR CLUSTER_SSH_PASSWORD [RUN_ID]" | ||
exit 1 | ||
fi | ||
|
||
if [ -z "$3" ] | ||
then | ||
RUN_ID=1 | ||
else | ||
RUN_ID=$3 | ||
fi | ||
|
||
BENCH_HOME=$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd ); | ||
echo "\$BENCH_HOME is set to $BENCH_HOME"; | ||
|
||
BENCHMARK=hive-testbench | ||
|
||
RESULT_DIR=$BENCH_HOME/$BENCHMARK/results/ | ||
RESULT_DIR=$BENCH_HOME/$BENCHMARK/results_$RUN_ID/ | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we include everything about one run under a single dir? |
||
mkdir $RESULT_DIR | ||
chmod -R 777 $RESULT_DIR | ||
|
||
LOG_DIR=$BENCH_HOME/$BENCHMARK/logs/ | ||
LOG_DIR=$BENCH_HOME/$BENCHMARK/logs_$RUN_ID/ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we include everything about one run under a single dir? |
||
mkdir $LOG_DIR | ||
|
||
# Initialize log file for data loading times | ||
LOG_FILE_EXEC_TIMES="${BENCH_HOME}/${BENCHMARK}/logs/query_times.csv" | ||
LOG_FILE_EXEC_TIMES="${LOG_DIR}/query_times.csv" | ||
|
||
if [ ! -e "$LOG_FILE_EXEC_TIMES" ] | ||
then | ||
touch "$LOG_FILE_EXEC_TIMES" | ||
|
@@ -36,8 +43,8 @@ fi | |
|
||
for i in {1..22} | ||
do | ||
./GetPatData.sh $2 ./TpchQueryExecute.sh $1 $i tpch_query_$i | ||
./GetPatData.sh $2 ./TpchQueryExecute.sh $1 $i $RUN_ID $RUN_ID/tpch_query_$i | ||
done | ||
|
||
echo "collecting perf data" | ||
./CollectPerfData.sh $RESULT_DIR | ||
./CollectPerfData.sh $RUN_ID $RESULT_DIR |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash | ||
#usage: ./RunSingleQueryLoop QUERY_NUMBER REPEAT_COUNT SCALE_FACTOR CLUSTER_SSH_PASSWORD | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wrong usage. |
||
|
||
if [ $# -ne 3 ] | ||
then | ||
echo "Usage ./RunSuiteLoop REPEAT_COUNT SCALE_FACTOR CLUSTER_SSH_PASSWORD" | ||
exit 1 | ||
fi | ||
|
||
counter=0 | ||
while [ $counter -lt $1 ]; do | ||
STARTDATE="`date +%Y/%m/%d:%H:%M:%S`" | ||
STARTTIME="`date +%s`" | ||
REPEAT_COUNT=$1 | ||
let counter=counter+1 | ||
echo "Running Iteration $counter" | ||
RUN_ID=$counter | ||
for i in {1..22} | ||
do | ||
./GetPatData.sh $3 ./TpchQueryExecute.sh $2 $i $RUN_ID $RUN_ID/tpch_query_$i | ||
done | ||
done |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,14 +2,21 @@ | |
#usage: TpchQueryExecute.sh SCALE_FACTOR QUERY_NUMBER | ||
# This script runs the hive queries on the data generated from the tpch suite and reports query execution times | ||
|
||
if [ $# -ne 2 ] | ||
if [ $# -lt 2 ] | ||
then | ||
echo "Usage: ./TpchQueryExecute.sh SCALE_FACTOR QUERY_NUMBER" | ||
echo "Usage: ./TpchQueryExecute.sh SCALE_FACTOR QUERY_NUMBER [RUN_ID] [JDBC_CONNECTION_STRING]" | ||
exit 1 | ||
else | ||
SCALE="$1" | ||
fi | ||
|
||
if [ -z "$3" ] | ||
then | ||
RUN_ID=1 | ||
else | ||
RUN_ID=$3 | ||
fi | ||
|
||
# get home path | ||
BENCH_HOME=$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd ); | ||
echo "\$BENCH_HOME is set to $BENCH_HOME"; | ||
|
@@ -20,9 +27,9 @@ HIVE_SETTING=$BENCH_HOME/$BENCHMARK/sample-queries-tpch/testbench.settings | |
# Set path to tpc-h queries | ||
QUERY_DIR=$BENCH_HOME/$BENCHMARK/sample-queries-tpch | ||
|
||
RESULT_DIR=$BENCH_HOME/$BENCHMARK/results/ | ||
RESULT_DIR=$BENCH_HOME/$BENCHMARK/results_$RUN_ID/ | ||
|
||
PLAN_DIR=$BENCH_HOME/$BENCHMARK/plans/ | ||
PLAN_DIR=$BENCH_HOME/$BENCHMARK/plans_$RUN_ID/ | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same as above. Under single dir? |
||
if [ ! -d "$RESULT_DIR" ]; then | ||
mkdir $RESULT_DIR | ||
|
@@ -32,7 +39,10 @@ if [ ! -d "$PLAN_DIR" ]; then | |
mkdir $PLAN_DIR | ||
fi | ||
|
||
LOG_FILE_EXEC_TIMES="${BENCH_HOME}/${BENCHMARK}/logs/query_times.csv" | ||
LOG_DIR=$BENCH_HOME/$BENCHMARK/logs_$RUN_ID/ | ||
mkdir $LOG_DIR | ||
|
||
LOG_FILE_EXEC_TIMES="${BENCH_HOME}/${BENCHMARK}/logs_$RUN_ID/query_times.csv" | ||
|
||
if [ ! -e "$LOG_FILE_EXEC_TIMES" ] | ||
then | ||
|
@@ -69,8 +79,14 @@ STARTTIME="`date +%s`" # seconds since epochstart | |
echo "Hive query: ${2}" | ||
while [ $RETURN_VAL -ne 0 -a $EXECUTION_COUNT -lt $RETRY_COUNT ] | ||
do | ||
if [ -z $4 ] | ||
then | ||
CONNECTION_STRING="jdbc:hive2://localhost:10001/$DATABASE;transportMode=http" | ||
else | ||
CONNECTION_STRING=$4 | ||
fi | ||
|
||
timeout ${TIMEOUT} hive -i ${HIVE_SETTING} --database ${DATABASE} -d EXPLAIN="" -f ${QUERY_DIR}/tpch_query${2}.sql > ${RESULT_DIR}/${DATABASE}_query${j}.txt 2>&1 | ||
beeline -u ${CONNECTION_STRING} -i ${HIVE_SETTING} --hivevar EXPLAIN="" -f ${QUERY_DIR}/tpch_query${2}.sql > ${RESULT_DIR}/${DATABASE}_query${j}.txt 2>&1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: extra space at the start. |
||
RETURN_VAL=$? | ||
((EXECUTION_COUNT++)) | ||
|
||
|
@@ -89,5 +105,5 @@ STARTTIME="`date +%s`" # seconds since epochstart | |
DURATION="$(($DIFF_IN_SECONDS / 3600 ))h $((($DIFF_IN_SECONDS % 3600) / 60))m $(($DIFF_IN_SECONDS % 60))s" | ||
# log the times in load_time.csv file | ||
echo "Query${j},${DIFF_IN_SECONDS},${STARTTIME},${STOPTIME},${BENCHMARK},${DATABASE},${SCALE},${FILE_FORMAT},${STATUS}" >> ${LOG_FILE_EXEC_TIMES} | ||
hive -i ${HIVE_SETTING} --database ${DATABASE} -d EXPLAIN="explain" -f ${QUERY_DIR}/tpch_query${2}.sql > ${PLAN_DIR}/plan_${DATABASE}_query${j}.txt 2>&1 | ||
beeline -u ${CONNECTION_STRING} -i ${HIVE_SETTING} --hivevar EXPLAIN="explain" -f ${QUERY_DIR}/tpch_query${2}.sql > ${PLAN_DIR}/plan_${DATABASE}_query${j}.txt 2>&1 | ||
done |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,8 @@ fi | |
|
||
>${STATS_DIR}/tableinfo_${DATABASE}.txt; | ||
|
||
hive -d DB=${DATABASE} -f gettpchtablecounts.sql > ${STATS_DIR}/tablecounts_${DATABASE}.txt ; | ||
hive -d DB=${DATABASE} -f gettpchtableinfo.sql >> ${STATS_DIR}/tableinfo_${DATABASE}.txt ; | ||
CONNECTION_STRING="jdbc:hive2://localhost:10001/${DATABASE};transportMode=http" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will this work in case of failover? |
||
|
||
beeline -u ${CONNECTION_STRING} --hivevar DB=${DATABASE} -f $BENCH_HOME/$BENCHMARK/tpch-scripts/gettpchtablecounts.sql > ${STATS_DIR}/tablecounts_${DATABASE}.txt ; | ||
beeline -u ${CONNECTION_STRING} --hivevar DB=${DATABASE} -f $BENCH_HOME/$BENCHMARK/tpch-scripts/ gettpchtableinfo.sql >> ${STATS_DIR}/tableinfo_${DATABASE}.txt ; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
#/bin/bash | ||
#!/bin/bash | ||
|
||
function usage { | ||
echo "Usage: tpch-setup.sh scale_factor [temp_directory]" | ||
|
@@ -53,7 +53,7 @@ hdfs dfs -mkdir -p ${DIR} | |
hdfs dfs -ls ${DIR}/${SCALE}/lineitem > /dev/null | ||
if [ $? -ne 0 ]; then | ||
echo "Generating data at scale factor $SCALE." | ||
(cd tpch-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE}) | ||
(cd tpch-gen; hadoop jar target/*.jar -D mapreduce.map.memory.mb=8192 -d ${DIR}/${SCALE}/ -s ${SCALE}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should not hard code settings here. May be have a global variable or something if you really want. |
||
fi | ||
hdfs dfs -ls ${DIR}/${SCALE}/lineitem > /dev/null | ||
if [ $? -ne 0 ]; then | ||
|
@@ -65,7 +65,10 @@ echo "TPC-H text data generation complete." | |
DATAGENTIME="`date +%s`" | ||
# Create the text/flat tables as external tables. These will be later be converted to ORCFile. | ||
echo "Loading text data into external tables." | ||
runcommand "hive -i settings/load-flat.sql -f ddl-tpch/bin_flat/alltables.sql -d DB=tpch_text_${SCALE} -d LOCATION=${DIR}/${SCALE}" | ||
|
||
DATABASE=tpch_text_${SCALE} | ||
CONNECTION_STRING="jdbc:hive2://localhost:10001/$DATABASE;transportMode=http" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as above. |
||
runcommand "beeline -u ${CONNECTION_STRING} -i settings/load-flat.sql -f ddl-tpch/bin_flat/alltables.sql --hivevar DB=tpch_text_${SCALE} --hivevar LOCATION=${DIR}/${SCALE}" | ||
|
||
EXTERNALTABLELOAD="`date +%s`" | ||
# Create the optimized tables. | ||
|
@@ -79,15 +82,16 @@ else | |
fi | ||
|
||
DATABASE=tpch_${SCHEMA_TYPE}_orc_${SCALE} | ||
CONNECTION_STRING="jdbc:hive2://localhost:10001/$DATABASE;transportMode=http" | ||
|
||
for t in ${TABLES} | ||
do | ||
echo "Optimizing table $t ($i/$total)." | ||
COMMAND="hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql \ | ||
-d DB=${DATABASE} \ | ||
-d SOURCE=tpch_text_${SCALE} -d BUCKETS=${BUCKETS} \ | ||
-d SCALE=${SCALE} \ | ||
-d FILE=orc" | ||
COMMAND="beeline -u ${CONNECTION_STRING} -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql \ | ||
--hivevar DB=${DATABASE} \ | ||
--hivevar SOURCE=tpch_text_${SCALE} --hivevar BUCKETS=${BUCKETS} \ | ||
--hivevar SCALE=${SCALE} \ | ||
--hivevar FILE=orc" | ||
runcommand "$COMMAND" | ||
if [ $? -ne 0 ]; then | ||
echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running" | ||
|
@@ -98,7 +102,7 @@ done | |
|
||
ORCLOAD="`date +%s`" | ||
|
||
ANALYZE_COMMAND="hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql --database ${DATABASE}" | ||
ANALYZE_COMMAND="beeline -u ${CONNECTION_STRING} -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql" | ||
|
||
if $RUN_ANALYZE; then | ||
echo "Running analyze" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We currently zip the full path into the archive (e.g. home/hdiuser/hive-testbench/PerfData_2/pat/tpch_query_2/...). Can we correct the zipping so it does not include the unnecessary /hdiuser/hive-testbench/ prefix?