Patch summary (free text ahead of the first "diff --git" header; patch(1) and git-apply skip leading text).
 * CollectPerfData.sh, GetPatData.sh, RunQueriesAndCollectPATData.sh, TpchQueryExecute.sh: accept a RUN_ID (default 1) and suffix the results_, logs_, plans_ and PerfData_ paths with it.
 * RunSuiteLoop.sh (new): runs the 22 TPC-H queries REPEAT_COUNT times, using the iteration number as RUN_ID.
 * TpchQueryExecute.sh, ValidateDataGen.sh, tpch-setup.sh: replace the hive CLI with beeline -u <JDBC URL>; -d becomes --hivevar.
 * tpch-setup.sh: fix the shebang and pass -D mapreduce.map.memory.mb=8192 to the data generator.
diff --git a/tpch-scripts/CollectPerfData.sh b/tpch-scripts/CollectPerfData.sh index 39cf86a..d628829 100644 --- a/tpch-scripts/CollectPerfData.sh +++ b/tpch-scripts/CollectPerfData.sh @@ -6,29 +6,36 @@ BENCHMARK=hive-testbench if [ $# -eq 0 ] then - echo "Usage ./CollectPerfData.sh RESULTS_DIR PERFDATA_OUTPUTDIR SERVER" + echo "Usage ./CollectPerfData.sh RUN_ID RESULTS_DIR PERFDATA_OUTPUTDIR SERVER" echo "Default Values will be used if you do not provide command line parameters" fi if [ -z $1 ] then - RESULTS_DIR=$BENCH_HOME/$BENCHMARK/results/ + RUN_ID=1 else - RESULTS_DIR=$1 + RUN_ID=$1 fi if [ -z $2 ] then - PERFDATA_OUTPUTDIR=$BENCH_HOME/$BENCHMARK/PerfData/ + RESULTS_DIR=$BENCH_HOME/$BENCHMARK/results_$RUN_ID/ else - PERFDATA_OUTPUTDIR=$2 + RESULTS_DIR=$2 fi if [ -z $3 ] +then + PERFDATA_OUTPUTDIR=$BENCH_HOME/$BENCHMARK/PerfData_$RUN_ID/ +else + PERFDATA_OUTPUTDIR=$3 +fi + +if [ -z $4 ] then SERVER=http://headnodehost:8188/ws/v1/timeline else - SERVER=$3 + SERVER=$4 fi echo "RESULTS_DIR is set to $RESULTS_DIR" @@ -52,10 +59,10 @@ mkdir $PERFDATA_OUTPUTDIR ./GetATSDAG.sh $PERFDATA_OUTPUTDIR -cp -R $BENCH_HOME/$BENCHMARK/tpch-scripts/PAT-master/PAT/results $PERFDATA_OUTPUTDIR/pat +cp -R $BENCH_HOME/$BENCHMARK/tpch-scripts/PAT-master/PAT/results/$RUN_ID $PERFDATA_OUTPUTDIR/pat echo "Completed Running PerfData Collection Scripts" -zip -r $BENCH_HOME/$BENCHMARK/PerfData.zip $PERFDATA_OUTPUTDIR +zip -r $BENCH_HOME/$BENCHMARK/PerfData_$RUN_ID.zip $PERFDATA_OUTPUTDIR -echo "zipped Perfdata to $BENCH_HOME/$BENCHMARK/PerfData.zip" +echo "zipped Perfdata to $BENCH_HOME/$BENCHMARK/PerfData_$RUN_ID.zip" diff --git a/tpch-scripts/GetPatData.sh b/tpch-scripts/GetPatData.sh index b83a548..4efe6b1 100644 --- a/tpch-scripts/GetPatData.sh +++ b/tpch-scripts/GetPatData.sh @@ -38,12 +38,12 @@ ALL_NODES: `cat /etc/hadoop/conf/slaves | tr '\r\n' ' '` WORKER_SCRIPT_DIR: /tmp/PAT WORKER_TMP_DIR: /tmp/PAT_TMP -CMD_PATH: `readlink -e $2` $3 $4 +CMD_PATH: `readlink -e $2` $3 $4 $5 SAMPLE_RATE: 1 INSTRUMENTS: cpustat memstat netstat iostat vmstat jvms perf 
EOM cd PAT-master/PAT/ -./pat run $5 +./pat run $6 diff --git a/tpch-scripts/RunQueriesAndCollectPATData.sh b/tpch-scripts/RunQueriesAndCollectPATData.sh index 2e6b5ea..8823dd8 100644 --- a/tpch-scripts/RunQueriesAndCollectPATData.sh +++ b/tpch-scripts/RunQueriesAndCollectPATData.sh @@ -1,26 +1,33 @@ #!/bin/bash -#Script Usage : ./RunQueriesAndCollectPATData.sh SCALE_FACTOR CLUSTER_SSH_PASSWORD +#Script Usage : ./RunQueriesAndCollectPATData.sh SCALE_FACTOR CLUSTER_SSH_PASSWORD [RUN_ID] -if [ $# -ne 2 ] +if [ $# -lt 2 ] then - echo "usage:./RunQueriesAndCollectPATData.sh SCALE_FACTOR CLUSTER_SSH_PASSWORD" + echo "usage:./RunQueriesAndCollectPATData.sh SCALE_FACTOR CLUSTER_SSH_PASSWORD [RUN_ID]" exit 1 fi +if [ -z "$3" ] +then + RUN_ID=1 +else + RUN_ID=$3 +fi + BENCH_HOME=$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd ); echo "\$BENCH_HOME is set to $BENCH_HOME"; BENCHMARK=hive-testbench -RESULT_DIR=$BENCH_HOME/$BENCHMARK/results/ +RESULT_DIR=$BENCH_HOME/$BENCHMARK/results_$RUN_ID/ mkdir $RESULT_DIR -chmod -R 777 $RESULT_DIR -LOG_DIR=$BENCH_HOME/$BENCHMARK/logs/ +LOG_DIR=$BENCH_HOME/$BENCHMARK/logs_$RUN_ID/ mkdir $LOG_DIR # Initialize log file for data loading times -LOG_FILE_EXEC_TIMES="${BENCH_HOME}/${BENCHMARK}/logs/query_times.csv" +LOG_FILE_EXEC_TIMES="${LOG_DIR}/query_times.csv" + if [ ! 
-e "$LOG_FILE_EXEC_TIMES" ] then touch "$LOG_FILE_EXEC_TIMES" @@ -36,8 +43,8 @@ fi for i in {1..22} do -./GetPatData.sh $2 ./TpchQueryExecute.sh $1 $i tpch_query_$i +./GetPatData.sh $2 ./TpchQueryExecute.sh $1 $i $RUN_ID $RUN_ID/tpch_query_$i done echo "collecting perf data" -./CollectPerfData.sh $RESULT_DIR \ No newline at end of file +./CollectPerfData.sh $RUN_ID $RESULT_DIR diff --git a/tpch-scripts/RunSuiteLoop.sh b/tpch-scripts/RunSuiteLoop.sh new file mode 100644 index 0000000..e7bd75f --- /dev/null +++ b/tpch-scripts/RunSuiteLoop.sh @@ -0,0 +1,22 @@ +#!/bin/bash +#usage: ./RunSuiteLoop.sh REPEAT_COUNT SCALE_FACTOR CLUSTER_SSH_PASSWORD + +if [ $# -ne 3 ] +then + echo "Usage ./RunSuiteLoop REPEAT_COUNT SCALE_FACTOR CLUSTER_SSH_PASSWORD" + exit 1 +fi + +counter=0 +while [ $counter -lt $1 ]; do +STARTDATE="`date +%Y/%m/%d:%H:%M:%S`" +STARTTIME="`date +%s`" +REPEAT_COUNT=$1 +let counter=counter+1 +echo "Running Iteration $counter" +RUN_ID=$counter +for i in {1..22} +do +./GetPatData.sh $3 ./TpchQueryExecute.sh $2 $i $RUN_ID $RUN_ID/tpch_query_$i +done +done diff --git a/tpch-scripts/TpchQueryExecute.sh b/tpch-scripts/TpchQueryExecute.sh index 6abd204..370124e 100644 --- a/tpch-scripts/TpchQueryExecute.sh +++ b/tpch-scripts/TpchQueryExecute.sh @@ -2,14 +2,21 @@ -#usage: TpchQueryExecute.sh SCALE_FACTOR QUERY_NUMBER +#usage: TpchQueryExecute.sh SCALE_FACTOR QUERY_NUMBER [RUN_ID] [JDBC_CONNECTION_STRING] # This script runs the hive queries on the data generated from the tpch suite and reports query execution times -if [ $# -ne 2 ] +if [ $# -lt 2 ] then - echo "Usage: ./TpchQueryExecute.sh SCALE_FACTOR QUERY_NUMBER" + echo "Usage: ./TpchQueryExecute.sh SCALE_FACTOR QUERY_NUMBER [RUN_ID] [JDBC_CONNECTION_STRING]" exit 1 else SCALE="$1" fi +if [ -z "$3" ] +then + RUN_ID=1 +else + RUN_ID=$3 +fi + # get home path BENCH_HOME=$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd ); echo "\$BENCH_HOME is set to $BENCH_HOME"; @@ -20,9 +27,9 @@ HIVE_SETTING=$BENCH_HOME/$BENCHMARK/sample-queries-tpch/testbench.settings # Set path to tpc-h queries 
QUERY_DIR=$BENCH_HOME/$BENCHMARK/sample-queries-tpch -RESULT_DIR=$BENCH_HOME/$BENCHMARK/results/ +RESULT_DIR=$BENCH_HOME/$BENCHMARK/results_$RUN_ID/ -PLAN_DIR=$BENCH_HOME/$BENCHMARK/plans/ +PLAN_DIR=$BENCH_HOME/$BENCHMARK/plans_$RUN_ID/ if [ ! -d "$RESULT_DIR" ]; then mkdir $RESULT_DIR @@ -32,7 +39,10 @@ if [ ! -d "$PLAN_DIR" ]; then mkdir $PLAN_DIR fi -LOG_FILE_EXEC_TIMES="${BENCH_HOME}/${BENCHMARK}/logs/query_times.csv" +LOG_DIR=$BENCH_HOME/$BENCHMARK/logs_$RUN_ID/ +mkdir -p $LOG_DIR + +LOG_FILE_EXEC_TIMES="${BENCH_HOME}/${BENCHMARK}/logs_$RUN_ID/query_times.csv" if [ ! -e "$LOG_FILE_EXEC_TIMES" ] then @@ -69,8 +79,14 @@ STARTTIME="`date +%s`" # seconds since epochstart echo "Hive query: ${2}" while [ $RETURN_VAL -ne 0 -a $EXECUTION_COUNT -lt $RETRY_COUNT ] do + if [ -z "$4" ] + then + CONNECTION_STRING="jdbc:hive2://localhost:10001/$DATABASE;transportMode=http" + else + CONNECTION_STRING=$4 + fi - timeout ${TIMEOUT} hive -i ${HIVE_SETTING} --database ${DATABASE} -d EXPLAIN="" -f ${QUERY_DIR}/tpch_query${2}.sql > ${RESULT_DIR}/${DATABASE}_query${j}.txt 2>&1 + beeline -u ${CONNECTION_STRING} -i ${HIVE_SETTING} --hivevar EXPLAIN="" -f ${QUERY_DIR}/tpch_query${2}.sql > ${RESULT_DIR}/${DATABASE}_query${j}.txt 2>&1 RETURN_VAL=$? 
((EXECUTION_COUNT++)) @@ -89,5 +105,5 @@ STARTTIME="`date +%s`" # seconds since epochstart DURATION="$(($DIFF_IN_SECONDS / 3600 ))h $((($DIFF_IN_SECONDS % 3600) / 60))m $(($DIFF_IN_SECONDS % 60))s" # log the times in load_time.csv file echo "Query${j},${DIFF_IN_SECONDS},${STARTTIME},${STOPTIME},${BENCHMARK},${DATABASE},${SCALE},${FILE_FORMAT},${STATUS}" >> ${LOG_FILE_EXEC_TIMES} - hive -i ${HIVE_SETTING} --database ${DATABASE} -d EXPLAIN="explain" -f ${QUERY_DIR}/tpch_query${2}.sql > ${PLAN_DIR}/plan_${DATABASE}_query${j}.txt 2>&1 + beeline -u ${CONNECTION_STRING} -i ${HIVE_SETTING} --hivevar EXPLAIN="explain" -f ${QUERY_DIR}/tpch_query${2}.sql > ${PLAN_DIR}/plan_${DATABASE}_query${j}.txt 2>&1 done diff --git a/tpch-scripts/ValidateDataGen.sh b/tpch-scripts/ValidateDataGen.sh index 3f6f06a..9ed6a9f 100644 --- a/tpch-scripts/ValidateDataGen.sh +++ b/tpch-scripts/ValidateDataGen.sh @@ -13,6 +13,8 @@ fi >${STATS_DIR}/tableinfo_${DATABASE}.txt; -hive -d DB=${DATABASE} -f gettpchtablecounts.sql > ${STATS_DIR}/tablecounts_${DATABASE}.txt ; -hive -d DB=${DATABASE} -f gettpchtableinfo.sql >> ${STATS_DIR}/tableinfo_${DATABASE}.txt ; +CONNECTION_STRING="jdbc:hive2://localhost:10001/${DATABASE};transportMode=http" + +beeline -u ${CONNECTION_STRING} --hivevar DB=${DATABASE} -f $BENCH_HOME/$BENCHMARK/tpch-scripts/gettpchtablecounts.sql > ${STATS_DIR}/tablecounts_${DATABASE}.txt ; +beeline -u ${CONNECTION_STRING} --hivevar DB=${DATABASE} -f $BENCH_HOME/$BENCHMARK/tpch-scripts/gettpchtableinfo.sql >> ${STATS_DIR}/tableinfo_${DATABASE}.txt ; diff --git a/tpch-setup.sh b/tpch-setup.sh index e5e5909..4c9a33b 100644 --- a/tpch-setup.sh +++ b/tpch-setup.sh @@ -1,4 +1,4 @@ -#/bin/bash +#!/bin/bash function usage { echo "Usage: tpch-setup.sh scale_factor [temp_directory]" @@ -53,7 +53,7 @@ hdfs dfs -mkdir -p ${DIR} hdfs dfs -ls ${DIR}/${SCALE}/lineitem > /dev/null if [ $? -ne 0 ]; then echo "Generating data at scale factor $SCALE." 
- (cd tpch-gen; hadoop jar target/*.jar -d ${DIR}/${SCALE}/ -s ${SCALE}) + (cd tpch-gen; hadoop jar target/*.jar -D mapreduce.map.memory.mb=8192 -d ${DIR}/${SCALE}/ -s ${SCALE}) fi hdfs dfs -ls ${DIR}/${SCALE}/lineitem > /dev/null if [ $? -ne 0 ]; then @@ -65,7 +65,10 @@ echo "TPC-H text data generation complete." DATAGENTIME="`date +%s`" # Create the text/flat tables as external tables. These will be later be converted to ORCFile. echo "Loading text data into external tables." -runcommand "hive -i settings/load-flat.sql -f ddl-tpch/bin_flat/alltables.sql -d DB=tpch_text_${SCALE} -d LOCATION=${DIR}/${SCALE}" + +DATABASE=tpch_text_${SCALE} +CONNECTION_STRING="jdbc:hive2://localhost:10001/$DATABASE;transportMode=http" +runcommand "beeline -u ${CONNECTION_STRING} -i settings/load-flat.sql -f ddl-tpch/bin_flat/alltables.sql --hivevar DB=tpch_text_${SCALE} --hivevar LOCATION=${DIR}/${SCALE}" EXTERNALTABLELOAD="`date +%s`" # Create the optimized tables. @@ -79,15 +82,16 @@ else fi DATABASE=tpch_${SCHEMA_TYPE}_orc_${SCALE} +CONNECTION_STRING="jdbc:hive2://localhost:10001/$DATABASE;transportMode=http" for t in ${TABLES} do echo "Optimizing table $t ($i/$total)." - COMMAND="hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql \ - -d DB=${DATABASE} \ - -d SOURCE=tpch_text_${SCALE} -d BUCKETS=${BUCKETS} \ - -d SCALE=${SCALE} \ - -d FILE=orc" + COMMAND="beeline -u ${CONNECTION_STRING} -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/${t}.sql \ + --hivevar DB=${DATABASE} \ + --hivevar SOURCE=tpch_text_${SCALE} --hivevar BUCKETS=${BUCKETS} \ + --hivevar SCALE=${SCALE} \ + --hivevar FILE=orc" runcommand "$COMMAND" if [ $? 
-ne 0 ]; then echo "Command failed, try 'export DEBUG_SCRIPT=ON' and re-running" @@ -98,7 +102,7 @@ done ORCLOAD="`date +%s`" -ANALYZE_COMMAND="hive -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql --database ${DATABASE}" +ANALYZE_COMMAND="beeline -u ${CONNECTION_STRING} -i settings/load-${SCHEMA_TYPE}.sql -f ddl-tpch/bin_${SCHEMA_TYPE}/analyze.sql" if $RUN_ANALYZE; then echo "Running analyze"