-
Notifications
You must be signed in to change notification settings - Fork 1
/
run-full-profiles.sh
executable file
·84 lines (62 loc) · 1.93 KB
/
run-full-profiles.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env bash
#
# run-full-profiles: runs the profile pipeline for one dataset version.
#
# Usage:
#   run-full-profiles.sh VERSION        (e.g. VERSION = v2018-08)
#
# Must be started from the directory that contains base-dirs.sh and the
# scripts/ directory, because both are referenced with relative paths.
#
# Steps (each step's stdout and stderr go to its own file under logs/):
#   1. scripts/analysis/profile-to-parquet.sh  CSV PARQUET
#   2. scripts/analysis/profile-all.sh         PARQUET keep_dirs
#   3. scripts/split-profiles.sh               WEB_DATA_DIR
#   4. scripts/analysis/profile-clustering.sh  WEB_DATA_DIR
#
# A failing step does not stop the following steps, but it is reported on
# stderr and makes the script exit with status 1 at the end.

# Bash increments SECONDS once per second; resetting it here makes the
# elapsed-time report at the end cover exactly this run.
SECONDS=0

# Print a message on stderr and abort the run with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

VERSION=${1:-}
# VERSION=v2019-03
if [[ "$#" -ne 1 || -z "$VERSION" ]]; then
  die "You should add an a version (such as 'v2018-08')!"
fi
echo "# version: ${VERSION}"

# Assign first and export afterwards, so a readlink failure is not masked
# by the exit status of 'export'.
BASE_DIR=$(readlink -e .) || die "Cannot resolve the current directory"
export BASE_DIR
echo "# base dir: $BASE_DIR"

# NOTE(review): base-dirs.sh is expected to provide BASE_SOURCE_DIR,
# BASE_OUTPUT_DIR and BASE_WEB_DATA_DIR (they are read below and are not set
# anywhere in this script) -- confirm against that file.
source base-dirs.sh

SOURCE_DIR=$BASE_SOURCE_DIR/${VERSION}/full
echo "# source dir: ${SOURCE_DIR}"

OUTPUT_DIR=$BASE_OUTPUT_DIR/${VERSION}
echo "# output dir: ${OUTPUT_DIR}"

WEB_DATA_DIR=$BASE_WEB_DATA_DIR/${VERSION}
echo "# web data dir: ${WEB_DATA_DIR}"

# limbo/ holds the CSV that is read and the parquet file that is produced.
# Abort if it cannot be resolved: with an empty LIMBO the parquet path would
# collapse to "/<version>-profiles.parquet" and be handed to 'rm -rf' below.
mkdir -p limbo
LIMBO=$(readlink -e limbo) || die "Cannot resolve the 'limbo' directory"

CSV=$LIMBO/${VERSION}-completeness.csv
echo "# csv: ${CSV}"

PARQUET=$LIMBO/${VERSION}-profiles.parquet
echo "# parquet: ${PARQUET}"

mkdir -p output

# The log directory is resolved to an absolute path because the script
# changes into scripts/ before running the last two steps.
mkdir -p logs
LOG_DIR=$(readlink -e logs) || die "Cannot resolve the 'logs' directory"
echo "# log dir: ${LOG_DIR}"

# Remove the output of a previous run before the parquet file is recreated.
if [[ -e "$PARQUET" ]]; then
  rm -rf -- "$PARQUET"
fi

# Number of pipeline steps that returned a non-zero status.
failed_steps=0

#######################################
# Announce and run one pipeline step, sending its stdout and stderr to a
# log file.
# Globals:   LOG_DIR (read), failed_steps (incremented when the step fails)
# Arguments: $1  - log file name, created inside LOG_DIR
#            $2  - description used in the progress message
#            $3+ - command to run, followed by its arguments
# Outputs:   progress line on stdout; failure notice on stderr
#######################################
run_step() {
  local log_file=${LOG_DIR}/$1
  local description=$2
  shift 2

  local now
  now=$(date +"%F %T")
  echo "$now> ${description}. Check log file: ${log_file}"

  if ! "$@" &> "$log_file"; then
    printf '%s\n' "step failed: $* (see ${log_file})" >&2
    failed_steps=$((failed_steps + 1))
  fi
}

run_step profile-to-parquet.log "create parquet file" \
  scripts/analysis/profile-to-parquet.sh "$CSV" "$PARQUET"

run_step profiles-analysis.log "run profile analysis" \
  scripts/analysis/profile-all.sh "$PARQUET" keep_dirs

# The remaining steps are started from inside scripts/. Without this check a
# failed cd would run them from the wrong directory, and the 'cd ..' below
# would then leave the base directory.
cd scripts/ || die "Cannot change into the scripts/ directory"

run_step profiles-split.log "split results to ${WEB_DATA_DIR}" \
  ./split-profiles.sh "$WEB_DATA_DIR"

run_step profiles-clustering.log "clustering profiles in ${WEB_DATA_DIR}" \
  analysis/profile-clustering.sh "$WEB_DATA_DIR"

cd .. || die "Cannot change back to the parent of scripts/"

# Break the elapsed seconds down into hours, minutes and seconds.
duration=$SECONDS
hours=$((duration / (60 * 60)))
mins=$((duration % (60 * 60) / 60))
secs=$((duration % 60))

now=$(date +"%F %T")
echo "$now> run-full-profiles DONE"
printf "%02d:%02d:%02d elapsed.\n" "$hours" "$mins" "$secs"

# Surface step failures to the caller through the exit status.
if ((failed_steps > 0)); then
  printf '%s\n' "${failed_steps} step(s) failed; check the logs in ${LOG_DIR}" >&2
  exit 1
fi