-
Notifications
You must be signed in to change notification settings - Fork 1
/
ingest-ml.sh
204 lines (182 loc) · 8.44 KB
/
ingest-ml.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/bin/bash
# Build the distil-ingest command-line tools for linux/amd64 and move the
# resulting binaries into ./server.
#
# Reads from the environment (after sourcing ./server/config.sh):
#   BRANCH - version/branch suffix appended to each package path after '@'
#   GOPATH - Go workspace; the binaries are picked up from its bin/ directory
source ./server/config.sh

# Commands under github.com/uncharted-distil/distil-ingest/cmd to build.
ingest_tools=(
  distil-merge
  distil-classify
  distil-rank
  distil-ingest
  distil-summary
  distil-cluster
  distil-geocode
  distil-format
  distil-clean
)

# When GOPATH is not exported, the go tool still installs under its default
# GOPATH, so ask it where that is rather than expanding to "/bin".
go_bin_dir="${GOPATH:-$(go env GOPATH)}/bin"

# Build every tool first, then move them, in the same order as before.
for tool in "${ingest_tools[@]}"; do
  env GOOS=linux GOARCH=amd64 go get -a -v \
    "github.com/uncharted-distil/distil-ingest/cmd/${tool}@${BRANCH}"
done
for tool in "${ingest_tools[@]}"; do
  mv "${go_bin_dir}/${tool}" ./server
done
# Rebuild the working copy of the input datasets and reset the output folder.
#
# Reads: HOST_DATA_DIR, HOST_DATA_DIR_EVAL, HOST_DATA_DIR_DA (source roots),
#        DATASETS_SEED, DATASETS_EVAL, DATASETS_DA (dataset names per root),
#        HOST_DATA_DIR_COPY (destination), OUTPUT_DATA_DIR (output root).
# Both HOST_DATA_DIR_COPY and OUTPUT_DATA_DIR are deleted and recreated.

# copy_datasets SRC_DIR [NAME...]
# Recursively copies SRC_DIR/NAME into HOST_DATA_DIR_COPY for every NAME,
# announcing each copy on stdout.
copy_datasets() {
  local src_dir=$1
  shift
  local name
  for name in "$@"; do
    echo "cp $src_dir/$name into $HOST_DATA_DIR_COPY/$name"
    cp -r -- "$src_dir/$name" "$HOST_DATA_DIR_COPY"
  done
}

# ':?' aborts the script instead of running rm/mkdir on an empty path, and the
# quoting keeps paths containing whitespace from being split into several
# operands of a recursive delete.
rm -rf -- "${HOST_DATA_DIR_COPY:?HOST_DATA_DIR_COPY must be set}"
mkdir -p -- "$HOST_DATA_DIR_COPY"

copy_datasets "$HOST_DATA_DIR" "${DATASETS_SEED[@]}"
copy_datasets "$HOST_DATA_DIR_EVAL" "${DATASETS_EVAL[@]}"
copy_datasets "$HOST_DATA_DIR_DA" "${DATASETS_DA[@]}"

rm -rf -- "${OUTPUT_DATA_DIR:?OUTPUT_DATA_DIR must be set}"
mkdir -p -- "$OUTPUT_DATA_DIR"
# Disabled: launching the distil-auto-ml container from this script.
# NOTE(review): with this commented out, the runner is presumably started
# separately before the script runs - confirm.
#docker run \
# --name distil-auto-ml \
# --rm \
# -d \
# -p 45042:45042 \
# --env D3MOUTPUTDIR=$OUTPUT_DATA_DIR \
# --env D3MINPUTDIR=$HOST_DATA_DIR_COPY \
# --env D3MSTATICDIR=$D3MSTATICDIR \
# --env PROGRESS_INTERVAL=60 \
# -v $HOST_DATA_DIR_COPY:$HOST_DATA_DIR_COPY \
# -v $OUTPUT_DATA_DIR:$OUTPUT_DATA_DIR \
# -v $D3MSTATICDIR:$D3MSTATICDIR \
# registry.datadrivendiscovery.org/uncharted/distil-integration/distil-auto-ml:latest

# Pause before the ingest stages start calling the runner. The wait defaults
# to 200 seconds; set PIPELINE_RUNNER_WAIT_SECONDS to change it (for example 0
# when the runner is already up).
echo "Waiting for the pipeline runner to be available..."
sleep "${PIPELINE_RUNNER_WAIT_SECONDS:-200}"
# Settings shared by the ingest stages that follow.
SCHEMA=/datasetDoc.json
HAS_HEADER=1
PRIMITIVE_ENDPOINT=localhost:45042

# Clustering stage: names of the output folder and files. Only
# CLUSTER_OUTPUT_FOLDER is referenced by the loop below.
CLUSTER_OUTPUT_FOLDER=clusters
CLUSTER_OUTPUT_DATA=clusters/tables/learningData.csv
CLUSTER_OUTPUT_SCHEMA=clusters/datasetDoc.json

# Run distil-cluster once per dataset, writing under the dataset's
# TRAIN/dataset_TRAIN folder in OUTPUT_DATA_DIR.
for DATASET in "${DATASETS[@]}"; do
  printf '%s\n' \
    "--------------------------------------------------------------------------------" \
    " Clustering $DATASET dataset" \
    "--------------------------------------------------------------------------------"

  train_rel="${DATASET}/TRAIN/dataset_TRAIN"
  cluster_args=(
    --endpoint="$PRIMITIVE_ENDPOINT"
    --dataset="${DATASET}"
    --input="$HOST_DATA_DIR_COPY"
    --schema="$HOST_DATA_DIR_COPY/$train_rel/$SCHEMA"
    --output="$OUTPUT_DATA_DIR/$train_rel/$CLUSTER_OUTPUT_FOLDER"
  )
  ./server/distil-cluster "${cluster_args[@]}"
done
# Merge stage: names of the output folder and files. Only
# MERGED_DATASET_FOLDER is referenced by the loop below.
MERGED_DATASET_FOLDER=merged
MERGED_OUTPUT_PATH=merged/tables/mergedNoHeader.csv
MERGED_OUTPUT_PATH_RELATIVE=tables/learningData.csv
MERGED_OUTPUT_HEADER_PATH=merged/tables/learningData.csv
MERGED_OUTPUT_SCHEMA=merged/datasetDoc.json

# Run distil-merge once per dataset, writing under the dataset's
# TRAIN/dataset_TRAIN folder in OUTPUT_DATA_DIR.
for DATASET in "${DATASETS[@]}"; do
  printf '%s\n' \
    "--------------------------------------------------------------------------------" \
    " Merging $DATASET dataset" \
    "--------------------------------------------------------------------------------"

  train_rel="${DATASET}/TRAIN/dataset_TRAIN"
  merge_args=(
    --endpoint="$PRIMITIVE_ENDPOINT"
    --dataset="${DATASET}"
    --input="$HOST_DATA_DIR_COPY"
    --schema="$HOST_DATA_DIR_COPY/$train_rel/$SCHEMA"
    --output="$OUTPUT_DATA_DIR/$train_rel/$MERGED_DATASET_FOLDER"
  )
  ./server/distil-merge "${merge_args[@]}"
done
# Format stage: names of the output folder and files. Only
# FORMAT_OUTPUT_FOLDER is referenced by the loop below.
FORMAT_OUTPUT_FOLDER=format
FORMAT_OUTPUT_DATA=format/tables/learningData.csv
FORMAT_OUTPUT_SCHEMA=format/datasetDoc.json

# Run distil-format once per dataset, writing under the dataset's
# TRAIN/dataset_TRAIN folder in OUTPUT_DATA_DIR.
for DATASET in "${DATASETS[@]}"; do
  printf '%s\n' \
    "--------------------------------------------------------------------------------" \
    " FORMATTING $DATASET dataset" \
    "--------------------------------------------------------------------------------"

  train_rel="${DATASET}/TRAIN/dataset_TRAIN"
  format_args=(
    --endpoint="$PRIMITIVE_ENDPOINT"
    --dataset="${DATASET}"
    --input="$HOST_DATA_DIR_COPY"
    --schema="$HOST_DATA_DIR_COPY/$train_rel/$SCHEMA"
    --output="$OUTPUT_DATA_DIR/$train_rel/$FORMAT_OUTPUT_FOLDER"
  )
  ./server/distil-format "${format_args[@]}"
done
# Cleaning stage: names of the output schema and folder. Only
# CLEANING_DATASET_FOLDER is referenced by the loop below.
CLEANING_OUTPUT_SCHEMA=clean/datasetDoc.json
CLEANING_DATASET_FOLDER=clean

# Run distil-clean once per dataset, writing under the dataset's
# TRAIN/dataset_TRAIN folder in OUTPUT_DATA_DIR.
for DATASET in "${DATASETS[@]}"; do
  printf '%s\n' \
    "--------------------------------------------------------------------------------" \
    " Cleaning $DATASET dataset" \
    "--------------------------------------------------------------------------------"

  train_rel="${DATASET}/TRAIN/dataset_TRAIN"
  clean_args=(
    --endpoint="$PRIMITIVE_ENDPOINT"
    --dataset="${DATASET}"
    --input="$HOST_DATA_DIR_COPY"
    --schema="$HOST_DATA_DIR_COPY/$train_rel/$SCHEMA"
    --output="$OUTPUT_DATA_DIR/$train_rel/$CLEANING_DATASET_FOLDER"
  )
  ./server/distil-clean "${clean_args[@]}"
done
# Classification stage: file name of the per-dataset classification output.
CLASSIFICATION_OUTPUT_PATH=classification.json

# Run distil-classify once per dataset; the result is written as a file inside
# the dataset's TRAIN/dataset_TRAIN folder in OUTPUT_DATA_DIR.
for DATASET in "${DATASETS[@]}"; do
  printf '%s\n' \
    "--------------------------------------------------------------------------------" \
    " Classifying $DATASET dataset" \
    "--------------------------------------------------------------------------------"

  train_rel="${DATASET}/TRAIN/dataset_TRAIN"
  classify_args=(
    --endpoint="$PRIMITIVE_ENDPOINT"
    --input="$HOST_DATA_DIR_COPY"
    --schema="$HOST_DATA_DIR_COPY/$train_rel/$SCHEMA"
    --dataset="${DATASET}"
    --output="$OUTPUT_DATA_DIR/$train_rel/$CLASSIFICATION_OUTPUT_PATH"
  )
  ./server/distil-classify "${classify_args[@]}"
done
# Ranking stage: file name of the per-dataset importance output.
IMPORTANCE_OUTPUT=importance.json

# Run distil-rank once per dataset; the result is written as a file inside the
# dataset's TRAIN/dataset_TRAIN folder in OUTPUT_DATA_DIR.
for DATASET in "${DATASETS[@]}"; do
  printf '%s\n' \
    "--------------------------------------------------------------------------------" \
    " Ranking $DATASET dataset" \
    "--------------------------------------------------------------------------------"

  train_rel="${DATASET}/TRAIN/dataset_TRAIN"
  rank_args=(
    --endpoint="$PRIMITIVE_ENDPOINT"
    --input="$HOST_DATA_DIR_COPY"
    --schema="$HOST_DATA_DIR_COPY/$train_rel/$SCHEMA"
    --dataset="${DATASET}"
    --output="$OUTPUT_DATA_DIR/$train_rel/$IMPORTANCE_OUTPUT"
  )
  ./server/distil-rank "${rank_args[@]}"
done
# Summary stage: file name of the per-dataset machine summary output.
SUMMARY_MACHINE_OUTPUT=summary-machine.json

# Run distil-summary once per dataset. The Geolife dataset is skipped because
# Duke fails on a dataset that large.
for DATASET in "${DATASETS[@]}"; do
  printf '%s\n' \
    "--------------------------------------------------------------------------------" \
    " Summarizing $DATASET dataset" \
    "--------------------------------------------------------------------------------"

  case "$DATASET" in
    LL1_336_MS_Geolife_transport_mode_prediction_separate_lat_lon)
      echo "SKIPPING SUMMARY"
      ;;
    *)
      train_rel="${DATASET}/TRAIN/dataset_TRAIN"
      summary_args=(
        --endpoint="$PRIMITIVE_ENDPOINT"
        --input="$HOST_DATA_DIR_COPY"
        --schema="$HOST_DATA_DIR_COPY/$train_rel/$SCHEMA"
        --dataset="${DATASET}"
        --output="$OUTPUT_DATA_DIR/$train_rel/$SUMMARY_MACHINE_OUTPUT"
      )
      ./server/distil-summary "${summary_args[@]}"
      ;;
  esac
done
# Geocoding stage: names of the output folder and files. None of these is used
# by the active code below; GEO_OUTPUT_FOLDER appears only in the disabled
# distil-geocode call.
GEO_OUTPUT_FOLDER=geocoded
GEO_OUTPUT_DATA=geocoded/tables/learningData.csv
GEO_OUTPUT_SCHEMA=geocoded/datasetDoc.json

# The distil-geocode call is disabled; each iteration only prints the banner
# and copies the dataset's TRAIN/dataset_TRAIN folder into OUTPUT_DATA_DIR.
for DATASET in "${DATASETS[@]}"; do
  printf '%s\n' \
    "--------------------------------------------------------------------------------" \
    " Geocoding $DATASET dataset" \
    "--------------------------------------------------------------------------------"

  # Disabled geocoding invocation, kept for reference:
  # ./server/distil-geocode \
  # --endpoint="$PRIMITIVE_ENDPOINT" \
  # --input="$HOST_DATA_DIR_COPY" \
  # --dataset="${DATASET}" \
  # --schema="$HOST_DATA_DIR_COPY/${DATASET}/TRAIN/dataset_TRAIN/$SCHEMA" \
  # --output="$OUTPUT_DATA_DIR/${DATASET}/TRAIN/dataset_TRAIN/$GEO_OUTPUT_FOLDER"

  train_src="$HOST_DATA_DIR_COPY/${DATASET}/TRAIN/dataset_TRAIN"
  dataset_out="$OUTPUT_DATA_DIR/${DATASET}"

  # Copy the data to the path ingest expects, and once more directly under the
  # dataset folder so that the dataset folder gets set properly on ingest.
  mkdir -p "$dataset_out/TRAIN"
  for copy_dest in "$dataset_out/TRAIN/" "$dataset_out/"; do
    cp -r "$train_src" "$copy_dest"
  done
done