-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathensemble.py
110 lines (86 loc) · 3.19 KB
/
ensemble.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import sys
import time
import json
from datetime import datetime
import numpy as np
import pandas as pd
import hashlib
import gzip
#from models import models
from utils import read_df, read_numpy
import logging
# Attach the default handler to the root logger so log records are printed
# (basicConfig does nothing if the root logger already has handlers).
logging.basicConfig()
# Module-level logger; the DEBUG threshold lets the logger.info() progress
# messages emitted by this module through.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
def save_results(data):
    """Append one submission record to the ``submissions.csv`` log.

    The log lives in the current working directory. It is created on the
    first call and rewritten with the extra row on every later call.

    Args:
        data: Mapping of column name to value describing one submission.
    """
    logger_file = 'submissions.csv'
    df = pd.DataFrame([data])
    if os.path.exists(logger_file):
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to add the new row to the existing log.
        df = pd.concat([pd.read_csv(logger_file), df], ignore_index=True)
    df.to_csv(logger_file, index=False)
def generate_file_sha256(filepath, blocksize=2**20):
    """Return the hexadecimal SHA-256 digest of the file at ``filepath``.

    The file is opened in binary mode and consumed in pieces of
    ``blocksize`` bytes (1 MiB by default), so it is never held in memory
    as a whole.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        # The two-argument iter() keeps calling read() until it returns the
        # empty bytes object that marks end of file.
        for chunk in iter(lambda: stream.read(blocksize), b""):
            digest.update(chunk)
    return digest.hexdigest()
def ensemble(model_names, weights=None, save_result=True):
    """Blend per-model prediction files into one weighted submission.

    Each model's predictions are loaded with ``read_numpy`` from
    ``./predictions/<model_name>.csv`` and combined as a weighted average.
    The blend is rescaled so each row sums to 1, rounded to 4 decimals and
    written to ``predictions/ensemble/``. A gzip copy, stamped with the
    current unix time, is written to ``submissions/ensemble/``.

    Args:
        model_names: Sequence of prediction file names (without ``.csv``).
        weights: One relative weight per model, in the same order as
            ``model_names``. Defaults to equal weights.
        save_result: When true, the returned record is also appended to the
            submissions log through ``save_results``.

    Returns:
        dict with the keys ``sha256`` (digest of the uncompressed CSV),
        ``datetime``, ``unix_time``, ``original_filepath``,
        ``original_filename``, ``saved_filepath`` and ``result`` (always
        ``0.0`` here).

    Raises:
        ValueError: If ``model_names`` is empty, or ``weights`` is given
            with a different length than ``model_names``.
    """
    import shutil  # local import: only needed for the streaming gzip copy

    if len(model_names) == 0:
        raise ValueError('ensemble() needs at least one model name')
    if weights is None:
        weights = [1] * len(model_names)
    elif len(weights) != len(model_names):
        # Fail before any file is read instead of crashing mid-loop or
        # silently blending with a misaligned weight list.
        raise ValueError(
            f'got {len(weights)} weights for {len(model_names)} models')
    total = sum(weights)

    # NOTE(review): assumes read_numpy returns equally shaped 2-D numeric
    # arrays (rows x classes) - confirm against utils.read_numpy.
    submissions = [
        read_numpy(os.path.join('./predictions', f'{model_name}.csv'))
        for model_name in model_names
    ]

    result = np.zeros(submissions[0].shape)
    for submission, weight in zip(submissions, weights):
        result += submission * (weight / total)
    # Rescale every row to sum to 1 before rounding.
    predictions = (result / result.sum(axis=1)[:, None]).round(4)

    filename = (
        "_".join(model_names)
        + "_weighted_"
        + "_".join(str(w) for w in weights)
    )

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('predictions/ensemble', exist_ok=True)
    sub_filepath = f'predictions/ensemble/{filename}.csv'
    pd.DataFrame(predictions).to_csv(sub_filepath, index=False, header=False)

    os.makedirs('submissions/ensemble', exist_ok=True)
    date = datetime.now()
    unix_time = int(time.mktime(date.timetuple()))
    gz_filepath = 'submissions/ensemble/' + filename + '-' + str(unix_time) + '.csv.gz'

    logger.info(f'Calculating hash of {sub_filepath}...')
    filehash = generate_file_sha256(sub_filepath)

    logger.info(f'Compressing {sub_filepath} to {gz_filepath}...')
    with open(sub_filepath, 'rb') as f_original, \
            gzip.open(gz_filepath, 'wb') as f_gz:
        # Stream the copy so the CSV is never fully loaded into memory.
        shutil.copyfileobj(f_original, f_gz)

    record = {
        'sha256': filehash,
        'datetime': date,
        'unix_time': unix_time,
        'original_filepath': sub_filepath,
        'original_filename': filename,
        'saved_filepath': gz_filepath,
        'result': 0.0,
    }
    if save_result:
        save_results(record)
    return record
if __name__ == "__main__":
    # The models to blend and their relative weights are hard-coded here
    # rather than taken from the command line.
    model_weights = {
        'xgboost_features2_v4_5_normalize': 7,
        'xgboost_features2_v4_3_eliminate_features': 3,
        'xgboost_features2_v4_4_select_3': 5,
        'xgboost_features2_v4_6_more_target_feature': 1,
        'xgboost_features2_v4_2_normalize_features': 5,
    }
    # Split the mapping into two parallel lists, preserving insertion order.
    model_names, weights = map(list, zip(*model_weights.items()))
    print(model_names)
    print(weights)
    ensemble(model_names, weights=weights)