-
Notifications
You must be signed in to change notification settings - Fork 93
/
firstNCharCVTE.py
133 lines (97 loc) · 4.51 KB
/
firstNCharCVTE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""Target-encode high cardinality categorical text by their first few characters in the string """
"""The str columns must be first marked as text in Data Sets page before recipe can take effect """
from h2oaicore.transformer_utils import CustomTransformer
import datatable as dt
import numpy as np
from h2oaicore.transformers import CVTargetEncodeTransformer
from sklearn.preprocessing import LabelEncoder
class firstNChars:
    """Stateless helper that truncates a single text column to a fixed-length prefix.

    Nothing is learned from the data, so ``fit_transform`` and ``transform``
    are equivalent; both exist so the helper can be used like a fitted step.
    """

    def fit_transform(self, X: dt.Frame, n):
        """Return the first ``n`` characters of every value in ``X``.

        Delegates to :meth:`transform`.

        :param X: frame with exactly one column
        :param n: number of leading characters to keep
        :return: one-column ``dt.Frame`` of prefixes
        """
        return self.transform(X, n)

    def transform(self, X: dt.Frame, n):
        """Return the first ``n`` characters of every value in ``X``.

        String cells are sliced to ``s[:n]``; cells that are not ``str``
        (for example missing values) are passed through unchanged.

        :param X: frame with exactly one column
        :param n: number of leading characters to keep
        :return: one-column ``dt.Frame`` with the same column name as ``X``
        :raises AssertionError: if ``X`` does not have exactly one column
        """
        assert X.ncols == 1
        pdf = X.to_pandas()
        col = pdf.columns[0]
        # Slice each cell's string. A row-wise ``apply(lambda x: x[0:n], axis=1)``
        # hands the lambda a one-element row Series, so ``x[0:n]`` slices the row
        # (returning the untouched cell) rather than the text inside it.
        pdf[col] = pdf[col].map(lambda s: s[:n] if isinstance(s, str) else s)
        return dt.Frame(pdf)
class frst1ChrsCVTE(CustomTransformer):
    """Target-encode one text column keyed on the prefix from ``firstNChars`` with n=1.

    The column is first passed through ``firstNChars`` and the result is then
    handed to ``CVTargetEncodeTransformer`` as a categorical column.
    """

    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail
    _unsupervised = False  # uses target
    _uses_target = True  # uses target

    @staticmethod
    def get_default_properties():
        """Declare the accepted input: exactly one column of type ``text``."""
        return {
            "col_type": "text",
            "min_cols": 1,
            "max_cols": 1,
            "relative_importance": 1,
        }

    def fit_transform(self, X: dt.Frame, y: np.array = None):
        """Fit the prefix binner and target encoder, returning the encoded frame.

        :param X: single-column text frame
        :param y: target values; label-encoded against ``self.labels`` when
            ``self.labels`` is not None
        :return: whatever ``CVTargetEncodeTransformer.fit_transform`` returns
        """
        self.binner = firstNChars()
        prefixes = self.binner.fit_transform(X, 1)

        # Compute mean target (out of fold) per same string
        self.cvte = CVTargetEncodeTransformer(cat_cols=prefixes.names)

        target = y
        if self.labels is not None:
            # for classification, always turn y into numeric form, even if already integer
            encoder = LabelEncoder().fit(self.labels)
            target = dt.Frame(encoder.transform(y))

        return self.cvte.fit_transform(prefixes, target)

    def transform(self, X: dt.Frame):
        """Apply the fitted binner and target encoder to new data.

        Requires ``fit_transform`` to have been called first, since it sets
        ``self.binner`` and ``self.cvte``.
        """
        prefixes = self.binner.transform(X, 1)
        return self.cvte.transform(prefixes)
class frst2ChrsCVTE(CustomTransformer):
    """Target-encode one text column keyed on the prefix from ``firstNChars`` with n=2.

    The column is first passed through ``firstNChars`` and the result is then
    handed to ``CVTargetEncodeTransformer`` as a categorical column.
    """

    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail
    _unsupervised = False  # uses target
    _uses_target = True  # uses target

    @staticmethod
    def get_default_properties():
        """Declare the accepted input: exactly one column of type ``text``."""
        return {
            "col_type": "text",
            "min_cols": 1,
            "max_cols": 1,
            "relative_importance": 1,
        }

    def fit_transform(self, X: dt.Frame, y: np.array = None):
        """Fit the prefix binner and target encoder, returning the encoded frame.

        :param X: single-column text frame
        :param y: target values; label-encoded against ``self.labels`` when
            ``self.labels`` is not None
        :return: whatever ``CVTargetEncodeTransformer.fit_transform`` returns
        """
        self.binner = firstNChars()
        prefixes = self.binner.fit_transform(X, 2)

        # Compute mean target (out of fold) per same string
        self.cvte = CVTargetEncodeTransformer(cat_cols=prefixes.names)

        target = y
        if self.labels is not None:
            # for classification, always turn y into numeric form, even if already integer
            encoder = LabelEncoder().fit(self.labels)
            target = dt.Frame(encoder.transform(y))

        return self.cvte.fit_transform(prefixes, target)

    def transform(self, X: dt.Frame):
        """Apply the fitted binner and target encoder to new data.

        Requires ``fit_transform`` to have been called first, since it sets
        ``self.binner`` and ``self.cvte``.
        """
        prefixes = self.binner.transform(X, 2)
        return self.cvte.transform(prefixes)
class frst3ChrsCVTE(CustomTransformer):
    """Target-encode one text column keyed on the prefix from ``firstNChars`` with n=3.

    The column is first passed through ``firstNChars`` and the result is then
    handed to ``CVTargetEncodeTransformer`` as a categorical column.
    """

    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail
    _unsupervised = False  # uses target
    _uses_target = True  # uses target

    @staticmethod
    def get_default_properties():
        """Declare the accepted input: exactly one column of type ``text``."""
        return {
            "col_type": "text",
            "min_cols": 1,
            "max_cols": 1,
            "relative_importance": 1,
        }

    def fit_transform(self, X: dt.Frame, y: np.array = None):
        """Fit the prefix binner and target encoder, returning the encoded frame.

        :param X: single-column text frame
        :param y: target values; label-encoded against ``self.labels`` when
            ``self.labels`` is not None
        :return: whatever ``CVTargetEncodeTransformer.fit_transform`` returns
        """
        self.binner = firstNChars()
        prefixes = self.binner.fit_transform(X, 3)

        # Compute mean target (out of fold) per same string
        self.cvte = CVTargetEncodeTransformer(cat_cols=prefixes.names)

        target = y
        if self.labels is not None:
            # for classification, always turn y into numeric form, even if already integer
            encoder = LabelEncoder().fit(self.labels)
            target = dt.Frame(encoder.transform(y))

        return self.cvte.fit_transform(prefixes, target)

    def transform(self, X: dt.Frame):
        """Apply the fitted binner and target encoder to new data.

        Requires ``fit_transform`` to have been called first, since it sets
        ``self.binner`` and ``self.cvte``.
        """
        prefixes = self.binner.transform(X, 3)
        return self.cvte.transform(prefixes)
class frst4ChrsCVTE(CustomTransformer):
    """Target-encode one text column keyed on the prefix from ``firstNChars`` with n=4.

    The column is first passed through ``firstNChars`` and the result is then
    handed to ``CVTargetEncodeTransformer`` as a categorical column.
    """

    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail
    _unsupervised = False  # uses target
    _uses_target = True  # uses target

    @staticmethod
    def get_default_properties():
        """Declare the accepted input: exactly one column of type ``text``."""
        return {
            "col_type": "text",
            "min_cols": 1,
            "max_cols": 1,
            "relative_importance": 1,
        }

    def fit_transform(self, X: dt.Frame, y: np.array = None):
        """Fit the prefix binner and target encoder, returning the encoded frame.

        :param X: single-column text frame
        :param y: target values; label-encoded against ``self.labels`` when
            ``self.labels`` is not None
        :return: whatever ``CVTargetEncodeTransformer.fit_transform`` returns
        """
        self.binner = firstNChars()
        prefixes = self.binner.fit_transform(X, 4)

        # Compute mean target (out of fold) per same string
        self.cvte = CVTargetEncodeTransformer(cat_cols=prefixes.names)

        target = y
        if self.labels is not None:
            # for classification, always turn y into numeric form, even if already integer
            encoder = LabelEncoder().fit(self.labels)
            target = dt.Frame(encoder.transform(y))

        return self.cvte.fit_transform(prefixes, target)

    def transform(self, X: dt.Frame):
        """Apply the fitted binner and target encoder to new data.

        Requires ``fit_transform`` to have been called first, since it sets
        ``self.binner`` and ``self.cvte``.
        """
        prefixes = self.binner.transform(X, 4)
        return self.cvte.transform(prefixes)