-
Notifications
You must be signed in to change notification settings - Fork 0
/
profilers.py
164 lines (130 loc) · 4.49 KB
/
profilers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
""""""
# from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV as gs
from sklearn.pipeline import Pipeline as p
from collections import defaultdict
from enum import Enum
import pandas as pd
import numpy as np
import pickle
from analyzers import DataFrameAnalyzer, DataScale
from messages import Message
class ErrorType(Enum):
    """Categories of data-quality errors a profiler/checker can report.

    UNAVAILABLE marks checks that could not be executed; UNDEFINED is the
    neutral default; the remaining members name concrete error classes.
    """
    UNAVAILABLE = -1
    UNDEFINED = 0
    MISSING_VALUE = 1
    ANOMALY = 2
    TYPO = 3
    DUPLICATE = 4
    INTEGRITY = 5
    NOT_IN_RANGE = 6
class Severity(Enum):
    """How serious a reported data error is (UNDEFINED < INFO < CRITICAL)."""
    UNDEFINED = 0
    INFO = 1
    CRITICAL = 2
class ColumnProfile:
    """Per-column summary produced by DataFrameProfiler.

    Holds the column's name, inferred dtype and measurement scale, the
    number of missing and unique values, and a value histogram.  For
    nominal columns, ``range`` is the list of observed categories (the
    first element of each histogram entry); otherwise it is None.
    """

    def __init__(self, column_name,
                 dtype, scale, num_missing,
                 num_unique, histogram):
        self.column_name = column_name
        self.dtype = dtype
        self.scale = scale
        self.num_missing = num_missing
        self.num_unique = num_unique
        self.histogram = histogram
        # Only nominal (categorical) columns get an explicit value range;
        # numeric ranges are not derived here.
        if scale == DataScale.NOMINAL:
            self.range = [t[0] for t in self.histogram]
        else:
            self.range = None

    def str(self):
        """Legacy accessor kept for backward compatibility.

        Prefer the builtin ``str(profile)``; this method's name shadows
        the ``str`` builtin inside the class namespace.
        """
        return self.__repr__()

    def __repr__(self):
        # NOTE: dtype is expected to expose a .name attribute
        # (e.g. an Enum member or numpy dtype) — confirm with callers.
        return """
        column : %s
        type : %s
        missing : %d
        unique : %d
        histogram : %s
        """ % (self.column_name, self.dtype.name, self.num_missing,
               self.num_unique, self.histogram)

    # Fix: implement the string protocol properly so str(profile) and
    # print(profile) use the same text as repr(profile).
    __str__ = __repr__
class Profiler:
    """Abstract marker base for all profiler implementations.

    Carries no state or behavior of its own; subclasses supply the
    actual profiling logic.
    """
class DataFrameProfiler(Profiler):
    """Builds a ColumnProfile for every column of a pandas DataFrame.

    Usable as a context manager or fluently via ``on(data)``.
    """

    def __init__(self):
        self.profiles = []

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Fix: the context-manager protocol passes three arguments on
        # exit; the old single-argument signature made every `with`
        # block raise TypeError.  Returning False propagates exceptions.
        return False

    def on(self, data):
        """Fluent alias: profile all columns of ``data``."""
        return self.run(data, columns=None)

    def for_column(self, column_name):
        """Return the profile for ``column_name`` from the last run.

        Raises AssertionError if the column was not profiled.
        """
        columns = [profile.column_name for profile in self.profiles]
        assert column_name in columns, Message().no_column % column_name
        return self.profiles[columns.index(column_name)]

    def run(self, data, columns=None):
        """Profile ``data`` (optionally restricted to ``columns``).

        Delegates dtype/scale/histogram inference to DataFrameAnalyzer
        and returns self for chaining.
        """
        self.profiles = []
        self.analyzer = DataFrameAnalyzer()
        self.analyzer.run(data, columns)
        for i, col in enumerate(data.columns):
            scale = self.analyzer.scales[i]
            # Fix: count missing values in COLUMN i, not row i —
            # data.iloc[i] selects a row; data.iloc[:, i] is the column.
            num_missing = np.sum(~data.iloc[:, i].notna())
            profile = ColumnProfile(col, self.analyzer.dtypes[i],
                                    scale,
                                    num_missing,
                                    self.analyzer.stats.loc['unique', col],
                                    self.analyzer.histograms[i])
            self.profiles.append(profile)
        return self
class PipelineProfiler(Profiler):
    """Placeholder for a generic (framework-agnostic) pipeline profiler.

    No behavior yet; inherits the no-op construction from Profiler.
    """
class TensorflowPipelineProfiler(Profiler):
    """Placeholder for a TensorFlow pipeline profiler.

    No behavior yet; inherits the no-op construction from Profiler.
    """
class SklearnPipelineProfiler(Profiler):
    """Inspects the steps of an sklearn Pipeline (or GridSearchCV).

    ``rules`` maps a transformer class name to a callable extracting its
    learned state; unknown classes yield None from derive_info().
    """

    def __init__(self):
        self.profiles = defaultdict(list)
        # Dispatch table keyed on the transformer's class name.
        # NOTE(review): the 'RandomForest' rule assumes the estimator
        # carries a pickled blob in `.dump` — confirm with the project's
        # models module before relying on it.
        self.rules = dict({
            'OneHotEncodingTransformer': lambda x: vars(x.encoder),
            'OrdinalScaleTransformer': lambda x: vars(x),
            'ColumnTransformer': lambda x: self.__analyzeCT(x),
            'StandardScaler': lambda x: [x.mean_, x.var_],
            'SimpleImputer': lambda x: x.statistics_,
            'RandomForest': lambda x: vars(pickle.loads(x.dump))
        })

    def __analyzeCT(self, func):
        # Fix: sklearn ColumnTransformer entries are
        # (name, transformer, columns); the old code unpacked them as
        # (new_name, transformer, name) and printed the column spec
        # where the step name was intended.
        for name, transformer, columns in func.transformers:
            print("%s\n%s\n%s\n" % (name, transformer.__class__.__name__,
                                    self.derive_info(transformer)))

    def derive_info(self, transformer):
        """Return the extracted state for ``transformer``, or None if no
        rule matches its class name."""
        name = transformer.__class__.__name__
        return self.rules[name](transformer) if name in self.rules else None

    def on(self, pipeline):
        """Fluent alias for run()."""
        return self.run(pipeline)

    def run(self, pipeline):
        """Walk the pipeline's steps; currently diagnostic-print only.

        Accepts either a Pipeline or a GridSearchCV (unwrapped to its
        underlying estimator).  Always returns None.
        """
        print(type(pipeline))
        # assert isinstance(pipeline, p), "sklearn.pipeline.Pipeline required"
        if isinstance(pipeline, gs):
            pipeline = pipeline.estimator
            print(type(pipeline))
        for step in pipeline.steps:
            name, func = step
            # per-step reporting intentionally disabled:
            # print("%s\n%s\n" % (name, self.derive_info(func)))
        return None
def main():
    """Smoke-test both profilers against the credit-g dataset."""
    from pipelines import CreditGPipeline
    from models import RandomForest

    frame = pd.read_csv('resources/data/dataset_31_credit-g.csv')
    DataFrameProfiler().on(frame)

    pipeline = CreditGPipeline()
    estimator = pipeline.with_estimator(RandomForest(40))
    SklearnPipelineProfiler().on(estimator)
# Run the smoke test only when executed as a script, not on import.
if __name__ == "__main__":
    main()