Commit
Fix scikit-learn 0.23 + pandas 1.0 compatibility issues (#66)
* add failing test: TruncatedSVD with a mixture of int and float sparse Series

* add failing test: sklearn KMeans seed doesn't work

* add test: conversion of DataFrame to NumPy with mixed sparse int/float types

* remove deprecated args

* remove sparsity using a function instead of .values

* rename calinski_harabaz_score -> calinski_harabasz_score

* add 'remove_sparse_serie' wrapper argument to force the removal of sparse Series

* fix n_init to force non-random behavior (see the KMeans sketch after this list)

* remove deprecated attribute

* remove too specific test (value depends on sklearn version)

* clean up old code

* Update pythonapp.yml

* add kmeans_other_params in KMeansTransformer

* fix n_init = 1 (to make KMeans deterministic)

* allow small differences between fit_transform and transform for KMeansTransformer

* split TruncatedSVD error test, keep sklearn test as 'expected fail'

* add 'reason' argument

* Deactivate SparseDataFrame tests for pandas<1

* Deactivate tests on sparse data frames if pandas<1

* Skip sparse data frame tests if pandas<1

Deactivate test_TruncatedSVDWrapperSparseData_wrapper test
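
Several of the bullets above deal with making KMeans reproducible. As a minimal sketch of the idea (not aikit's actual wrapper code): a single initialization plus a fixed seed pins scikit-learn's KMeans output.

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(100, 5)

# n_init > 1 runs several random initializations and keeps the best one,
# which together with an unseeded state made results hard to reproduce;
# a single init plus a fixed random_state makes the outcome deterministic.
km_a = KMeans(n_clusters=3, n_init=1, random_state=123).fit(X)
km_b = KMeans(n_clusters=3, n_init=1, random_state=123).fit(X)
assert np.array_equal(km_a.labels_, km_b.labels_)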

Co-authored-by: Guillaume Fournier <[email protected]>
LionelMassoulard and gfournier authored Aug 20, 2020
1 parent 4f489bc commit 1b82e8d
Showing 20 changed files with 186 additions and 250 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pythonapp.yml
@@ -70,8 +70,8 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        scikitlearn-version: [0.21.3, 0.22.2]
-        pandas-version: [0.25.3]
+        scikitlearn-version: [0.21.3, 0.22.2, 0.23.1]
+        pandas-version: [0.25.3, 1.0.5]

     steps:
       - uses: actions/checkout@v2
2 changes: 1 addition & 1 deletion aikit/ml_machine/ml_machine.py
@@ -1287,7 +1287,7 @@ def guess_scoring(self, auto_ml_config):
self.scoring = ["accuracy", "log_loss_patched", "avg_roc_auc", "f1_macro"]

elif auto_ml_config.type_of_problem == en.TypeOfProblem.CLUSTERING:
self.scoring = ["silhouette", "calinski_harabaz", "davies_bouldin"]
self.scoring = ["silhouette", "calinski_harabasz", "davies_bouldin"]

else:
self.scoring = ["r2", "neg_mean_squared_error", "neg_mean_absolute_error"]
2 changes: 1 addition & 1 deletion aikit/ml_machine/ml_machine_guider.py
@@ -76,7 +76,7 @@ def get_metric_default_transformation(metric_name):
         # Metric where 'perfection' is 0 => focus on differences with 0
         return lambda x: -np.log10(-x)

-    elif metric_name in {"calinski_harabaz"}:
+    elif metric_name in {"calinski_harabasz"}:
         return lambda x: np.log10(1 + x)

     else:
198 changes: 0 additions & 198 deletions aikit/ml_machine/model_graph.py
@@ -597,201 +597,3 @@ def _rec_convert_graph_to_code(
composition_already_done=composition_already_done,
)


# In[] : Old functions


def convert_graph_to_code_OLD(G, all_models_params):
""" convertion of Graphical model into a json representation
Parameters
----------
G : nx.DiGraph
graph representing the model, each node should be a 2-uple : (name_of_step,name_of_model)
all_models_params : dict
parameters of each models, key = node, value = dictionnary of hyper-parameters for this node
Returns
-------
json like python object representing the model
"""
all_params = {}
for node in G.nodes:
all_params[node] = (node[1][1], all_models_params.get(node, {}))

assert_model_graph_structure(G)

return _rec_convert_graph_to_code_OLD(G, all_params)


def _rec_convert_graph_to_code_OLD(G, all_params):
""" recursive function to convert a graph into a json representation """
if len(G.nodes) == 0:
return {}

### 1) Find First composition node
has_composition = False
for node in gh.iter_graph(G):
if StepCategories.is_composition_step(node[0]):
has_composition = True
break

return_gpipe = not has_composition

if has_composition:
### If there is a composition node, I need to split between what is above and what is bellow
predecessors = gh.get_all_predecessors(G, node)
successors = gh.get_all_successors(G, node)

if not gh.is_it_a_partition(list(G.nodes), [predecessors, [node], successors]):
raise ValueError("Incorrect graph, wrong split around node %s" % str(node))

if len(successors) == 0:
# If nothing bellow, I'll be able to return something
return_gpipe = True

if return_gpipe:

if len(G.nodes) > 1:
### I'll create a GraphPipeline object

edges = gh.edges_from_graph(G)

model_name_mapping = _create_name_mapping(list(G.nodes))
# each node in graph will be mapped to a name within the GraphPipeline

models = {model_name_mapping[n]: all_params[n] for n in G.nodes}

edges = [tuple((model_name_mapping[e] for e in edge)) for edge in edges]

return (SpecialModels.GraphPipeline, {"models": models, "edges": edges})

else:
### Otherwise it is just the model_name with its parameters
return node[1][1], all_params[list(G.nodes)[0]]

G_above = G.subgraph(predecessors + [node])
G_bellow = G.subgraph(successors)

connected_Gbellow = gh.get_connected_graphs(G_bellow)
if len(connected_Gbellow) == 1:
# what is bellow is a 'connected graph' : it means that the composition need should be applied to One model
all_params[node] = _rec_convert_graph_to_code_OLD(G_bellow, all_params)

else:
# otherwise, the composition will be applied to a list of models
all_params[node] = [_rec_convert_graph_to_code_OLD(g, all_params) for g in connected_Gbellow]

return _rec_convert_graph_to_code_OLD(G_above, all_params)


def convert_graph_to_code_OLD2(Graph, all_models_params, also_returns_mapping=False):
""" convertion of a Graph representing a model into its json code
Parameter
---------
Graph : nx.DirectGraph
the graph of the model, each node as the form ( step, (step, klass) )
all_models_params : dict
hyperparameters of each model, keys = node of Graph, values = corresponding hyper-parameters
also_returns_mapping : boolean, default = False
if True will return a dictionnary with 'name_mapping' and 'json_code' as its key.
So that the name in the GraphPipeline can be accessed
otherwise will just return the json_code
Return
------
a json-like object representing the model than can be translated into a model using 'sklearn_model_from_param'
"""
models_dico = {node: (_klass_from_node(node), all_models_params[node]) for node in Graph.nodes}

model_name_mapping = _create_name_mapping(Graph.nodes)

rec_result = _rec_convert_graph_to_code_OLD(
Graph=Graph, all_models_params=all_models_params, models_dico=models_dico, model_name_mapping=model_name_mapping
)

if not also_returns_mapping:
return rec_result
else:
return {"name_mapping": model_name_mapping, "json_code": rec_result}


def _rec_convert_graph_to_code_OLD2(Graph, all_models_params, models_dico, model_name_mapping=None):
""" recursive function used to convert a Graph into a json code
See convert_graph_to_code
"""

### ** only one node in Graph : I'll return what was saved in models_dico ** ###
if len(Graph.nodes) == 1:
node = list(Graph.nodes)[0]
return models_dico[node]

node = _find_first_composition_node(Graph)

if node is not None:
predecessors = gh.get_all_predecessors(Graph, node)
successors = gh.get_all_successors(Graph, node)

if not gh.is_it_a_partition(list(Graph.nodes), [predecessors, [node], successors]):
raise ValueError("Incorrect graph, wrong split around node %s" % str(node))
else:
predecessors = []
successors = []

if node is None or len(successors) == 0:
### ** It's means I'll return a GraphPipeline ** ###
edges = gh.edges_from_graph(Graph)

if model_name_mapping is None:
model_name_mapping = _create_name_mapping(list(Graph.nodes))
# each node in graph will be mapped to a name within the GraphPipeline

models = {model_name_mapping[n]: models_dico[n] for n in Graph.nodes}

edges = [tuple((model_name_mapping[e] for e in edge)) for edge in edges]

return (SpecialModels.GraphPipeline, {"models": models, "edges": edges})

Graph_bellow = Graph.subgraph(successors)

connected_Gbellow = gh.get_connected_graphs(Graph_bellow)

if len(predecessors) == 0 and len(connected_Gbellow) > 1:

return (
_klass_from_node(node),
[
_rec_convert_graph_to_code_OLD2(Gb, all_models_params, models_dico, model_name_mapping)
for Gb in connected_Gbellow
],
all_models_params[node],
)

elif len(predecessors) == 0 and len(connected_Gbellow) == 1:

return (
_klass_from_node(node),
_rec_convert_graph_to_code_OLD2(Graph_bellow, all_models_params, models_dico, model_name_mapping),
all_models_params[node],
)

else:

G_bellow_and_node = Graph.subgraph([node] + successors)
G_above = Graph.subgraph(predecessors + [node])

models_dico[node] = _rec_convert_graph_to_code_OLD2(
G_bellow_and_node, all_models_params, models_dico, model_name_mapping
)

return _rec_convert_graph_to_code(G_above, all_models_params, models_dico, model_name_mapping)
22 changes: 10 additions & 12 deletions aikit/models/base.py
@@ -56,21 +56,19 @@ def __init__(
         verbose=0,
         random_state=None,
         copy_x=True,
-        n_jobs=None,
         algorithm="auto",
     ):
         super(KMeansWrapper, self).__init__(
-            n_clusters,
-            init,
-            n_init,
-            max_iter,
-            tol,
-            precompute_distances,
-            verbose,
-            random_state,
-            copy_x,
-            n_jobs,
-            algorithm,
+            n_clusters=n_clusters,
+            init=init,
+            n_init=n_init,
+            max_iter=max_iter,
+            tol=tol,
+            precompute_distances=precompute_distances,
+            verbose=verbose,
+            random_state=random_state,
+            copy_x=copy_x,
+            algorithm=algorithm,
         )

     def fit(self, X, y=None, sample_weight=None):
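Forwarding the arguments by keyword, as the new version above does, insulates the wrapper from upstream signature changes (scikit-learn 0.23 deprecated KMeans parameters such as n_jobs and precompute_distances). A small, hypothetical illustration of the failure mode this avoids:

# Hypothetical parent class: imagine a former second parameter was removed
# upstream. Positional forwarding would silently bind values to the wrong
# names; keyword forwarding keeps working, or fails loudly.
class Parent:
    def __init__(self, a, c=2):
        self.a, self.c = a, c

p = Parent(a=0, c=5)  # unaffected by the removed parameter
assert (p.a, p.c) == (0, 5)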
2 changes: 2 additions & 0 deletions aikit/models/random_forest_addins.py
@@ -931,6 +931,7 @@ def __init__(
             work_on_one_column_only=False,
             all_columns_at_once=True,
             accepted_input_types=None,
+            remove_sparse_serie=False,
             column_prefix=None,
             desired_output_type=desired_output_type,
             must_transform_to_get_features_name=False,

@@ -987,6 +988,7 @@ def __init__(
             work_on_one_column_only=False,
             all_columns_at_once=True,
             accepted_input_types=None,
+            remove_sparse_serie=False,
             column_prefix=None,
             desired_output_type=desired_output_type,
             must_transform_to_get_features_name=False,
12 changes: 9 additions & 3 deletions aikit/scorer.py
@@ -9,7 +9,13 @@

 import sklearn.metrics
 from sklearn.metrics.regression import _check_reg_targets, r2_score
-from sklearn.metrics import silhouette_score, calinski_harabaz_score, davies_bouldin_score
+from sklearn.metrics import silhouette_score, davies_bouldin_score
+try:
+    from sklearn.metrics import calinski_harabasz_score
+except ImportError:
+    from sklearn.metrics import calinski_harabaz_score
+    calinski_harabasz_score = calinski_harabaz_score
+

 from sklearn.metrics.scorer import SCORERS, _BaseScorer, type_of_target

@@ -288,7 +294,7 @@ def _max_proba_is_true(sub_group):

 log_r2_scorer = sklearn.metrics.make_scorer(log_r2_score)
 silhouette_scorer = make_scorer_clustering(silhouette_score, metric="euclidean", greater_is_better=True)
-calinski_harabaz_scorer = make_scorer_clustering(calinski_harabaz_score, greater_is_better=True)
+calinski_harabasz_scorer = make_scorer_clustering(calinski_harabasz_score, greater_is_better=True)

 davies_bouldin_scorer = make_scorer_clustering(davies_bouldin_score, greater_is_better=False)

@@ -299,5 +305,5 @@ def _max_proba_is_true(sub_group):
SCORERS["confidence_score"] = confidence_score()
SCORERS["log_r2"] = log_r2_scorer
SCORERS["silhouette"] = silhouette_scorer
SCORERS["calinski_harabaz"] = calinski_harabaz_scorer
SCORERS["calinski_harabasz"] = calinski_harabasz_scorer
SCORERS["davies_bouldin"] = davies_bouldin_scorer
2 changes: 1 addition & 1 deletion aikit/tools/data_structure_helper.py
@@ -127,7 +127,7 @@ def convert_to_array(xx, mapped_type=None):
         return convert_to_array(convert_to_dataframe(xx))

     if mapped_type == DataTypes.DataFrame:
-        return get_rid_of_categories(xx).values
+        return get_rid_of_categories(get_rid_of_sparse_columns(xx)).values

     elif mapped_type == DataTypes.Serie:
         return xx.values.reshape((xx.shape[0], 1))
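The new call path goes through a get_rid_of_sparse_columns helper whose body is not part of this excerpt. A minimal sketch of what such a helper might look like, assuming the pandas >= 0.25 sparse accessor API:

import pandas as pd

# Hypothetical sketch of the helper's role: densify sparse columns so that
# .values yields a plain numeric ndarray (a DataFrame mixing int and float
# sparse Series could otherwise come back as an object array).
def get_rid_of_sparse_columns(df):
    df = df.copy()
    for col in df.columns:
        if pd.api.types.is_sparse(df[col]):
            df[col] = df[col].sparse.to_dense()
    return df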