Commit
Fix scikit-learn 0.23 + pandas 1.0 compatibility issues (#66)
* add failing test: TruncatedSVD with a mixture of int and float sparse Series

* add failing test: sklearn KMeans seed doesn't work

* add test: conversion of DataFrame to NumPy with mixed sparse int/float types

* remove deprecated args

* remove sparsity using a function instead of .values

* rename calinski_harabaz_score -> calinski_harabasz_score

* add 'remove_sparse_serie' wrapper argument to force the removal of sparse Series

* fix n_init to force non-random behavior (see the KMeans sketch after this list)

* remove deprecated attribute

* remove too specific test (value depends on sklearn version)

* clean up old code

* Update pythonapp.yml

* add kmeans_other_params in KMeansTransformer

* fix n_init = 1 (to make KMeans deterministic)

* allow small differences between fit_transform and transform for KMeansTransformer

* split TruncatedSVD error test, keep sklearn test as 'expected fail'

* add 'reason' argument

* Deactivate SparseDataFrame tests for pandas<1

* Deactivate tests on sparse data frames if pandas<1

* Skip sparse data frame tests if pandas<1

Deactivate test_TruncatedSVDWrapperSparseData_wrapper test
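
Several of the bullets above deal with making KMeans reproducible. As a minimal sketch of the idea (not aikit's actual wrapper code): a single initialization plus a fixed seed pins scikit-learn's KMeans output.

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(100, 5)

# n_init > 1 runs several random initializations and keeps the best one,
# which together with an unseeded state made results hard to reproduce;
# a single init plus a fixed random_state makes the outcome deterministic.
km_a = KMeans(n_clusters=3, n_init=1, random_state=123).fit(X)
km_b = KMeans(n_clusters=3, n_init=1, random_state=123).fit(X)
assert np.array_equal(km_a.labels_, km_b.labels_)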

Co-authored-by: Guillaume Fournier <[email protected]>
LionelMassoulard and gfournier authored Aug 20, 2020
1 parent 4f489bc commit 1b82e8d
Showing 20 changed files with 186 additions and 250 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pythonapp.yml
@@ -70,8 +70,8 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        scikitlearn-version: [0.21.3, 0.22.2]
-        pandas-version: [0.25.3]
+        scikitlearn-version: [0.21.3, 0.22.2, 0.23.1]
+        pandas-version: [0.25.3, 1.0.5]

     steps:
       - uses: actions/checkout@v2
2 changes: 1 addition & 1 deletion aikit/ml_machine/ml_machine.py
@@ -1287,7 +1287,7 @@ def guess_scoring(self, auto_ml_config):
self.scoring = ["accuracy", "log_loss_patched", "avg_roc_auc", "f1_macro"]

elif auto_ml_config.type_of_problem == en.TypeOfProblem.CLUSTERING:
self.scoring = ["silhouette", "calinski_harabaz", "davies_bouldin"]
self.scoring = ["silhouette", "calinski_harabasz", "davies_bouldin"]

else:
self.scoring = ["r2", "neg_mean_squared_error", "neg_mean_absolute_error"]
2 changes: 1 addition & 1 deletion aikit/ml_machine/ml_machine_guider.py
@@ -76,7 +76,7 @@ def get_metric_default_transformation(metric_name):
         # Metric where 'perfection' is 0 => focus on differences with 0
         return lambda x: -np.log10(-x)

-    elif metric_name in {"calinski_harabaz"}:
+    elif metric_name in {"calinski_harabasz"}:
         return lambda x: np.log10(1 + x)

     else:
198 changes: 0 additions & 198 deletions aikit/ml_machine/model_graph.py
@@ -597,201 +597,3 @@ def _rec_convert_graph_to_code(
composition_already_done=composition_already_done,
)


# In[] : Old functions


def convert_graph_to_code_OLD(G, all_models_params):
""" convertion of Graphical model into a json representation
Parameters
----------
G : nx.DiGraph
graph representing the model, each node should be a 2-uple : (name_of_step,name_of_model)
all_models_params : dict
parameters of each models, key = node, value = dictionnary of hyper-parameters for this node
Returns
-------
json like python object representing the model
"""
all_params = {}
for node in G.nodes:
all_params[node] = (node[1][1], all_models_params.get(node, {}))

assert_model_graph_structure(G)

return _rec_convert_graph_to_code_OLD(G, all_params)


def _rec_convert_graph_to_code_OLD(G, all_params):
""" recursive function to convert a graph into a json representation """
if len(G.nodes) == 0:
return {}

### 1) Find First composition node
has_composition = False
for node in gh.iter_graph(G):
if StepCategories.is_composition_step(node[0]):
has_composition = True
break

return_gpipe = not has_composition

if has_composition:
### If there is a composition node, I need to split between what is above and what is bellow
predecessors = gh.get_all_predecessors(G, node)
successors = gh.get_all_successors(G, node)

if not gh.is_it_a_partition(list(G.nodes), [predecessors, [node], successors]):
raise ValueError("Incorrect graph, wrong split around node %s" % str(node))

if len(successors) == 0:
# If nothing bellow, I'll be able to return something
return_gpipe = True

if return_gpipe:

if len(G.nodes) > 1:
### I'll create a GraphPipeline object

edges = gh.edges_from_graph(G)

model_name_mapping = _create_name_mapping(list(G.nodes))
# each node in graph will be mapped to a name within the GraphPipeline

models = {model_name_mapping[n]: all_params[n] for n in G.nodes}

edges = [tuple((model_name_mapping[e] for e in edge)) for edge in edges]

return (SpecialModels.GraphPipeline, {"models": models, "edges": edges})

else:
### Otherwise it is just the model_name with its parameters
return node[1][1], all_params[list(G.nodes)[0]]

G_above = G.subgraph(predecessors + [node])
G_bellow = G.subgraph(successors)

connected_Gbellow = gh.get_connected_graphs(G_bellow)
if len(connected_Gbellow) == 1:
# what is bellow is a 'connected graph' : it means that the composition need should be applied to One model
all_params[node] = _rec_convert_graph_to_code_OLD(G_bellow, all_params)

else:
# otherwise, the composition will be applied to a list of models
all_params[node] = [_rec_convert_graph_to_code_OLD(g, all_params) for g in connected_Gbellow]

return _rec_convert_graph_to_code_OLD(G_above, all_params)


def convert_graph_to_code_OLD2(Graph, all_models_params, also_returns_mapping=False):
""" convertion of a Graph representing a model into its json code
Parameter
---------
Graph : nx.DirectGraph
the graph of the model, each node as the form ( step, (step, klass) )
all_models_params : dict
hyperparameters of each model, keys = node of Graph, values = corresponding hyper-parameters
also_returns_mapping : boolean, default = False
if True will return a dictionnary with 'name_mapping' and 'json_code' as its key.
So that the name in the GraphPipeline can be accessed
otherwise will just return the json_code
Return
------
a json-like object representing the model than can be translated into a model using 'sklearn_model_from_param'
"""
models_dico = {node: (_klass_from_node(node), all_models_params[node]) for node in Graph.nodes}

model_name_mapping = _create_name_mapping(Graph.nodes)

rec_result = _rec_convert_graph_to_code_OLD(
Graph=Graph, all_models_params=all_models_params, models_dico=models_dico, model_name_mapping=model_name_mapping
)

if not also_returns_mapping:
return rec_result
else:
return {"name_mapping": model_name_mapping, "json_code": rec_result}


def _rec_convert_graph_to_code_OLD2(Graph, all_models_params, models_dico, model_name_mapping=None):
""" recursive function used to convert a Graph into a json code
See convert_graph_to_code
"""

### ** only one node in Graph : I'll return what was saved in models_dico ** ###
if len(Graph.nodes) == 1:
node = list(Graph.nodes)[0]
return models_dico[node]

node = _find_first_composition_node(Graph)

if node is not None:
predecessors = gh.get_all_predecessors(Graph, node)
successors = gh.get_all_successors(Graph, node)

if not gh.is_it_a_partition(list(Graph.nodes), [predecessors, [node], successors]):
raise ValueError("Incorrect graph, wrong split around node %s" % str(node))
else:
predecessors = []
successors = []

if node is None or len(successors) == 0:
### ** It's means I'll return a GraphPipeline ** ###
edges = gh.edges_from_graph(Graph)

if model_name_mapping is None:
model_name_mapping = _create_name_mapping(list(Graph.nodes))
# each node in graph will be mapped to a name within the GraphPipeline

models = {model_name_mapping[n]: models_dico[n] for n in Graph.nodes}

edges = [tuple((model_name_mapping[e] for e in edge)) for edge in edges]

return (SpecialModels.GraphPipeline, {"models": models, "edges": edges})

Graph_bellow = Graph.subgraph(successors)

connected_Gbellow = gh.get_connected_graphs(Graph_bellow)

if len(predecessors) == 0 and len(connected_Gbellow) > 1:

return (
_klass_from_node(node),
[
_rec_convert_graph_to_code_OLD2(Gb, all_models_params, models_dico, model_name_mapping)
for Gb in connected_Gbellow
],
all_models_params[node],
)

elif len(predecessors) == 0 and len(connected_Gbellow) == 1:

return (
_klass_from_node(node),
_rec_convert_graph_to_code_OLD2(Graph_bellow, all_models_params, models_dico, model_name_mapping),
all_models_params[node],
)

else:

G_bellow_and_node = Graph.subgraph([node] + successors)
G_above = Graph.subgraph(predecessors + [node])

models_dico[node] = _rec_convert_graph_to_code_OLD2(
G_bellow_and_node, all_models_params, models_dico, model_name_mapping
)

return _rec_convert_graph_to_code(G_above, all_models_params, models_dico, model_name_mapping)
22 changes: 10 additions & 12 deletions aikit/models/base.py
@@ -56,21 +56,19 @@ def __init__(
         verbose=0,
         random_state=None,
         copy_x=True,
-        n_jobs=None,
         algorithm="auto",
     ):
         super(KMeansWrapper, self).__init__(
-            n_clusters,
-            init,
-            n_init,
-            max_iter,
-            tol,
-            precompute_distances,
-            verbose,
-            random_state,
-            copy_x,
-            n_jobs,
-            algorithm,
+            n_clusters=n_clusters,
+            init=init,
+            n_init=n_init,
+            max_iter=max_iter,
+            tol=tol,
+            precompute_distances=precompute_distances,
+            verbose=verbose,
+            random_state=random_state,
+            copy_x=copy_x,
+            algorithm=algorithm,
         )

     def fit(self, X, y=None, sample_weight=None):
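Forwarding the arguments by keyword, as the new version above does, insulates the wrapper from upstream signature changes (scikit-learn 0.23 deprecated KMeans parameters such as n_jobs and precompute_distances). A small, hypothetical illustration of the failure mode this avoids:

# Hypothetical parent class: imagine a former second parameter was removed
# upstream. Positional forwarding would silently bind values to the wrong
# names; keyword forwarding keeps working, or fails loudly.
class Parent:
    def __init__(self, a, c=2):
        self.a, self.c = a, c

p = Parent(a=0, c=5)  # unaffected by the removed parameter
assert (p.a, p.c) == (0, 5)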
2 changes: 2 additions & 0 deletions aikit/models/random_forest_addins.py
@@ -931,6 +931,7 @@ def __init__(
             work_on_one_column_only=False,
             all_columns_at_once=True,
             accepted_input_types=None,
+            remove_sparse_serie=False,
             column_prefix=None,
             desired_output_type=desired_output_type,
             must_transform_to_get_features_name=False,

@@ -987,6 +988,7 @@ def __init__(
             work_on_one_column_only=False,
             all_columns_at_once=True,
             accepted_input_types=None,
+            remove_sparse_serie=False,
             column_prefix=None,
             desired_output_type=desired_output_type,
             must_transform_to_get_features_name=False,
12 changes: 9 additions & 3 deletions aikit/scorer.py
@@ -9,7 +9,13 @@

 import sklearn.metrics
 from sklearn.metrics.regression import _check_reg_targets, r2_score
-from sklearn.metrics import silhouette_score, calinski_harabaz_score, davies_bouldin_score
+from sklearn.metrics import silhouette_score, davies_bouldin_score
+try:
+    from sklearn.metrics import calinski_harabasz_score
+except ImportError:
+    from sklearn.metrics import calinski_harabaz_score
+    calinski_harabasz_score = calinski_harabaz_score
+

 from sklearn.metrics.scorer import SCORERS, _BaseScorer, type_of_target

@@ -288,7 +294,7 @@ def _max_proba_is_true(sub_group):

 log_r2_scorer = sklearn.metrics.make_scorer(log_r2_score)
 silhouette_scorer = make_scorer_clustering(silhouette_score, metric="euclidean", greater_is_better=True)
-calinski_harabaz_scorer = make_scorer_clustering(calinski_harabaz_score, greater_is_better=True)
+calinski_harabasz_scorer = make_scorer_clustering(calinski_harabasz_score, greater_is_better=True)

 davies_bouldin_scorer = make_scorer_clustering(davies_bouldin_score, greater_is_better=False)

@@ -299,5 +305,5 @@ def _max_proba_is_true(sub_group):
SCORERS["confidence_score"] = confidence_score()
SCORERS["log_r2"] = log_r2_scorer
SCORERS["silhouette"] = silhouette_scorer
SCORERS["calinski_harabaz"] = calinski_harabaz_scorer
SCORERS["calinski_harabasz"] = calinski_harabasz_scorer
SCORERS["davies_bouldin"] = davies_bouldin_scorer
2 changes: 1 addition & 1 deletion aikit/tools/data_structure_helper.py
@@ -127,7 +127,7 @@ def convert_to_array(xx, mapped_type=None):
         return convert_to_array(convert_to_dataframe(xx))

     if mapped_type == DataTypes.DataFrame:
-        return get_rid_of_categories(xx).values
+        return get_rid_of_categories(get_rid_of_sparse_columns(xx)).values

     elif mapped_type == DataTypes.Serie:
         return xx.values.reshape((xx.shape[0], 1))
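The new call path goes through a get_rid_of_sparse_columns helper whose body is not part of this excerpt. A minimal sketch of what such a helper might look like, assuming the pandas >= 0.25 sparse accessor API:

import pandas as pd

# Hypothetical sketch of the helper's role: densify sparse columns so that
# .values yields a plain numeric ndarray (a DataFrame mixing int and float
# sparse Series could otherwise come back as an object array).
def get_rid_of_sparse_columns(df):
    df = df.copy()
    for col in df.columns:
        if pd.api.types.is_sparse(df[col]):
            df[col] = df[col].sparse.to_dense()
    return df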