fix: faster rounding test in weekly (#957)

Co-authored-by: jfrery <[email protected]>
zama-ai · Dec 11, 2024 · 5b9466c
1 parent b04284a
commit 5b9466c
Show file tree

Hide file tree

Showing 5 changed files with 30 additions and 26 deletions.
diff --git a/.github/workflows/continuous-integration.yaml b/.github/workflows/continuous-integration.yaml
@@ -980,10 +980,12 @@ jobs:
         run: |
           ./script/make_utils/check_installation_with_all_python.sh --version ${{ matrix.python_version }} --sync_env
 
+      # FIXME: re-enable this check for Python 3.12 (currently excluded in the condition below): https://github.com/zama-ai/concrete-ml-internal/issues/4679
       # Check installation with pip
       - name: Check installation with pip and python ${{ matrix.python_version }} (weekly)
         if: |
             (fromJSON(env.IS_WEEKLY))
+            && matrix.python_version != '3.12'
             && steps.conformance.outcome == 'success'
             && !cancelled()
         run: |

diff --git a/docs/advanced_examples/DecisionTreeClassifier.ipynb b/docs/advanced_examples/DecisionTreeClassifier.ipynb
@@ -78,7 +78,7 @@
     "\n",
     "# List of hyper parameters to tune\n",
     "param_grid = {\n",
-    "    \"max_features\": [None, \"auto\", \"sqrt\", \"log2\"],\n",
+    "    \"max_features\": [None, \"sqrt\", \"log2\"],\n",
     "    \"min_samples_leaf\": [1, 10, 100],\n",
     "    \"min_samples_split\": [2, 10, 100],\n",
     "    \"max_depth\": [None, 2, 4, 6, 8],\n",

diff --git a/docs/advanced_examples/ExperimentPrivacyTreePaper.ipynb b/docs/advanced_examples/ExperimentPrivacyTreePaper.ipynb
@@ -130,28 +130,45 @@
     "        y (np.array): Target labels of the dataset.\n",
     "    \"\"\"\n",
     "    if data_id is not None:\n",
-    "        X, y = fetch_openml(data_id=data_id, as_frame=False, cache=True, return_X_y=True)\n",
+    "        X, y = fetch_openml(data_id=data_id, as_frame=True, cache=True, return_X_y=True)\n",
     "    else:\n",
-    "        X, y = fetch_openml(name=name, as_frame=False, cache=True, return_X_y=True)\n",
+    "        X, y = fetch_openml(name=name, as_frame=True, cache=True, return_X_y=True)\n",
     "    return X, y\n",
     "\n",
     "\n",
+    "def preprocess_features(X):\n",
+    "    \"\"\"Convert categorical columns to numerical.\"\"\"\n",
+    "    X_processed = X.copy()\n",
+    "\n",
+    "    for column in X_processed.columns:\n",
+    "        if X_processed[column].dtype == \"object\" or X_processed[column].dtype.name == \"category\":\n",
+    "            # Convert categorical columns to numeric using label encoding\n",
+    "            X_processed[column] = X_processed[column].astype(\"category\").cat.codes\n",
+    "\n",
+    "    return X_processed.astype(np.float32)\n",
+    "\n",
+    "\n",
     "for ds_name, ds_id in dataset_names.items():\n",
     "    print(f\"Loading {ds_name}\")\n",
     "\n",
     "    X, y = load_dataset(ds_name, ds_id)\n",
     "\n",
+    "    # Preprocess features (handle categorical data)\n",
+    "    X = preprocess_features(X)\n",
+    "\n",
     "    # Remove rows with NaN values\n",
-    "    not_nan_idx = np.where(~np.isnan(X).any(axis=1))\n",
-    "    X = X[not_nan_idx]\n",
-    "    y = y[not_nan_idx]\n",
+    "    not_nan_mask = ~np.isnan(X).any(axis=1)\n",
+    "    X = X[not_nan_mask]\n",
+    "    y = y[not_nan_mask]\n",
     "\n",
     "    # Convert non-integer target labels to integers\n",
     "    if not y.dtype == np.int64:\n",
     "        encoder = OrdinalEncoder()\n",
-    "        y = encoder.fit_transform(y.reshape(-1, 1)).astype(np.int32).squeeze()\n",
+    "        # Convert pandas Series to numpy array before reshaping\n",
+    "        y = encoder.fit_transform(np.array(y).reshape(-1, 1)).astype(np.int32).squeeze()\n",
     "\n",
-    "    datasets[ds_name] = {\"X\": X, \"y\": y}"
+    "    # Ensure both X and y are numpy arrays before storing\n",
+    "    datasets[ds_name] = {\"X\": np.array(X), \"y\": np.array(y)}"
    ]
   },
   {

diff --git a/docs/advanced_examples/LogisticRegressionTraining.ipynb b/docs/advanced_examples/LogisticRegressionTraining.ipynb
@@ -111,7 +111,7 @@
     "\n",
     "# Load the Iris dataset\n",
     "X_full, y_full = datasets.load_iris(return_X_y=True)\n",
-    "X_full = MinMaxScaler(feature_range=[-1, 1]).fit_transform(X_full)\n",
+    "X_full = MinMaxScaler(feature_range=(-1, 1)).fit_transform(X_full)\n",
     "\n",
     "# Select petal length and petal width for visualization\n",
     "X = X_full[:, 2:4]  # Petal length and petal width\n",
@@ -384,7 +384,7 @@
     "X, y = datasets.load_breast_cancer(return_X_y=True)\n",
     "x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)\n",
     "\n",
-    "scaler = MinMaxScaler(feature_range=[-1, 1])\n",
+    "scaler = MinMaxScaler(feature_range=(-1, 1))\n",
     "x_train = scaler.fit_transform(x_train)\n",
     "x_test = scaler.transform(x_test)\n",
     "\n",

diff --git a/tests/sklearn/test_sklearn_models.py b/tests/sklearn/test_sklearn_models.py
@@ -1344,14 +1344,9 @@ def check_rounding_consistency(
     y,
     predict_method,
     metric,
-    is_weekly_option,
 ):
     """Test that Concrete ML without and with rounding are 'equivalent'."""
 
-    # Run the test with more samples during weekly CIs
-    if is_weekly_option:
-        fhe_test = get_random_samples(x, n_sample=5)
-
     # Check that rounding is enabled
     assert os.environ.get("TREES_USE_ROUNDING") == "1", "'TREES_USE_ROUNDING' is not enabled"
 
@@ -1361,10 +1356,6 @@ def check_rounding_consistency(
     rounded_predict_quantized = predict_method(x, fhe="disable")
     rounded_predict_simulate = predict_method(x, fhe="simulate")
 
-    # Compute the FHE predictions only during weekly CIs
-    if is_weekly_option:
-        rounded_predict_fhe = predict_method(fhe_test, fhe="execute")
-
     with pytest.MonkeyPatch.context() as mp_context:
 
         # Disable rounding
@@ -1389,11 +1380,6 @@ def check_rounding_consistency(
         metric(rounded_predict_quantized, not_rounded_predict_quantized)
         metric(rounded_predict_simulate, not_rounded_predict_simulate)
 
-        # Compute the FHE predictions only during weekly CIs
-        if is_weekly_option:
-            not_rounded_predict_fhe = predict_method(fhe_test, fhe="execute")
-            metric(rounded_predict_fhe, not_rounded_predict_fhe)
-
         # Check that the maximum bit-width of the circuit with rounding is at most:
         # maximum bit-width (of the circuit without rounding) + 2
         # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4178
@@ -2076,7 +2062,7 @@ def test_linear_models_have_no_tlu(
 # Additional tests for this purpose should be added in future updates
 # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4179
 @pytest.mark.parametrize("model_class, parameters", get_sklearn_tree_models_and_datasets())
-@pytest.mark.parametrize("n_bits", [2, 5, 10])
+@pytest.mark.parametrize("n_bits", [2, 5, 8])
 def test_rounding_consistency_for_regular_models(
     model_class,
     parameters,
@@ -2110,7 +2096,6 @@ def test_rounding_consistency_for_regular_models(
         y,
         predict_method,
         metric,
-        is_weekly_option,
     )