From 5b9466ccbe782a9a1bbb0466f6440be1e8fb1afa Mon Sep 17 00:00:00 2001 From: Andrei Stoian <95410270+andrei-stoian-zama@users.noreply.github.com> Date: Wed, 11 Dec 2024 12:51:14 +0100 Subject: [PATCH] fix: faster rounding test in weekly (#957) Co-authored-by: jfrery --- .github/workflows/continuous-integration.yaml | 2 ++ .../DecisionTreeClassifier.ipynb | 2 +- .../ExperimentPrivacyTreePaper.ipynb | 31 ++++++++++++++----- .../LogisticRegressionTraining.ipynb | 4 +-- tests/sklearn/test_sklearn_models.py | 17 +--------- 5 files changed, 30 insertions(+), 26 deletions(-) diff --git a/.github/workflows/continuous-integration.yaml b/.github/workflows/continuous-integration.yaml index fce1874ea..0c7554e55 100644 --- a/.github/workflows/continuous-integration.yaml +++ b/.github/workflows/continuous-integration.yaml @@ -980,10 +980,12 @@ jobs: run: | ./script/make_utils/check_installation_with_all_python.sh --version ${{ matrix.python_version }} --sync_env + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4679 # Check installation with pip - name: Check installation with pip and python ${{ matrix.python_version }} (weekly) if: | (fromJSON(env.IS_WEEKLY)) + && matrix.python_version != '3.12' && steps.conformance.outcome == 'success' && !cancelled() run: | diff --git a/docs/advanced_examples/DecisionTreeClassifier.ipynb b/docs/advanced_examples/DecisionTreeClassifier.ipynb index da5714ffb..8b94a616b 100644 --- a/docs/advanced_examples/DecisionTreeClassifier.ipynb +++ b/docs/advanced_examples/DecisionTreeClassifier.ipynb @@ -78,7 +78,7 @@ "\n", "# List of hyper parameters to tune\n", "param_grid = {\n", - " \"max_features\": [None, \"auto\", \"sqrt\", \"log2\"],\n", + " \"max_features\": [None, \"sqrt\", \"log2\"],\n", " \"min_samples_leaf\": [1, 10, 100],\n", " \"min_samples_split\": [2, 10, 100],\n", " \"max_depth\": [None, 2, 4, 6, 8],\n", diff --git a/docs/advanced_examples/ExperimentPrivacyTreePaper.ipynb b/docs/advanced_examples/ExperimentPrivacyTreePaper.ipynb index 0b5b2c28b..5bf349784 100644 --- a/docs/advanced_examples/ExperimentPrivacyTreePaper.ipynb +++ b/docs/advanced_examples/ExperimentPrivacyTreePaper.ipynb @@ -130,28 +130,45 @@ " y (np.array): Target labels of the dataset.\n", " \"\"\"\n", " if data_id is not None:\n", - " X, y = fetch_openml(data_id=data_id, as_frame=False, cache=True, return_X_y=True)\n", + " X, y = fetch_openml(data_id=data_id, as_frame=True, cache=True, return_X_y=True)\n", " else:\n", - " X, y = fetch_openml(name=name, as_frame=False, cache=True, return_X_y=True)\n", + " X, y = fetch_openml(name=name, as_frame=True, cache=True, return_X_y=True)\n", " return X, y\n", "\n", "\n", + "def preprocess_features(X):\n", + " \"\"\"Convert categorical columns to numerical.\"\"\"\n", + " X_processed = X.copy()\n", + "\n", + " for column in X_processed.columns:\n", + " if X_processed[column].dtype == \"object\" or X_processed[column].dtype.name == \"category\":\n", + " # Convert categorical columns to numeric using label encoding\n", + " X_processed[column] = X_processed[column].astype(\"category\").cat.codes\n", + "\n", + " return X_processed.astype(np.float32)\n", + "\n", + "\n", "for ds_name, ds_id in dataset_names.items():\n", " print(f\"Loading {ds_name}\")\n", "\n", " X, y = load_dataset(ds_name, ds_id)\n", "\n", + " # Preprocess features (handle categorical data)\n", + " X = preprocess_features(X)\n", + "\n", " # Remove rows with NaN values\n", - " not_nan_idx = np.where(~np.isnan(X).any(axis=1))\n", - " X = X[not_nan_idx]\n", - " y = y[not_nan_idx]\n", + " not_nan_mask = ~np.isnan(X).any(axis=1)\n", + " X = X[not_nan_mask]\n", + " y = y[not_nan_mask]\n", "\n", " # Convert non-integer target labels to integers\n", " if not y.dtype == np.int64:\n", " encoder = OrdinalEncoder()\n", - " y = encoder.fit_transform(y.reshape(-1, 1)).astype(np.int32).squeeze()\n", + " # Convert pandas Series to numpy array before reshaping\n", + " y = encoder.fit_transform(np.array(y).reshape(-1, 1)).astype(np.int32).squeeze()\n", "\n", - " datasets[ds_name] = {\"X\": X, \"y\": y}" + " # Ensure both X and y are numpy arrays before storing\n", + " datasets[ds_name] = {\"X\": np.array(X), \"y\": np.array(y)}" ] }, { diff --git a/docs/advanced_examples/LogisticRegressionTraining.ipynb b/docs/advanced_examples/LogisticRegressionTraining.ipynb index ca9d7023b..0ad87b51b 100644 --- a/docs/advanced_examples/LogisticRegressionTraining.ipynb +++ b/docs/advanced_examples/LogisticRegressionTraining.ipynb @@ -111,7 +111,7 @@ "\n", "# Load the Iris dataset\n", "X_full, y_full = datasets.load_iris(return_X_y=True)\n", - "X_full = MinMaxScaler(feature_range=[-1, 1]).fit_transform(X_full)\n", + "X_full = MinMaxScaler(feature_range=(-1, 1)).fit_transform(X_full)\n", "\n", "# Select petal length and petal width for visualization\n", "X = X_full[:, 2:4] # Petal length and petal width\n", @@ -384,7 +384,7 @@ "X, y = datasets.load_breast_cancer(return_X_y=True)\n", "x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)\n", "\n", - "scaler = MinMaxScaler(feature_range=[-1, 1])\n", + "scaler = MinMaxScaler(feature_range=(-1, 1))\n", "x_train = scaler.fit_transform(x_train)\n", "x_test = scaler.transform(x_test)\n", "\n", diff --git a/tests/sklearn/test_sklearn_models.py b/tests/sklearn/test_sklearn_models.py index 5f500d4de..f12531869 100644 --- a/tests/sklearn/test_sklearn_models.py +++ b/tests/sklearn/test_sklearn_models.py @@ -1344,14 +1344,9 @@ def check_rounding_consistency( y, predict_method, metric, - is_weekly_option, ): """Test that Concrete ML without and with rounding are 'equivalent'.""" - # Run the test with more samples during weekly CIs - if is_weekly_option: - fhe_test = get_random_samples(x, n_sample=5) - # Check that rounding is enabled assert os.environ.get("TREES_USE_ROUNDING") == "1", "'TREES_USE_ROUNDING' is not enabled" @@ -1361,10 +1356,6 @@ def check_rounding_consistency( rounded_predict_quantized = predict_method(x, fhe="disable") rounded_predict_simulate = predict_method(x, fhe="simulate") - # Compute the FHE predictions only during weekly CIs - if is_weekly_option: - rounded_predict_fhe = predict_method(fhe_test, fhe="execute") - with pytest.MonkeyPatch.context() as mp_context: # Disable rounding @@ -1389,11 +1380,6 @@ def check_rounding_consistency( metric(rounded_predict_quantized, not_rounded_predict_quantized) metric(rounded_predict_simulate, not_rounded_predict_simulate) - # Compute the FHE predictions only during weekly CIs - if is_weekly_option: - not_rounded_predict_fhe = predict_method(fhe_test, fhe="execute") - metric(rounded_predict_fhe, not_rounded_predict_fhe) - # Check that the maximum bit-width of the circuit with rounding is at most: # maximum bit-width (of the circuit without rounding) + 2 # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4178 @@ -2076,7 +2062,7 @@ def test_linear_models_have_no_tlu( # Additional tests for this purpose should be added in future updates # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4179 @pytest.mark.parametrize("model_class, parameters", get_sklearn_tree_models_and_datasets()) -@pytest.mark.parametrize("n_bits", [2, 5, 10]) +@pytest.mark.parametrize("n_bits", [2, 5, 8]) def test_rounding_consistency_for_regular_models( model_class, parameters, @@ -2110,7 +2096,6 @@ def test_rounding_consistency_for_regular_models( y, predict_method, metric, - is_weekly_option, )