From 6f536e2b65515852811d6def3431da2d7139c773 Mon Sep 17 00:00:00 2001 From: jfrery Date: Tue, 10 Dec 2024 17:25:07 +0100 Subject: [PATCH 01/11] feat: add lora fine tuning for llama 3.2 --- .github/workflows/refresh-one-notebook.yaml | 2 + docs/advanced_examples/LoraMLP.ipynb | 280 +- docs/advanced_examples/aggregated_code.txt | 5248 +++++++++++++++++ .../ml/torch/hybrid_backprop_linear.py | 116 + src/concrete/ml/torch/hybrid_model.py | 7 +- src/concrete/ml/torch/lora.py | 445 +- tests/torch/test_lora.py | 821 +-- .../lora_finetuning/GPT2FineTuneHybrid.ipynb | 47 +- .../lora_finetuning/LLamaFineTuning.ipynb | 345 ++ use_case_examples/lora_finetuning/Makefile | 3 + .../data_finetune/dataset.jsonl | 46 + .../data_finetune/raw_cml_1.7.0_examples.txt | 458 ++ .../lora_finetuning/requirements.txt | 1 + .../lora_finetuning/scripts/create_dataset.py | 109 + .../lora_finetuning/utils_lora.py | 34 +- 15 files changed, 7129 insertions(+), 833 deletions(-) create mode 100644 docs/advanced_examples/aggregated_code.txt create mode 100644 src/concrete/ml/torch/hybrid_backprop_linear.py create mode 100644 use_case_examples/lora_finetuning/LLamaFineTuning.ipynb create mode 100644 use_case_examples/lora_finetuning/data_finetune/dataset.jsonl create mode 100644 use_case_examples/lora_finetuning/data_finetune/raw_cml_1.7.0_examples.txt create mode 100644 use_case_examples/lora_finetuning/scripts/create_dataset.py diff --git a/.github/workflows/refresh-one-notebook.yaml b/.github/workflows/refresh-one-notebook.yaml index 3713dadf8..96f4107b9 100644 --- a/.github/workflows/refresh-one-notebook.yaml +++ b/.github/workflows/refresh-one-notebook.yaml @@ -28,6 +28,7 @@ on: - KNearestNeighbors \n - LinearRegression \n - LinearSVR \n + - LLamaFineTuning \n - LogisticRegression \n - LogisticRegressionTraining \n - LoraMLP \n @@ -76,6 +77,7 @@ env: KNearestNeighbors: "docs/advanced_examples/KNearestNeighbors.ipynb" LinearRegression: "docs/advanced_examples/LinearRegression.ipynb" LinearSVR: 
"docs/advanced_examples/LinearSVR.ipynb" + LLamaFineTuning: "use_case_examples/lora_finetuning/LLamaFineTuning.ipynb" LogisticRegression: "docs/advanced_examples/LogisticRegression.ipynb" LogisticRegressionTraining: "docs/advanced_examples/LogisticRegressionTraining.ipynb" LoraMLP: "docs/advanced_examples/LoraMLP.ipynb" diff --git a/docs/advanced_examples/LoraMLP.ipynb b/docs/advanced_examples/LoraMLP.ipynb index 7b6dc6e7c..7a7015614 100644 --- a/docs/advanced_examples/LoraMLP.ipynb +++ b/docs/advanced_examples/LoraMLP.ipynb @@ -21,7 +21,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 1, @@ -31,7 +31,6 @@ ], "source": [ "import shutil\n", - "import time\n", "from pathlib import Path\n", "\n", "import matplotlib.pyplot as plt\n", @@ -41,10 +40,8 @@ "from sklearn.datasets import make_circles, make_moons\n", "from torch import nn, optim\n", "from torch.utils.data import DataLoader, TensorDataset\n", - "from tqdm import tqdm\n", "\n", - "from concrete.ml.torch.hybrid_model import HybridFHEModel\n", - "from concrete.ml.torch.lora import LoraTraining, get_remote_names\n", + "from concrete.ml.torch.lora import LoraTrainer\n", "\n", "# Set random seed for reproducibility\n", "SEED = 42\n", @@ -132,13 +129,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training on Task 1 without LoRA:\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "Training on Task 1 without LoRA:\n", "Epoch [20/20], Loss: 0.0036\n" ] }, @@ -276,25 +267,26 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LoRA layers detected in the model.\n" + ] + } + ], "source": [ - "# Set up LoRA training\n", - "lora_training = LoraTraining(peft_model)\n", - "\n", - "# Set up optimizer and scheduler\n", + "# Update training parameters, including loss function\n", "optimizer = optim.Adam(filter(lambda p: p.requires_grad, peft_model.parameters()), 
lr=0.01)\n", + "loss_fn = nn.CrossEntropyLoss()\n", + "training_args = {\"gradient_accumulation_steps\": 1}\n", "\n", - "# Update training parameters, including loss function\n", - "lora_training.update_training_parameters(\n", - " optimizer=optimizer,\n", - " loss_fn=nn.CrossEntropyLoss(),\n", - " training_args={\"gradient_accumulation_steps\": 1},\n", + "# Set up LoRA training\n", + "lora_trainer = LoraTrainer(\n", + " peft_model, optimizer=optimizer, loss_fn=loss_fn, training_args=training_args\n", ")\n", "\n", - "# Create the HybridFHEModel\n", - "remote_names = get_remote_names(lora_training)\n", - "hybrid_model = HybridFHEModel(lora_training, module_names=remote_names)\n", - "\n", "# Prepare input data for calibration\n", "batch_size_per_task = batch_size // 2\n", "inputset = (\n", @@ -302,10 +294,8 @@ " torch.cat([y_task1[:batch_size_per_task], y_task2[:batch_size_per_task]]),\n", ")\n", "\n", - "# Calibrate and compile the model\n", - "lora_training.toggle_calibrate(enable=True)\n", - "hybrid_model.compile_model(inputset, n_bits=8)\n", - "lora_training.toggle_calibrate(enable=False)" + "# Compile the model\n", + "lora_trainer.compile(inputset, n_bits=8)" ] }, { @@ -313,187 +303,11 @@ "execution_count": 6, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fine-tuning on Task 2 with LoRA:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\r", - "Training: 0%| | 0/10 [00:00 None: + # For a better visualization, we sort the predictions and the ground truth. 
+ y_true = np.array(y_true) + idx = np.argsort(y_true) + y_true_sorted = y_true[idx] + + for title, y_pred in y_preds.items(): + y_preds[title] = y_pred[idx].flatten() + + ncols, nrows = len(y_preds), 1 + + fig, axes = plt.subplots(nrows, ncols, figsize=(15, 5)) + + for i, ((title, y_pred), c) in enumerate(zip(y_preds.items(), colors)): + axes[i].scatter(np.arange(len(y_true_sorted)), y_true_sorted, c="r") + axes[i].scatter(np.arange(len(y_true_sorted)), y_pred, c=c, alpha=0.5) + axes[i].set_xlabel(title, labelpad=5) + axes[i].set_ylabel("Sale_Prices ($)") + # Hide x ticks, because it just refers to indexes. + axes[i].get_xaxis().set_ticks([]) + + # Set the spacing between subplots. + fig.tight_layout() + +plot_predictions( + y_test, + y_preds={"XGBoost": y_preds_XGBoost, "Quant. XGBoost": y_preds_non_fhe}, + colors=["g", "b"], +) + +print(f"R2_score with XGBoost: {metrics.r2_score(y_test, y_preds_XGBoost):.4f}") +print( + f"R2_score in FHE simulation (not encrypted): {metrics.r2_score(y_test, y_preds_non_fhe):.4f}" +) + +n_folds = 5 +param_grid = { + "n_bits": [2, 3, 4, 5, 6, 7], + "max_depth": [4], + "n_estimators": [10, 20, 50, 100], +} + +grid_search_concrete = GridSearchCV(ConcreteXGBRegressor(), param_grid, cv=n_folds, n_jobs=1) +grid_search_concrete.fit(X_train, y_train); + +results = pd.DataFrame(grid_search_concrete.cv_results_) + +print(f"Best score : {grid_search_concrete.best_score_:.3f}") +print(f"Best params: {grid_search_concrete.best_params_}") + +def lineplot(df, yaxis, ylabel, title, group_keys: str = "param_n_estimators"): + params = [ + {"color": "red", "linewidth": 1}, + {"color": "green", "marker": "x", "markersize": 5, "linewidth": 1}, + {"color": "magenta", "marker": "s", "markersize": 5, "dashes": (3, 20)}, + {"color": "blue", "marker": "^", "markersize": 5, "dashes": (3, 10)}, + {"color": "gold", "marker": "*", "markersize": 5, "dashes": (3, 40)}, + {"color": "black", "linestyle": "dashed", "dashes": (3, 10)}, + ] + + 
plt.figure(figsize=(15, 4)) + + for (key, grp), param in zip(df.groupby([group_keys]), params): + plt.plot(grp["param_n_bits"], grp[yaxis], **param, label=f"estimators_{key}") + + plt.title(title) + plt.ylabel(ylabel) + plt.xlabel("$n_{bits}$") + plt.legend(loc="best") + plt.ylim(0, 1) + plt.minorticks_on() + plt.show() + +lineplot( + df=results, + yaxis="mean_test_score", + ylabel="$r^2_{score}$", + title="$r^2_{score}$ given n_estimators and n_bits", +) + +best_params_xgboost = {"n_estimators": 50, "n_bits": 5} + +# Train the concrete xgboost with the best combination of parameters. +concrete_reg = ConcreteXGBRegressor(**best_params_xgboost, n_jobs=1) + +concrete_reg.fit(X_train, y_train) + +from concrete.compiler import check_gpu_available + +use_gpu_if_available = False +device = "cuda" if use_gpu_if_available and check_gpu_available() else "cpu" + +# Compile the model using the training data. +circuit = concrete_reg.compile(X_train, device=device) + +# Get the equivalent predictions in clear quantized data: +y_preds_clear = concrete_reg.predict(X_test, fhe="disable") + +# Perform the inference in FHE (simulation): +y_preds_simulated = concrete_reg.predict(X_test, fhe="simulate") + +plot_predictions( + y_test, + y_preds={ + "XGBoost": y_preds_XGBoost, + "Concrete ML without FHE": y_preds_clear, + "Concrete ML with FHE (simulation)": y_preds_simulated, + }, + colors=["g", "b", "m"], +) + +# Test in FHE on a smaller test set +FHE_SAMPLE = 20 +X_test_fhe = X_test[:FHE_SAMPLE] +y_test_fhe = y_test[:FHE_SAMPLE] + +# Perform the inference in FHE: +time_begin = time.time() +y_preds_fhe = concrete_reg.predict(X_test_fhe, fhe="execute") +print(f"FHE runtime per sample: {(time.time() - time_begin) / len(X_test_fhe):.2f} sec") + +# Evaluation + +r2_score_sklearn = metrics.r2_score(y_test, y_preds_XGBoost) +r2_score_clear_concrete = metrics.r2_score(y_test, y_preds_clear) +r2_score_simulated_concrete = metrics.r2_score(y_test, y_preds_simulated) +r2_score_fhe_concrete = 
metrics.r2_score(y_test_fhe, y_preds_fhe) + +print(f"R2_score with XGBoost : {r2_score_sklearn:.4f}") +print(f"R2_score without FHE : {r2_score_clear_concrete:.4f}") +print(f"R2_score with FHE (simulation) : {r2_score_simulated_concrete:.4f}") +print(f"R2_score with FHE : {r2_score_fhe_concrete:.4f}") + + + +# Code from: ./ExperimentPrivacyTreePaper.ipynb +-------------------------------------------------------------------------------- + +# Importing necessary libraries and modules + +import time + +import numpy as np +from IPython.display import display +from onnx import numpy_helper +from sklearn.datasets import fetch_openml +from sklearn.metrics import ( + accuracy_score, + average_precision_score, + f1_score, + precision_score, + recall_score, +) +from sklearn.model_selection import RepeatedKFold +from sklearn.preprocessing import LabelBinarizer, OrdinalEncoder + +from concrete.ml.sklearn import DecisionTreeClassifier, RandomForestClassifier, XGBClassifier + + +def basic_preprocessing(df, target_column): + """ + Convert categorical columns to their corresponding code values + and binarize the target column. + + Parameters: + df (pandas.DataFrame): Input dataframe to preprocess. + target_column (str): Name of the target column to be binarized. + + Returns: + pandas.DataFrame: Preprocessed dataframe. + """ + + for col in df.columns: + if df[col].dtype == "object": + df[col] = df[col].astype("category") + df[col] = df[col].cat.codes + elif df[col].dtype == "category": + df[col] = df[col].cat.codes + df[target_column] = LabelBinarizer().fit_transform(df[target_column]) + + return df + +# Set up dataset names and their respective IDs for fetching from OpenML +dataset_names = { + "spambase": 44, + "wine": None, + "heart-h": 1565, + "wdbc": 1510, + "adult": None, + "steel": 1504, +} + +datasets = {} + + +def load_dataset(name, data_id=None): + """Load dataset from OpenML by name or by ID. + + Args: + name (str): Name of the dataset. 
+ data_id (int, optional): The ID of the dataset on OpenML. + If provided, the dataset is loaded by ID. + + Returns: + X (np.array): Features of the dataset. + y (np.array): Target labels of the dataset. + """ + if data_id is not None: + X, y = fetch_openml(data_id=data_id, as_frame=False, cache=True, return_X_y=True) + else: + X, y = fetch_openml(name=name, as_frame=False, cache=True, return_X_y=True) + return X, y + + +for ds_name, ds_id in dataset_names.items(): + print(f"Loading {ds_name}") + + X, y = load_dataset(ds_name, ds_id) + + # Remove rows with NaN values + not_nan_idx = np.where(~np.isnan(X).any(axis=1)) + X = X[not_nan_idx] + y = y[not_nan_idx] + + # Convert non-integer target labels to integers + if not y.dtype == np.int64: + encoder = OrdinalEncoder() + y = encoder.fit_transform(y.reshape(-1, 1)).astype(np.int32).squeeze() + + datasets[ds_name] = {"X": X, "y": y} + +# Setting a random seed for reproducibility across all models and operations +random_seed = 42 + +# Models with their hyper-parameters +model_hyperparameters = { + DecisionTreeClassifier: {"max_depth": 5, "random_state": random_seed}, + XGBClassifier: {"max_depth": 3, "n_estimators": 50, "random_state": random_seed}, + RandomForestClassifier: {"n_estimators": 50, "random_state": random_seed}, +} + +decision_tree_comparison_params = { + "spam": {"max_leaf_nodes": 58, "max_depth": 17}, + "heart-h": {"max_leaf_nodes": 5, "max_depth": 3}, + "steel": {"max_leaf_nodes": None, "max_depth": 5}, + "wdbc": {"max_leaf_nodes": None, "max_depth": 10}, +} + +# List of bit-width used for quantization +n_bits_list = list(range(1, 10)) + +def analyze_gemm_computation(concrete_classifier): + """Analyze the GEMM (General Matrix Multiply) operations in the given ONNX model. + + Args: + concrete_classifier (object): Classifier that contains an ONNX model representation. + x_train (np.array): Training dataset. + + Returns: + tuple: Shapes of the matrices involved in GEMM operations. 
+ """ + + # Extract weights and biases from the ONNX model graph + quant_params = { + onnx_init.name: numpy_helper.to_array(onnx_init) + for onnx_init in concrete_classifier.onnx_model.graph.initializer + if "weight" in onnx_init.name or "bias" in onnx_init.name + } + + # Extract the shapes of matrices used in GEMM operations + matrix_shapes = [] + for i in range(1, 4): + key = [key for key in quant_params.keys() if f"_{i}" in key and "weight" in key][0] + matrix_shapes.append(quant_params[key].shape) + + return tuple(matrix_shapes) + +def benchmark_model(X, y, model, model_params, n_bits, rkf): + """Benchmark a given model and return its evaluation scores.""" + scores = { + "precision": [], + "recall": [], + "accuracy": [], + "f1": [], + "average_precision": [], + "nodes": None, + } + scores_fp32 = {"precision": [], "recall": [], "accuracy": [], "f1": [], "average_precision": []} + + metric_func_to_key = { + "precision_score": "precision", + "recall_score": "recall", + "f1_score": "f1", + "average_precision_score": "average_precision", + } + + for train_index, test_index in rkf.split(X): + X_train, X_test = X[train_index], X[test_index] + y_train, y_test = y[train_index], y[test_index] + + concrete_model, sklearn_model = model(n_bits=n_bits, **model_params).fit_benchmark( + X_train, y_train + ) + + y_pred = concrete_model.predict(X_test) + if len(set(y_test)) == 2: + for metric_func in [precision_score, recall_score, average_precision_score, f1_score]: + scores_key = metric_func_to_key[metric_func.__name__] + scores[scores_key].append(metric_func(y_test, y_pred)) + scores["accuracy"].append(accuracy_score(y_test, y_pred)) + + y_pred_fp32 = sklearn_model.predict(X_test) + if len(set(y_test)) == 2: + for metric_func in [precision_score, recall_score, average_precision_score, f1_score]: + scores_key = metric_func_to_key[metric_func.__name__] + scores_fp32[scores_key].append(metric_func(y_test, y_pred_fp32)) + scores_fp32["accuracy"].append(accuracy_score(y_test, 
y_pred_fp32)) + + shapes = analyze_gemm_computation(concrete_model) + scores["nodes"] = shapes[0][0] + + # Calculate inference time + concrete_model.compile(X_train) + concrete_model.fhe_circuit.keygen(force=False) + + start = time.time() + concrete_model.predict(X_test[:1], fhe="execute") + end = time.time() + scores["inference_time"] = end - start + + start = time.time() + concrete_model.predict(X_test[:1]) + end = time.time() + scores_fp32["inference_time"] = end - start + + return scores, scores_fp32 + + +n_bits = 6 +scores_global = {} + +rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=0) + +for dataset_name, dataset_data in datasets.items(): + X, y = dataset_data["X"].astype(np.float32), dataset_data["y"] + assert len(set(y)) >= 2 + if y.dtype not in [np.int32, bool]: + print(f"Unexpected datatype for y in dataset {dataset_name}: {y.dtype}") + + key_dataset = f"{dataset_name} (#features: {X.shape[1]})" + scores_global[key_dataset] = {} + + for cls, model_params in model_hyperparameters.items(): + scores, scores_fp32 = benchmark_model(X, y, cls, model_params, n_bits, rkf) + + scores_global[key_dataset][cls.__name__ + "_concrete"] = scores + scores_global[key_dataset][cls.__name__ + "_fp32"] = scores_fp32 + + print( + f"{cls.__name__} on {key_dataset} -> Acc: {np.mean(scores['accuracy']):.4f}, " + f"Acc (fp32): {np.mean(scores_fp32['accuracy']):.4f}, " + f"FHE inference time: {scores['inference_time']:.2f}s" + ) + +import math + +import pandas as pd + +df = pd.DataFrame.from_dict( + {(i, j): value for i, scores in scores_global.items() for j, value in scores.items()}, + orient="index", +) + + +df["FHE/Clear ratio"] = (df["inference_time"] / df["inference_time"].shift(-1)).apply( + lambda x: "" if (x < 1) or (math.isnan(x)) else str(int(round(x, 0))) + "x" +) + + +def format_scores(val): + if isinstance(val, list): + if not val: + return "-" + return f"{np.mean(val) * 100:.1f}\\% ± {np.std(val) * 100:.1f}\\%" + + if pd.isna(val): + return "-" + + if 
isinstance(val, (float, int)): + # To ensure all floating point values are treated as percentages + return f"{val:.3f}" + + if "x" in str(val): # Ensure that val is treated as a string + return val + + return "-" + + +df = df.applymap(format_scores) + +# Renaming for display +model_names = { + "DecisionTreeClassifier_concrete": "FHE-DT", + "DecisionTreeClassifier_fp32": "FP32-DT", + "XGBClassifier_concrete": "FHE-XGB", + "XGBClassifier_fp32": "FP32-XGB", + "RandomForestClassifier_concrete": "FHE-RF", + "RandomForestClassifier_fp32": "FP32-RF", +} + +for original, renamed in model_names.items(): + df.index = df.index.set_levels(df.index.levels[1].str.replace(original, renamed), level=1) + +df.columns = df.columns.str.replace("average_precision", "AP") + +# Reordering Columns +columns_order = [col for col in df if col not in ["FHE/Clear ratio", "inference_time"]] + [ + "inference_time", + "FHE/Clear ratio", +] +df = df[columns_order] + +# Drop and rename columns +df.columns = df.columns.str.replace("inference_time", "Time (s)") +df.drop(columns=["precision", "recall"], inplace=True) + +# Adjust LaTeX output +latex_code = df.to_latex(multirow=True, escape=False, column_format="l|l|l|l|l|l|l|l") + +latex_code = latex_code.replace("#", "\\#") +display(df) + +def evaluate_model(X, y, model, rkf): + """Evaluate a given model and return its scores.""" + scores = {"precision": [], "recall": [], "accuracy": [], "f1": [], "average_precision": []} + scores_fp32 = {"precision": [], "recall": [], "accuracy": [], "f1": [], "average_precision": []} + + metric_func_to_key = { + "precision_score": "precision", + "recall_score": "recall", + "f1_score": "f1", + "average_precision_score": "average_precision", + } + + for train_index, test_index in rkf.split(X): + X_train, X_test = X[train_index], X[test_index] + y_train, y_test = y[train_index], y[test_index] + + concrete_model, sklearn_model = model.fit_benchmark(X_train, y_train) + + for model_instance, score_dict in 
[(concrete_model, scores), (sklearn_model, scores_fp32)]: + y_pred = model_instance.predict(X_test) + for metric_func in [precision_score, recall_score, average_precision_score, f1_score]: + score_key = metric_func_to_key[metric_func.__name__] + score_dict[score_key].append(metric_func(y_test, y_pred)) + score_dict["accuracy"].append(accuracy_score(y_test, y_pred)) + + return scores, scores_fp32 + + +rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=0) +X, y = datasets["spambase"]["X"].astype(np.float32), datasets["spambase"]["y"] +assert len(set(y)) == 2 +if y.dtype not in [np.int32, bool]: + print(f"Unexpected datatype for y in dataset spambase: {y.dtype}") + +scores_global = {} + +for n_bits in n_bits_list: + scores_global[n_bits] = {} + + for model_cls, params in model_hyperparameters.items(): + model_instance = model_cls(n_bits=n_bits, **params) + scores, scores_fp32 = evaluate_model(X, y, model_instance, rkf) + + model_name = model_cls.__name__ + scores_global[n_bits][model_name + "_concrete"] = scores + scores_global[n_bits][model_name + "_fp32"] = scores_fp32 + + print(f"{model_name} with {n_bits}-bits:") + print("Average precision:", np.mean(scores["average_precision"])) + print("Average precision (fp32):", np.mean(scores_fp32["average_precision"])) + +import matplotlib.pyplot as plt +from tqdm import tqdm + + +def evaluate_model_on_error_rates(X_train, X_test, y_test, concrete_model, p_error_list): + """Evaluate the concrete model on different error rates and return accuracy and time taken.""" + acc_scores = [] + time_scores = [] + real_p_error_list = [] + + for p_error in tqdm(p_error_list): + concrete_model.compile(X_train, p_error=p_error) + real_p_error_list.append(concrete_model.fhe_circuit.p_error) + concrete_model.fhe_circuit.keygen(force=False) + + start_time = time.time() + y_pred = concrete_model.predict(X_test, fhe="execute") + end_time = time.time() + + acc_scores.append(accuracy_score(y_pred, y_test)) + time_scores.append(end_time - 
start_time) + + return acc_scores, time_scores, real_p_error_list + + +plt.rcParams.update({"font.size": 16}) +n_bits = 6 +p_error_list = [2e-40, 1e-6, 1e-5, 1e-4, 0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95] +X, y = datasets["spambase"]["X"].astype(np.float32), datasets["spambase"]["y"] + +clf = DecisionTreeClassifier(n_bits=n_bits, **model_hyperparameters[DecisionTreeClassifier]) +rkf = RepeatedKFold(n_splits=20, n_repeats=3, random_state=0) + +for train_index, test_index in rkf.split(X): + X_train, X_test = X[train_index], X[test_index] + y_train, y_test = y[train_index], y[test_index] + + concrete_model, _ = clf.fit_benchmark(X_train, y_train) + + # Calculating num_nodes using analyze_gemm_computation function + shapes = analyze_gemm_computation(concrete_model) + num_nodes = shapes[0][0] + + acc_scores, time_p_error, real_p_error_list = evaluate_model_on_error_rates( + X_train, X_test, y_test, concrete_model, p_error_list + ) + break + +def plot_metrics_vs_error_rates( + metric_values, model_name, num_nodes, xlabel, ylabel, filename, red_line_value +): + """Plot the metrics against error rates.""" + plt.figure() + plt.plot( + [real_p_error_list[0], real_p_error_list[-1]], + [red_line_value, red_line_value], + color="red", + linewidth=2, + label="p_error=2E-40", + ) + plt.plot(real_p_error_list, metric_values, color="blue", linewidth=2, marker="x") + plt.grid(True) + plt.legend() + plt.title(f"{model_name} {num_nodes} nodes") + plt.xlabel(xlabel) + plt.ylabel(ylabel) + plt.semilogx() + plt.xticks(10.0 ** np.arange(-6, 1)) + plt.savefig(filename, bbox_inches="tight", dpi=300) + plt.show() + + +# Plotting accuracy vs error rates +plot_metrics_vs_error_rates( + acc_scores, + "DecisionTreeClassifier", + num_nodes, + "$p_{error}$", + "Metric", + "DecisionTreeClassifier" + "acc_p_error.eps", + 0.91, +) + +# Plotting execution time per data point vs error rates +plot_metrics_vs_error_rates( + np.asarray(time_p_error) / X_test.shape[0], + 
"DecisionTreeClassifier", + num_nodes, + "$p_{error}$", + "Execution time", + "DecisionTreeClassifier" + "speed_p_error.eps", + 1.807, +) + +# Plot the metrics vs n_bits for each model +plt.rcParams.update({"font.size": 16}) +for cls in model_hyperparameters: + plt.figure() + + f1_scores = [] + f1_scores_fp32 = [] + + average_precision_scores = [] + average_precision_scores_fp32 = [] + + for n_bits in n_bits_list: + average_precision_scores.append( + np.mean(scores_global[n_bits][cls.__name__ + "_concrete"]["average_precision"]) + ) + average_precision_scores_fp32.append( + np.mean(scores_global[n_bits][cls.__name__ + "_fp32"]["average_precision"]) + ) + + f1_scores.append(np.mean(scores_global[n_bits][cls.__name__ + "_concrete"]["f1"])) + f1_scores_fp32.append(np.mean(scores_global[n_bits][cls.__name__ + "_fp32"]["f1"])) + + # plt.legend() + ap_relative = np.array(average_precision_scores) / average_precision_scores_fp32 + f1_relative = np.array(f1_scores) / f1_scores_fp32 + print(f"ap relative: {ap_relative}, f1_relative: {f1_relative}") + plt.plot( + n_bits_list, + average_precision_scores, + label="concrete_average_precision", + color="blue", + linewidth=2, + ) + plt.plot( + n_bits_list, + average_precision_scores_fp32, + label="fp32_average_precision", + color="blue", + linewidth=2, + linestyle="dashed", + ) + + plt.plot(n_bits_list, f1_scores, label="concrete_f1", linewidth=2, color="red") + plt.plot( + n_bits_list, f1_scores_fp32, label="fp32_f1", color="red", linewidth=2, linestyle="dashed" + ) + + plt.grid(True) + plt.xlim([1, 9]) + plt.ylim([0, 1]) + plt.xticks(np.arange(1, 10)) + plt.legend() + + plt.title(cls.__name__) + plt.xlabel("Bitwidth") + plt.ylabel("Metric") + # Save the figure + plt.savefig(cls.__name__ + ".eps", bbox_inches="tight", dpi=300) + + plt.show() + +def predict_with_fhe(clf, X_sample): + """Predict using FHE and return elapsed time.""" + print("Compiling and keygen...") + clf.compile(X_sample[:100]) + 
clf.fhe_circuit.keygen(force=False) + + print("Predict in FHE") + start_time = time.time() + _ = clf.predict(X_sample[:1], fhe="execute") + end_time = time.time() + + return end_time - start_time + + +def analyze_and_store(clf, X_sample, nodes_dict, scores_dict): + """Analyze the model and store results.""" + elapsed_time = predict_with_fhe(clf, X_sample) + + model_name = clf.__class__.__name__ + if model_name not in nodes_dict: + nodes_dict[model_name] = [] + scores_dict[model_name] = [] + + scores_dict[model_name].append(elapsed_time) + + shapes = analyze_gemm_computation(clf) + nodes_dict[model_name].append(shapes[0][0]) + + print(clf.n_bits) + print(scores_dict[model_name][-1]) + print(nodes_dict[model_name][-1]) + + +X, y = datasets["spambase"]["X"], datasets["spambase"]["y"] +nodes_dict = {} +scores_dict = {} + +for model_name, hyperparameters in model_hyperparameters.items(): + for n_bits in n_bits_list: + clf = model_name(n_bits=n_bits, **hyperparameters) + clf.fit(X, y) + + if n_bits < 9: + analyze_and_store(clf, X, nodes_dict, scores_dict) + +def plot_fhe_inference_time(n_bits_list, scores, model_hyperparameters): + """Plot the FHE inference time against bitwidth for each model.""" + + # Calculate average inference time per node for each bitwidth + n_bits_timings = np.zeros((8,)) + for model in model_hyperparameters: + for idx, n_bits in enumerate(n_bits_list): + if n_bits < 9: + n_bits_timings[idx] += ( + scores[model.__name__][idx] / nodes_dict[model.__name__][idx] * 1000 + ) + n_bits_timings /= len(model_hyperparameters) + + # Plot setup + plt.figure(figsize=(10, 6)) + plt.rcParams.update({"font.size": 16}) + + plt.plot( + range(1, 9), + n_bits_timings, + label="FHE Inference Time", + color="blue", + linewidth=2, + marker="o", + ) + + plt.xlabel("Bitwidth") + plt.ylabel("Time (ms)") + plt.grid(True, which="both") + plt.semilogy() + plt.ylim([0, 1000]) + plt.xlim([0.5, 8.5]) + plt.xticks(np.arange(1, 9)) + plt.title("FHE Execution vs Precision", pad=10) 
+ + plt.savefig("fhe_inference_time.eps", bbox_inches="tight", dpi=300) + plt.show() + + +plot_fhe_inference_time(n_bits_list, scores_dict, model_hyperparameters) + + + +# Code from: ./SVMClassifier.ipynb +-------------------------------------------------------------------------------- + +# display visualizations and plots in the notebook itself +%matplotlib inline + +# import numpy and matplotlib +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from sklearn.decomposition import PCA +from sklearn.metrics import accuracy_score, f1_score, make_scorer +from sklearn.model_selection import GridSearchCV, train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.svm import LinearSVC as SklearnLinearSVC + +# import the concrete-ml LinearSVC implementation +from concrete.ml.sklearn.svm import LinearSVC as ConcreteLinearSVC + +def plot_decision_boundary( + clf, + X, + y, + title="LinearSVC Decision Boundary", + xlabel="First Principal Component", + ylabel="Second Principal Component", +): + # Perform PCA to reduce the dimensionality to 2 + pca = PCA(n_components=2) + X_pca = pca.fit_transform(X) + + # Create the mesh grid + x_min, x_max = X_pca[:, 0].min() - 1, X_pca[:, 0].max() + 1 + y_min, y_max = X_pca[:, 1].min() - 1, X_pca[:, 1].max() + 1 + xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02)) + + # Transform the mesh grid points back to the original feature space + mesh_points = pca.inverse_transform(np.c_[xx.ravel(), yy.ravel()]) + + # Make predictions using the classifier + Z = clf.predict(mesh_points) + Z = Z.reshape(xx.shape) + + # Plot the decision boundary + _, ax = plt.subplots() + ax.contourf(xx, yy, Z, alpha=0.8) + ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y, edgecolors="k", marker="o", s=50) + + # Calculate the accuracy + accuracy = accuracy_score(y, clf.predict(X)) + + plt.xlabel(xlabel) + plt.ylabel(ylabel) + plt.title(f"{title} (Accuracy: {accuracy:.4f})") + plt.show() + +# Get the 
data +df = pd.read_csv( + "https://gist.githubusercontent.com/robinstraub/72f1cb27829dba85f49f68210979f561/" + "raw/b9982ae654967028f6f4010bd235d850d38fe25b/pulsar-star-dataset.csv" +) +df.head() + +# Extract the features and labels +X = df.drop(columns=["target_class"]) +y = df["target_class"] + +# Replace N/A values with the mean of the respective feature +X.fillna(X.mean(), inplace=True) + +# Split the data into train and test sets +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + +# Scale the data +scaler = StandardScaler() +X_train = scaler.fit_transform(X_train) +X_test = scaler.transform(X_test) + +# Convert the floating labels to integer labels for both train and test sets +y_train = y_train.astype(int) +y_test = y_test.astype(int) + +# Train a model with scikit-learn LinearSVC, perform prediction and compute the accuracy +svm_sklearn = SklearnLinearSVC(max_iter=100) +svm_sklearn.fit(X_train, y_train) +# plot the boundary +plot_decision_boundary(svm_sklearn, X_test, y_test) + +# Perform the same steps with the Concrete-ML LinearSVC implementation +svm_concrete = ConcreteLinearSVC(max_iter=100, n_bits=8) +svm_concrete.fit(X_train, y_train) +# plot the boundary +plot_decision_boundary(svm_concrete, X_test, y_test) + +# A circuit needs to be compiled to enable FHE execution +circuit = svm_concrete.compile(X_train) +# Now that a circuit is compiled, the svm_concrete can predict value with FHE +y_pred = svm_concrete.predict(X_test, fhe="execute") +accuracy = accuracy_score(y_test, y_pred) +# print the accuracy +print(f"FHE Accuracy: {accuracy:.4f} (bit-width: {circuit.graph.maximum_integer_bit_width()})") + +# setup and train a scikit-learn LinearSVC model, just as before +svm_sklearn = SklearnLinearSVC() +svm_sklearn.fit(X_train, y_train) +# predict some test data and measure the model accuracy +y_pred_sklearn = svm_sklearn.predict(X_test) +accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn) + 
+print(f"Scikit-learn Accuracy: {accuracy_sklearn:.4f}") + +svm = ConcreteLinearSVC() + +# Define the parameter grid for the grid search +param_grid = param_grid = [ + { + "C": np.logspace(-3, 3, 7), + "n_bits": range(2, 17), + "penalty": ["l1", "l2"], + "dual": [False, True], + }, +] + +# Use the F1 score as the metric to optimize, as it provides a +# balanced trade-off between precision and recall +scorer = make_scorer(f1_score, average="weighted") + +# Set up the grid search with the custom scoring function +grid_search = GridSearchCV(svm, param_grid, scoring=scorer, cv=5, n_jobs=1) + +# Fit the grid search to the data +grid_search.fit(X_train, y_train) + +# Convert the grid search results into a pandas DataFrame +results_df = pd.DataFrame(grid_search.cv_results_) + +# Define a custom function to highlight a specific row based on n_bits value + + +def highlight_row(row, n_bits_value=3, color="green"): + return [ + f"background-color: {color}" if row["param_n_bits"] == n_bits_value else "" for _ in row + ] + + +# Find the best hyperparameter combination for each n_bits value +best_results = results_df.loc[results_df.groupby("param_n_bits")["mean_test_score"].idxmax()] +best_results = best_results[ + ["param_n_bits", "param_C", "param_penalty", "param_dual", "mean_test_score"] +] +best_results.reset_index(drop=True, inplace=True) + +# Display the best results DataFrame +best_results.style.apply(highlight_row, n_bits_value=3, axis=1).hide() + +svm_concrete = ConcreteLinearSVC(n_bits=3, C=1, dual=False, penalty="l1") +svm_concrete.fit(X_train, y_train) + +# compile the model +circuit = svm_concrete.compile(X_train) + +# the model can now be executed with FHE +y_pred = svm_concrete.predict(X_test, fhe="simulate") +accuracy = accuracy_score(y_test, y_pred) +print(f"Accuracy with FHE simulation: {accuracy:.4f}") + +# predict the test set to verify the compiled model accuracy +y_pred = svm_concrete.predict(X_test, fhe="execute") +accuracy = accuracy_score(y_test, 
y_pred) +print(f"Accuracy with FHE execution: {accuracy:.4f}") + + + +# Code from: ./LinearSVR.ipynb +-------------------------------------------------------------------------------- + +import time + +import numpy as np +import pandas as pd +from sklearn.datasets import load_diabetes +from sklearn.metrics import make_scorer, mean_squared_error +from sklearn.model_selection import GridSearchCV, KFold, train_test_split +from sklearn.svm import LinearSVR as SklearnLinearSVR + +from concrete.ml.sklearn.svm import LinearSVR as ConcreteLinearSVR + +%matplotlib inline + +import matplotlib.pyplot as plt +from IPython.display import display + +train_plot_config = {"c": "black", "marker": "D", "s": 15, "label": "Train data"} +test_plot_config = {"c": "red", "marker": "x", "s": 15, "label": "Test data"} + + +def get_sklearn_plot_config(mse_score=None): + label = "scikit-learn" + if mse_score is not None: + label += f", {'$MSE$'}={mse_score:.4f}" + return {"c": "blue", "linewidth": 2.5, "label": label} + + +def get_concrete_plot_config(mse_score=None): + label = "Concrete-ML" + if mse_score is not None: + label += f", {'$MSE$'}={mse_score:.4f}" + return {"c": "orange", "linewidth": 2.5, "label": label} + +# Load the diabetes data-set +X, y = load_diabetes(return_X_y=True) +# Use only one feature for educational purpose +X = X[:, np.newaxis, 2] + +# We split the data-set into a training and a testing set +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=23) + +# We sort the test set for a better visualization +sorted_indexes = np.argsort(np.squeeze(X_test)) +X_test = X_test[sorted_indexes, :] +y_test = y_test[sorted_indexes] + +plt.ioff() + +plt.clf() +fig, ax = plt.subplots(1, figsize=(10, 5)) +fig.patch.set_facecolor("white") +ax.scatter(X_train, y_train, **train_plot_config) +ax.scatter(X_test, y_test, **test_plot_config) +ax.legend() +display(fig) + +grid_scorer = make_scorer(mean_squared_error, greater_is_better=False) + +param_grid = { 
+ "epsilon": [0.0, 1.0, 10.0, 20.0], + "C": [0.1, 100.0, 10000.0, 100000.0], +} + +sklearn_rgs = SklearnLinearSVR() +kfold_cv = KFold(n_splits=5, shuffle=True, random_state=13) + +gs_sklearn = GridSearchCV( + sklearn_rgs, + param_grid, + cv=kfold_cv, + scoring=grid_scorer, + verbose=1, +).fit(X_train, y_train) + +param_grid = { + "n_bits": [6, 8, 12], + "epsilon": [0.0, 1.0, 10.0, 20.0], + "C": [0.1, 100.0, 10000.0, 100000.0], +} + +concrete_rgs = ConcreteLinearSVR() + +gs_concrete = GridSearchCV( + concrete_rgs, + param_grid, + cv=kfold_cv, + scoring=grid_scorer, + verbose=1, +).fit(X_train, y_train) + +plt.ioff() + +results_df = pd.DataFrame(gs_concrete.cv_results_) + +fig, ax = plt.subplots(1, figsize=(12, 8)) +(l1,) = ax.plot( + np.arange(16), -results_df.loc[results_df["param_n_bits"] == 6, "mean_test_score"], "-o" +) +(l2,) = ax.plot( + np.arange(16), -results_df.loc[results_df["param_n_bits"] == 8, "mean_test_score"], "-o" +) +(l3,) = ax.plot( + np.arange(16), -results_df.loc[results_df["param_n_bits"] == 12, "mean_test_score"], "-o" +) +ax.legend((l1, l2, l3), ("n_bits = 6", "n_bits = 8", "n_bits = 12"), loc="upper right", shadow=True) +ax.set_xlabel("Different models with fixed values of C and epsilon") +ax.set_ylabel("Mean MSE accros CV folds") +ax.set_title("Impact of `n_bits` on Cross Validation performances") +display(fig) + +# Print mean time fit and std time fit for both models +print( + f"Mean time fit sklearn: {np.mean(gs_sklearn.cv_results_['mean_fit_time']):.3f}s," + f" std time fit sklearn: {np.std(gs_sklearn.cv_results_['mean_fit_time']):.3f}s" +) +print( + f"Mean time fit concrete: {np.mean(gs_concrete.cv_results_['mean_fit_time']):.3f}s," + f"std time fit concrete: {np.std(gs_concrete.cv_results_['mean_fit_time']):.3f}s" +) + +# Print best score for both models +print(f"Best MSE score sklearn: {-gs_sklearn.best_score_:.2f}") +print(f"Best MSE score concrete: {-gs_concrete.best_score_:.2f}") + +# Get best hyperparameters out of gs_concrete 
+best_params_concrete = gs_concrete.best_params_ +print(f"Best parameters for Concrete: {best_params_concrete}") +best_params_sklearn = gs_sklearn.best_params_ +print(f"Best parameters for Sklearn: {best_params_sklearn}") + +# Train concrete and sklearn LinearSVR with best hyper parameters +concrete_rgs = ConcreteLinearSVR(**best_params_concrete) + +concrete_rgs, sklearn_rgs = concrete_rgs.fit_benchmark(X_train, y_train) + +# Compile the model using the training data +circuit = concrete_rgs.compile(X_train) + +# Generate the key +print(f"Generating a key for an {circuit.graph.maximum_integer_bit_width()}-bit circuit") + +time_begin = time.time() +circuit.client.keygen(force=False) +print(f"Key generation time: {time.time() - time_begin:.2f} seconds") + +# Now predict using the FHE-quantized model on the testing set +time_begin = time.time() +y_pred_fhe = concrete_rgs.predict(X_test, fhe="execute") +print(f"Execution time: {(time.time() - time_begin) / len(X_test):.4f} seconds per sample") + +# Now predict using the Sklearn model on the testing set +time_begin = time.time() +y_pred_sklearn = sklearn_rgs.predict(X_test) +print(f"Execution time: {(time.time() - time_begin) / len(X_test):.4f} seconds per sample") + +# Now predict using clear quantized Concrete-ML model on testing set +time_begin = time.time() +y_preds_quantized = concrete_rgs.predict(X_test) +print(f"Execution time: {(time.time() - time_begin) / len(X_test):.4f} seconds per sample") + +# Print all MSE a string to explain + +mse_sklearn = mean_squared_error(y_test, y_pred_sklearn) +mse_clear = mean_squared_error(y_test, y_preds_quantized) +mse_fhe = mean_squared_error(y_test, y_pred_fhe) + +print( + f"Clear FP32 sklearn model MSE: {mse_sklearn:.3f}\n" + f"Clear quantized model MSE: {mse_clear:.3f}\n" + f"FHE model MSE: {mse_fhe:.3f}" +) + +# Measure the error of the FHE-quantized model with respect to quantized clear Concrete ML model +concrete_score_difference = abs(mse_fhe - mse_clear) * 100 / 
mse_clear +print( + "\nRelative difference between Concrete-ml (quantized clear) and Concrete-ml (FHE) scores:", + f"{concrete_score_difference:.2f}%", +) + + +# Measure the error of the FHE quantized model with respect to the sklearn float model +score_difference = abs(mse_fhe - mse_sklearn) * 100 / mse_sklearn +print( + "Relative difference between scikit-learn (clear) and Concrete-ml (FHE) scores:", + f"{score_difference:.2f}%", +) + +# We densify the space representation of the original X, +# to better visualize the resulting step function in the following figure +x_space = np.linspace(X_test.min(), X_test.max(), num=300) +x_space = x_space[:, np.newaxis] +y_pred_q_space = concrete_rgs.predict(x_space) + +plt.ioff() + +plt.clf() +fig, ax = plt.subplots(1, figsize=(12, 8)) +fig.patch.set_facecolor("white") +ax.scatter(X_train, y_train, **train_plot_config) +ax.scatter(X_test, y_test, **test_plot_config) +ax.plot(X_test, y_pred_sklearn, **get_sklearn_plot_config(mse_sklearn)) +ax.plot(x_space, y_pred_q_space, **get_concrete_plot_config(mse_clear)) +ax.legend() +display(fig) + + + +# Code from: ./LogisticRegressionTraining.ipynb +-------------------------------------------------------------------------------- + +%matplotlib inline +# Import dataset libraries and util functions +from pathlib import Path +from tempfile import TemporaryDirectory + +import matplotlib.pyplot as plt +import numpy as np +from concrete.compiler import check_gpu_available +from matplotlib.colors import ListedColormap +from matplotlib.lines import Line2D +from sklearn import datasets +from sklearn.linear_model import SGDClassifier as SklearnSGDClassifier +from sklearn.metrics import accuracy_score +from sklearn.preprocessing import MinMaxScaler + +from concrete import fhe +from concrete.ml.deployment import FHEModelClient, FHEModelDev, FHEModelServer +from concrete.ml.sklearn import SGDClassifier + +use_gpu_if_available = False +device = "cuda" if use_gpu_if_available and 
check_gpu_available() else "cpu" + + +def plot_decision_boundary( + X, y, clf=None, weights=None, bias=None, title="Decision Boundary", accuracy=None +): + # Create a mesh to plot the decision boundaries + x_min, x_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1 + y_min, y_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1 + xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01)) + + if clf is not None: + # Predictions to get the decision boundary + Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) + Z = Z.reshape(xx.shape) + learned_weights = ( + f"Learned weights: " + f"{clf.coef_[0][0]:.3f}, " + f"{clf.coef_[0][1]:.3f}, " + f"{clf.intercept_.reshape((-1,))[0]:.3f}" + ) + elif weights is not None and bias is not None: + # Compute the linear model for the mesh grid + linear_model = np.dot(np.c_[xx.ravel(), yy.ravel()], weights[0]) + bias[0] + Z = np.round(1 / (1 + np.exp(-linear_model))) + Z = Z.reshape(xx.shape) + learned_weights = "" + else: + raise ValueError("Either 'clf' or both 'weights' and 'bias' must be provided.") + + # Define red and blue color map + cm_bright = ListedColormap(["#FF0000", "#0000FF"]) + + # Plotting the results + plt.figure(figsize=(10, 6)) + plt.contourf(xx, yy, Z, alpha=0.3, cmap=cm_bright) + plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor="k", cmap=cm_bright) + plt.title(f"{title} (Accuracy: {accuracy})\n {learned_weights}") + plt.xlabel("Feature 1") + plt.ylabel("Feature 2") + + # Create a custom legend + legend_elements = [ + Line2D( + [0], + [0], + marker="o", + color="w", + label="Class 0", + markerfacecolor="#FF0000", + markersize=10, + ), + Line2D( + [0], + [0], + marker="o", + color="w", + label="Class 1", + markerfacecolor="#0000FF", + markersize=10, + ), + ] + plt.legend(handles=legend_elements, loc="upper right") + + plt.show() + + +# Load the Iris dataset +X_full, y_full = datasets.load_iris(return_X_y=True) +X_full = MinMaxScaler(feature_range=[-1, 1]).fit_transform(X_full) + +# Select petal length and 
petal width for visualization +X = X_full[:, 2:4] # Petal length and petal width + +# Filter the dataset for binary classification (Versicolor and Virginica) +# These correspond to target labels 1 and 2 in the Iris dataset +binary_filter = (y_full == 1) | (y_full == 2) +X_binary = X[binary_filter] +X_full_binary = X_full[binary_filter] +y_binary = y_full[binary_filter] - 1 + +# Train an SGDClassifier on the binary dataset +N_ITERATIONS = 15 +RANDOM_STATE = 42 + +np.random.seed(RANDOM_STATE) + +model_binary_sklearn = SklearnSGDClassifier(random_state=RANDOM_STATE, max_iter=N_ITERATIONS) + +model_binary_sklearn.fit(X_binary, y_binary) + +y_pred_binary_sklearn = model_binary_sklearn.predict(X_binary) + +accuracy_binary_sklearn = accuracy_score(y_binary, y_pred_binary_sklearn) + +plot_decision_boundary( + X_binary, + y_binary, + clf=model_binary_sklearn, + accuracy=accuracy_binary_sklearn, + title="Scikit-Learn decision boundary", +) + +parameters_range = (-1.0, 1.0) + +model_binary_fhe = SGDClassifier( + random_state=RANDOM_STATE, + max_iter=N_ITERATIONS, + fit_encrypted=True, + parameters_range=parameters_range, + verbose=True, +) + +# Fit on encrypted data +model_binary_fhe.fit(X_binary, y_binary, fhe="execute", device=device) + +# The weights are decrypted at the end of the `fit` call. 
Use the clear weights here +# to evaluate accuracy on clear data +y_pred_binary = model_binary_fhe.predict(X_binary) + +model_binary_fhe.compile(X_binary) + +# Evaluate the decrypted weights on encrypted data +y_pred_binary_fhe = model_binary_fhe.predict(X_binary, fhe="execute") + +# Check that the same result is obtained when applying +# the decrypted model on clear data and on encrypted data +# Linear classifiers are 100% correct on encrypted data compared to execution on clear data +assert np.all(y_pred_binary == y_pred_binary_fhe) + +accuracy_binary_fhe = accuracy_score(y_binary, y_pred_binary_fhe) + +plot_decision_boundary( + X_binary, + y_binary, + clf=model_binary_fhe, + accuracy=accuracy_binary_fhe, + title="Concrete ML (training on encrypted data with FHE) decision boundary", +) + +from sklearn.model_selection import train_test_split + +X, y = datasets.load_breast_cancer(return_X_y=True) +x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y) + +scaler = MinMaxScaler(feature_range=[-1, 1]) +x_train = scaler.fit_transform(x_train) +x_test = scaler.transform(x_test) + +rng = np.random.default_rng(RANDOM_STATE) +perm = rng.permutation(x_train.shape[0]) + +x_train = x_train[perm, ::] +y_train = y_train[perm] + +parameters_range = (-1.0, 1.0) + +model_sklearn = SklearnSGDClassifier( + random_state=RANDOM_STATE, + max_iter=N_ITERATIONS, +) + +model_sklearn.fit(x_train, y_train) + +y_pred_sklearn = model_sklearn.predict(x_test) + +accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn) + +print(f"Sklearn clear accuracy: {accuracy_sklearn*100:.2f}%") + +model_concrete = SGDClassifier( + random_state=RANDOM_STATE, + max_iter=N_ITERATIONS, + fit_encrypted=True, + parameters_range=parameters_range, +) + +# Train with simulation on the full dataset +model_concrete.fit(x_train, y_train, fhe="simulate") + +model_concrete.compile(x_train) + +# Measure accuracy on the test set using simulation +y_pred_fhe = model_concrete.predict(x_test, 
fhe="simulate") + +accuracy_fhe = accuracy_score(y_test, y_pred_fhe) +print(f"Full encrypted fit (simulated) accuracy: {accuracy_fhe*100:.2f}%") + +# To measure accuracy after every batch initialize the SGDClassifier with warm_start=True +# which keeps the weights obtained with previous batches + +model_concrete_partial = SGDClassifier( + random_state=RANDOM_STATE, + max_iter=N_ITERATIONS, + fit_encrypted=True, + parameters_range=parameters_range, + warm_start=True, +) + +batch_size = model_concrete_partial.batch_size + +classes = np.unique(y_train) + +# Go through the training batches +accuracy_scores = [] +for idx in range(x_train.shape[0] // batch_size): + batch_range = range(idx * batch_size, (idx + 1) * batch_size) + x_batch = x_train[batch_range, ::] + y_batch = y_train[batch_range] + + # Fit on a single batch with partial_fit + # Provide the list of all expected classes for the first iteration, as done in scikit-learn + if idx == 0: + model_concrete_partial.partial_fit(x_batch, y_batch, classes=classes, fhe="simulate") + else: + model_concrete_partial.partial_fit(x_batch, y_batch, fhe="simulate") + + model_concrete_partial.compile(x_train) + + # Measure accuracy of the model with FHE simulation + y_pred_partial_fhe = model_concrete_partial.predict(x_test, fhe="simulate") + + accuracy_partial = accuracy_score(y_test, y_pred_partial_fhe) + accuracy_scores.append(accuracy_partial) + +# Plot the evolution of accuracy throughout the training process +fig = plt.figure() +plt.plot(accuracy_scores) +plt.title(f"Accuracy evolution on breast-cancer. 
Final accuracy {accuracy_scores[-1]*100:.2f}%") +plt.xlabel("Batch number") +plt.ylabel("Accuracy") +plt.grid(True) +plt.show() + +# Initialize the model with parameters +parameters_range = (-1.0, 1.0) +batch_size = 8 + +sgd_clf_binary_fhe = SGDClassifier( + random_state=RANDOM_STATE, + max_iter=N_ITERATIONS, + fit_encrypted=True, + parameters_range=parameters_range, +) + +# Generate the min and max values for X_binary and y_binary +x_min, x_max = X_binary.min(axis=0), X_binary.max(axis=0) +y_min, y_max = y_binary.min(), y_binary.max() + +# Create a dataset with the min and max values for each feature, repeated to fill the batch size +x_compile_set = np.vstack([x_min, x_max] * (batch_size // 2)) + +# Create a dataset with the min and max values for y, repeated to fill the batch size +y_compile_set = np.array([y_min, y_max] * (batch_size // 2)) + +# Fit the model with the created dataset to compile it for production +# This step ensures the model knows the number of features, targets and features distribution + + +device = "cuda" if check_gpu_available() else "cpu" +sgd_clf_binary_fhe.fit(x_compile_set, y_compile_set, fhe="disable", device=device) + +# Define the directory where to save the deployment files +DEPLOYMENT_PATH = Path("fhe_training") +DEPLOYMENT_PATH.mkdir(exist_ok=True) + +deployment_dir = TemporaryDirectory(dir=str(DEPLOYMENT_PATH)) # pylint: disable=consider-using-with +deployment_path = Path(deployment_dir.name) + +# Save the training FHE circuit for production +fhe_dev = FHEModelDev(deployment_path, sgd_clf_binary_fhe) +fhe_dev.save(mode="training") + +# On the client side, load the circuit.zip with the information to create +# - the key +# - the pre and post processing functions + +fhe_client = FHEModelClient(deployment_path) +fhe_client.load() +serialized_evaluation_keys = fhe_client.get_serialized_evaluation_keys() + +# On the server side, we load the server.zip which contain the training model +fhe_server = FHEModelServer(deployment_path) 
+fhe_server.load() + +# Define utils function to evaluate the model + + +def model_inference(weights, bias, X): + # Compute the linear model + linear_model = np.dot(X, weights[0]) + bias[0] + + # Apply the sigmoid function + sigmoid = 1 / (1 + np.exp(-linear_model)) + + # Compute the prediction + prediction = np.round(sigmoid) + + return prediction + + +def compute_model_accuracy(weights, bias, X, y): + # Compute the prediction + prediction = model_inference(weights, bias, X).squeeze() + + # Compute the accuracy + return np.mean(prediction == y) + +batch_size = sgd_clf_binary_fhe.batch_size + +# Shuffle X_binary and y_binary +perm = np.random.permutation(X_binary.shape[0]) +X_binary = X_binary[perm, ::] +y_binary = y_binary[perm] + +# Initialize the weight and bias randomly +# They are going to be updated using FHE training. +weights = np.random.rand(1, X_binary.shape[1], 1) +bias = np.random.rand(1, 1, 1) + +# Plot the decision boundaries before starting +plot_decision_boundary( + X_binary, + y_binary, + weights=weights, + bias=bias, + title="Decision Boundary before training", + accuracy=compute_model_accuracy(weights, bias, X_binary, y_binary), +) + + +def quantize_encrypt_serialize_batches(fhe_client, x, y, weights, bias, batch_size): + x_batches_enc, y_batches_enc = [], [] + + for i in range(0, x.shape[0], batch_size): + + # Avoid the last batch if it's not a multiple of 'batch_size' + if i + batch_size < x.shape[0]: + batch_range = range(i, i + batch_size) + else: + break + + # Make the data X (1, batch_size, n_features) and y (1, batch_size, n_targets=1) + x_batch = np.expand_dims(x[batch_range, :], 0) + y_batch = np.expand_dims(y[batch_range], (0, 2)) + + # Encrypt the batch + x_batch_enc, y_batch_enc, _, _ = fhe_client.quantize_encrypt_serialize( + x_batch, y_batch, None, None + ) + + x_batches_enc.append(x_batch_enc) + y_batches_enc.append(y_batch_enc) + + _, _, weights_enc, bias_enc = fhe_client.quantize_encrypt_serialize(None, None, weights, bias) + + 
return x_batches_enc, y_batches_enc, weights_enc, bias_enc + + +def server_run(fhe_server, x_batches_enc, y_batches_enc, weights_enc, bias_enc, evaluation_keys): + + weights_enc = fhe.Value.deserialize(weights_enc) + bias_enc = fhe.Value.deserialize(bias_enc) + + evaluation_keys = fhe.EvaluationKeys.deserialize(evaluation_keys) + + # Run the circuit on the server n times, n being the number of batches sent by the user + for x_batch, y_batch in zip(x_batches_enc, y_batches_enc): + x_batch = fhe.Value.deserialize(x_batch) + y_batch = fhe.Value.deserialize(y_batch) + + weights_enc, bias_enc = fhe_server.run( + (x_batch, y_batch, weights_enc, bias_enc), evaluation_keys + ) + + weights_enc = weights_enc.serialize() + bias_enc = bias_enc.serialize() + + return weights_enc, bias_enc + + +def train_fhe_client_server( + x, + y, + batch_size, + fhe_client, + fhe_server, + serialized_evaluation_keys, + weights, + bias, + n_epochs=1, +): + acc_history = [] + + for epoch in range(n_epochs): + # Shuffle x and y + perm = np.random.permutation(x.shape[0]) + x = x[perm, ::] + y = y[perm] + + # Quantize, encrypt and serialize the batched inputs as well as the weight and bias values + x_batches_enc, y_batches_enc, weights_enc, bias_enc = quantize_encrypt_serialize_batches( + fhe_client, x, y, weights, bias, batch_size + ) + + # Iterate the circuit over the batches on the server + fitted_weights_enc, fitted_bias_enc = server_run( + fhe_server, + x_batches_enc, + y_batches_enc, + weights_enc, + bias_enc, + serialized_evaluation_keys, + ) + + # Back on the client, deserialize, decrypt and de-quantize the fitted weight and bias values + weights, bias = fhe_client.deserialize_decrypt_dequantize( + fitted_weights_enc, fitted_bias_enc + ) + + # Compute, store and print the epoch's accuracy + accuracy_score = compute_model_accuracy(weights, bias, x, y) + acc_history.append(accuracy_score) + + print(f"Epoch {epoch + 1}/{n_epochs} completed. 
Accuracy: {acc_history[-1]}") + + return weights, bias, acc_history + + +weights, bias, acc_history = train_fhe_client_server( + X_binary, + y_binary, + batch_size, + fhe_client, + fhe_server, + serialized_evaluation_keys, + weights, + bias, +) + +# Plot the decision final model boundary +plot_decision_boundary( + X_binary, + y_binary, + weights=weights, + bias=bias, + title="Decision Boundary after training", + accuracy=acc_history[-1], +) + +# Let's rotate the dataset 90 degrees and see +# if the model can learn the new dataset + +# Define the 90-degree rotation matrix +rotation_matrix = np.array([[0, -1], [1, 0]]) + +# Apply the rotation matrix to X_binary +X_binary_pivoted = X_binary @ rotation_matrix + +# Plot before training +plot_decision_boundary( + X_binary_pivoted, + y_binary, + weights=weights, + bias=bias, + title="Pivoted Dataset", + accuracy=compute_model_accuracy(weights, bias, X_binary_pivoted, y_binary), +) + +# Train the model again with the pivoted dataset +weights_pivoted, bias_pivoted, acc_history_pivoted = train_fhe_client_server( + X_binary_pivoted, + y_binary, + batch_size, + fhe_client, + fhe_server, + serialized_evaluation_keys, + weights, + bias, + n_epochs=2, +) + +# Plot the decision boundary for the pivoted dataset +plot_decision_boundary( + X_binary_pivoted, + y_binary, + weights=weights_pivoted, + bias=bias_pivoted, + title="Decision Boundary after training on pivoted dataset", + accuracy=acc_history_pivoted[-1], +) + +# Clean the temporary directories and their content +deployment_dir.cleanup() + + + +# Code from: ./QuantizationAwareTraining.ipynb +-------------------------------------------------------------------------------- + +import time + +import matplotlib.pyplot as plt +import numpy +import torch +from sklearn.model_selection import train_test_split +from torch import nn +from torch.utils.data import DataLoader, TensorDataset +from tqdm.auto import tqdm + +from concrete.ml.quantization.quantized_module import QuantizedModule 
+from concrete.ml.torch.compile import compile_brevitas_qat_model + +IN_FEAT = 2 +OUT_FEAT = 2 +N_SIDE = 100 +N_EXAMPLE_TOTAL = N_SIDE * N_SIDE +N_TEST = 500 +CLUSTERS = 3 + +# Generate the grid points and put them in a 2 column list of X,Y coordinates +xx, yy = numpy.meshgrid(numpy.linspace(0, 1, N_SIDE), numpy.linspace(0, 1, N_SIDE)) +X = numpy.c_[numpy.ravel(xx), numpy.ravel(yy)] + +# Generate the labels, using the XOR function to produce the checkerboard +y = (numpy.rint(xx * CLUSTERS).astype(numpy.int64) % 2) ^ ( + (numpy.rint(yy * CLUSTERS).astype(numpy.int64) % 2) +) +y = y.ravel() + +# Add some noise to the data +X += numpy.random.randn(X.shape[0], X.shape[1]) * 0.01 + +# Plot the data +plt.scatter(X[:, 0], X[:, 1], c=y) +plt.title("Original dataset") +plt.show() + +# And, finally, split it into train/test sets +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=N_TEST / N_EXAMPLE_TOTAL, random_state=42 +) + +# pylint: disable-next=too-many-arguments +def train( + torch_model, + X_train, + X_test, + y_train, + y_test, + criterion, + optimizer, + epochs=10, + batch_size=1, + shuffle=True, + device="cpu", +): + X_train = torch.tensor(X_train).float() + X_test = torch.tensor(X_test).float() + y_train = torch.tensor(y_train) + + train_loader = DataLoader( + TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=shuffle + ) + torch_model.train() + for epoch in range(epochs): + total_loss = [] + y_pred_all = [] + y_true_all = [] + + for batch_index, (X_batch, y_batch) in enumerate(train_loader): + # Forward pass + X_batch = X_batch.to(device) + y_batch = y_batch.to(device) + y_pred = torch_model(X_batch) + y_pred_all.append(y_pred.argmax(1).detach().cpu().numpy()) + y_true_all.append(y_batch.detach().cpu().numpy()) + + # Compute loss + loss = criterion(y_pred, y_batch) + if torch.isnan(loss): + print("y_pred", y_pred) + print("y_batch", y_batch) + raise ValueError(f"Loss diverged at step: {batch_index}") + + # Backward pass + 
optimizer.zero_grad() + loss.backward() + + # Update weights + optimizer.step() + + total_loss.append(loss.cpu().item()) + + # Print epoch number, loss and accuracy + y_pred_all = numpy.concatenate(y_pred_all) + y_true_all = numpy.concatenate(y_true_all) + accuracy = numpy.mean(y_pred_all == y_true_all) + print( + f"Epoch: {epoch:02} | Loss: {numpy.mean(total_loss):.4f} |" + f" Train Accuracy: {100*accuracy:.2f}%" + ) + + # Compute test accuracy once training is done + torch_model.eval() + fp32_pred = torch_model(X_test.to(device)).cpu().argmax(1).float().detach().numpy() + accuracy = numpy.mean(fp32_pred == y_test) + print(f"\nTest Accuracy Fp32: {accuracy*100:.2f}%") + + return accuracy + +def test_in_fhe(quantized_numpy_module, X_test, y_test, simulate=True): + if not simulate: + print("Generating key") + start_key = time.time() + quantized_numpy_module.fhe_circuit.keygen() + end_key = time.time() + print(f"Key generation finished in {end_key - start_key:.2f} seconds") + + fhe_mode = "simulate" if simulate else "execute" + + start_infer = time.time() + predictions = quantized_numpy_module.forward(X_test, fhe=fhe_mode).argmax(1) + end_infer = time.time() + + if not simulate: + print( + f"Inferences finished in {end_infer - start_infer:.2f} seconds " + f"({(end_infer - start_infer)/len(X_test):.2f} seconds/sample)" + ) + + # Compute accuracy + accuracy = numpy.mean(predictions == y_test) * 100 + print( + "FHE " + ("(simulation) " * simulate) + f"accuracy: {accuracy:.2f}% on " + f"{len(X_test)} examples." 
+ ) + return predictions + +import brevitas.nn as qnn +from brevitas.core.bit_width import BitWidthImplType +from brevitas.core.quant import QuantType +from brevitas.core.restrict_val import FloatToIntImplType, RestrictValueType +from brevitas.core.scaling import ScalingImplType +from brevitas.core.zero_point import ZeroZeroPoint +from brevitas.inject import ExtendedInjector +from brevitas.quant.solver import ActQuantSolver, WeightQuantSolver +from dependencies import value +from torch.nn.utils import prune + + +# More details on injectors at +# https://github.com/Xilinx/brevitas/blob/master/ARCHITECTURE.md#injectors-and-quantizers +class CommonQuant(ExtendedInjector): + bit_width_impl_type = BitWidthImplType.CONST + scaling_impl_type = ScalingImplType.CONST + restrict_scaling_type = RestrictValueType.FP + zero_point_impl = ZeroZeroPoint + float_to_int_impl_type = FloatToIntImplType.ROUND + scaling_per_output_channel = False + narrow_range = True + signed = True + + @value + def quant_type(bit_width): # pylint: disable=no-self-argument + if bit_width is None: + return QuantType.FP + if bit_width == 1: + return QuantType.BINARY + return QuantType.INT + + +class CommonWeightQuant(CommonQuant, WeightQuantSolver): # pylint: disable=too-many-ancestors + scaling_const = 1.0 + signed = True + + +class CommonActQuant(CommonQuant, ActQuantSolver): # pylint: disable=too-many-ancestors + min_val = -1.0 + max_val = 1.0 + +class QATPrunedSimpleNet(nn.Module): + def __init__(self, n_hidden, qlinear_args, qidentity_args): + super().__init__() + + self.pruned_layers = set() + + self.quant_inp = qnn.QuantIdentity(**qidentity_args) + + self.fc1 = qnn.QuantLinear(IN_FEAT, n_hidden, **qlinear_args) + + self.relu1 = qnn.QuantReLU(bit_width=qidentity_args["bit_width"]) + + self.fc2 = qnn.QuantLinear(n_hidden, n_hidden, **qlinear_args) + + self.relu2 = qnn.QuantReLU(bit_width=qidentity_args["bit_width"]) + + self.fc3 = qnn.QuantLinear(n_hidden, OUT_FEAT, **qlinear_args) + + for m in 
self.modules(): + if isinstance(m, qnn.QuantLinear): + torch.nn.init.uniform_(m.weight.data, -1, 1) + + def forward(self, x): + x = self.quant_inp(x) + x = self.relu1(self.fc1(x)) + x = self.relu2(self.fc2(x)) + x = self.fc3(x) + return x + + def prune(self, max_non_zero): + # Linear layer weight has dimensions NumOutputs x NumInputs + for name, layer in self.named_modules(): + if isinstance(layer, qnn.QuantLinear): + num_zero_weights = (layer.weight.shape[1] - max_non_zero) * layer.weight.shape[0] + if num_zero_weights <= 0: + continue + print(f"Pruning layer {name} factor {num_zero_weights}") + prune.l1_unstructured(layer, "weight", amount=num_zero_weights) + self.pruned_layers.add(name) + + def unprune(self): + for name, layer in self.named_modules(): + if name in self.pruned_layers: + prune.remove(layer, "weight") + self.pruned_layers.remove(name) + +# Add MPS (for macOS with Apple Silicon or AMD GPUs) support when error is fixed. For now, we +# observe a decrease in torch's top1 accuracy when using MPS devices +# FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3953 +device = "cuda" if torch.cuda.is_available() else "cpu" + +# Define our loss function +criterion = nn.CrossEntropyLoss() + +# Define the batch size +batch_size = 1 +n_epochs = 7 +n_hidden = 100 + +# We use 100 neurons with only 20 that will be active. 
Having many neurons +# out of which we chose the best ones increases the robustness of training +# while keeping the accumulator size low +torch_model = QATPrunedSimpleNet( + n_hidden=n_hidden, + qlinear_args={ + "weight_bit_width": 3, + "weight_quant": CommonWeightQuant, + "bias": True, + "bias_quant": None, + "narrow_range": True, + }, + qidentity_args={"bit_width": 3, "act_quant": CommonActQuant}, +) +torch_model.prune(20) + +torch_model = torch_model.to(device) +optimizer = torch.optim.AdamW(torch_model.parameters(), lr=0.001) +accuracy = train( + torch_model, + X_train, + X_test, + y_train, + y_test, + criterion, + optimizer, + epochs=n_epochs, + batch_size=batch_size, + device=device, +) +torch_model.unprune() + +torch_model.eval() +# pylint: disable=not-callable +fp32_pred = ( + torch_model(torch.tensor(X_test).float().to(device)).cpu().argmax(1).float().detach().numpy() +) + +# pylint: enable=not-callable + +plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test.astype(numpy.float64)) +plt.title("Original test set") +plt.show() + +plt.scatter(X_test[:, 0], X_test[:, 1], c=fp32_pred) +plt.title("Torch: Predictions on test set") +plt.show() + +# We need to unprune the model before compiling +torch_model.unprune() + +# Move torch_model to CPU +torch_model = torch_model.cpu() + +# Compile the model using a representative input-set +quantized_numpy_module = compile_brevitas_qat_model(torch_model, X_train) + +prediction_simulated = test_in_fhe(quantized_numpy_module, X_test, y_test, simulate=True) + +# Reduce the test set for faster running time +FHE_SAMPLE = 10 + +prediction_fhe = test_in_fhe( + quantized_numpy_module, X_test[:FHE_SAMPLE], y_test[:FHE_SAMPLE], simulate=False +) + +class TorchSKLearnWrapper: + def __init__(self, torch_model): + self.torch_model = torch_model + self.fitted = True + + def fit(self): + return self + + @staticmethod + def __sklearn_is_fitted__(): + return True + + def predict(self, X): + self.torch_model.eval() + y_pred = 
self.torch_model(torch.tensor(X).float()).argmax(1).float().detach().numpy() + return y_pred + + def predict_proba(self, X): + self.torch_model.eval() + y_pred = self.torch_model(torch.tensor(X).float())[:, 1].float().detach().numpy() + return y_pred + +class ConcreteSKLearnWrapper: + def __init__(self, quantized_module: QuantizedModule): + self.quantized_module = quantized_module + self.fitted = True + + def fit(self): + return self + + @staticmethod + def __sklearn_is_fitted__(): + return True + + def predict(self, X, progress_bar=False): + predictions = numpy.zeros((X.shape[0],)) + for idx, x in enumerate(tqdm(X, disable=not progress_bar)): + predictions[idx] = self.quantized_module.forward( + numpy.expand_dims(x, 0), fhe="simulate" + ).argmax(axis=1) + return predictions + + def predict_proba(self, X, progress_bar=False): + predictions = numpy.zeros(shape=(X.shape[0], 2)) + for idx, x in enumerate(tqdm(X, disable=not progress_bar)): + predictions[idx] = self.quantized_module.forward( + numpy.expand_dims(x, 0), fhe="simulate" + )[0] + return predictions + +plt.scatter(X_test[:, 0], X_test[:, 1], c=prediction_simulated) +plt.title("Concrete ML predictions on test set") +plt.show() + +epsilon = 0.1 +base = 5 +max_value = 1 + epsilon +min_value = 0 - epsilon +grid_resolution = 100 +fig, axs = plt.subplots(figsize=(base * 3, base), ncols=3) +for ax in axs: + ax.set_xlim([min_value, max_value]) + ax.set_ylim([min_value, max_value]) + +xx0, xx1 = numpy.meshgrid( + numpy.linspace(min_value, max_value, grid_resolution), + numpy.linspace(min_value, max_value, grid_resolution), +) + +X_grid = numpy.c_[xx0.ravel(), xx1.ravel()] +y_pred_torch = TorchSKLearnWrapper(torch_model).predict(X_grid) +y_pred_concrete = ConcreteSKLearnWrapper(quantized_numpy_module).predict(X_grid) + +axs[1].contourf(xx0, xx1, y_pred_torch.reshape(xx0.shape)) +axs[2].contourf(xx0, xx1, y_pred_concrete.reshape(xx0.shape)) + +axs[0].scatter(X_test[:, 0], X_test[:, 1], c=prediction_simulated, 
marker="x") +axs[0].set_title("Ground truth") +axs[1].set_title("Float32 predictions") +axs[2].set_title("Concrete ML predictions") +plt.show() + + + +# Code from: ./PoissonRegression.ipynb +-------------------------------------------------------------------------------- + +import time + +import numpy as np +import sklearn +from sklearn.datasets import fetch_openml +from sklearn.linear_model import PoissonRegressor as SklearnPoissonRegressor +from sklearn.metrics import mean_poisson_deviance +from sklearn.model_selection import train_test_split + +from concrete.ml.sklearn import PoissonRegressor as ConcretePoissonRegressor + +%matplotlib inline + +import matplotlib.pyplot as plt +from IPython.display import display + +df, _ = fetch_openml( + data_id=41214, as_frame=True, cache=True, data_home="~/.cache/sklearn", return_X_y=True +) +df = df.head(50000) + +df["Frequency"] = df["ClaimNb"] / df["Exposure"] + +plt.ioff() +fig, ax = plt.subplots(1, 2, figsize=(15, 7)) +fig.patch.set_facecolor("white") +ax[0].set_title("Frequency of claims vs. 
Driver Age") +ax[0].set_xlabel("Driver Age") +ax[0].set_ylabel("Frequency of claims") +ax[0].scatter(df["DrivAge"], df["Frequency"], marker="o", color="#ffb700") +ax[1].set_title("Histogram of Frequency of claims") +ax[1].set_xlabel("Frequency of claims") +ax[1].set_ylabel("Count") +df["Frequency"].hist(bins=30, log=True, ax=ax[1], color="black") +display(fig) + +df_train, df_test = train_test_split(df, test_size=0.2, random_state=0) + +train_data = df_train["DrivAge"].values.reshape(-1, 1).astype(np.float64) +test_data = np.sort(df_test["DrivAge"].values).reshape(-1, 1).astype(np.float64) + +sklearn_pr = SklearnPoissonRegressor(max_iter=300) +sklearn_pr.fit(train_data, df_train["Frequency"], sample_weight=df_train["Exposure"]); + +sklearn_predictions = sklearn_pr.predict(test_data) + +plt.clf() +fig, ax = plt.subplots(1, figsize=(12, 8)) +fig.patch.set_facecolor("white") +ax.plot(test_data, sklearn_predictions, color="black", label="Float clear trend line") +ax.scatter(df_test["DrivAge"], df_test["Frequency"], marker="o", color="#ffb700") +ax.set_xlabel("Driver Age") +ax.set_ylim(0, 10) +ax.set_title("Regression with sklearn") +ax.set_ylabel("Frequency of claims") +ax.legend(loc="upper right") +display(fig) + +concrete_pr = ConcretePoissonRegressor(n_bits=8) +concrete_pr.fit(train_data, df_train["Frequency"], sample_weight=df_train["Exposure"]) + +concrete_predictions = concrete_pr.predict(test_data) + +y_true = df_test["Frequency"] +sample_weight = df_test["Exposure"] + +sklearn_score = mean_poisson_deviance(y_true, sklearn_predictions, sample_weight=sample_weight) +concrete_score = mean_poisson_deviance(y_true, concrete_predictions, sample_weight=sample_weight) + +print(f"mean Poisson deviance (scikit-learn): {sklearn_score:.4f}") +print(f"mean Poisson deviance (Concrete ML): {concrete_score:.4f}") + +plt.clf() +fig, ax = plt.subplots(1, figsize=(12, 8)) +fig.patch.set_facecolor("white") + +# Plot the scikit-learn in clear model's main trend line +ax.plot( + 
test_data, + sklearn_predictions, + color="black", + label=f"scikit-learn float, d={sklearn_score:.3f}", +) + +# Plot the Concrete quantized in clear model's main trend line +ax.plot( + test_data, + concrete_predictions, + color="red", + label=f"Concrete ML quantized, d={concrete_score:.3f}", +) + +# Plot the test data +ax.scatter(df_test["DrivAge"], df_test["Frequency"], marker="o", color="gray", label="Test data") + +# Parametrize the main figure +ax.set_xlabel("Driver Age") +ax.set_ylim(0, 10) +ax.set_title("Poisson Regression, float in clear and quantized in clear trend lines") +ax.set_ylabel("Frequency of claims") +ax.legend(loc="upper left") +ax.grid() + + +# Set a zoomed-in figure +axins = ax.inset_axes([0.5, 0.5, 0.47, 0.47]) + +# Plot the scikit-learn in clear model's zoomed trend line +axins.plot( + test_data, + sklearn_predictions, + color="black", +) + +# Plot the Concrete quantized in clear model's zoomed trend line +axins.plot( + test_data, + concrete_predictions, + color="red", +) + +# Parametrize the zoomed figure +x1, x2, y1, y2 = 60, 65, 0.3, 0.7 +axins.set_xlim(x1, x2) +axins.set_ylim(y1, y2) +axins.grid() +ax.indicate_inset_zoom(axins, edgecolor="black") + +display(fig) + +fhe_circuit = concrete_pr.compile(train_data) + +print(f"Generating a key for an {fhe_circuit.graph.maximum_integer_bit_width()}-bit circuit") + +time_begin = time.time() +fhe_circuit.client.keygen(force=False) +print(f"Key generation time: {time.time() - time_begin:.4f} seconds") + +time_begin = time.time() +concrete_predictions_fhe = concrete_pr.predict(test_data, fhe="execute") +print(f"Execution time: {(time.time() - time_begin) / len(test_data):.4f} seconds per sample") + +concrete_fhe_score = mean_poisson_deviance( + y_true, concrete_predictions_fhe, sample_weight=sample_weight +) + +print(f"mean Poisson deviance (Concrete FHE): {concrete_fhe_score:.4f}") + +plt.clf() +fig, ax = plt.subplots(1, figsize=(12, 8)) +fig.patch.set_facecolor("white") + +# Plot the scikit-learn 
in clear model's main trend line +ax.plot( + test_data, + sklearn_predictions, + color="black", + label=f"scikit-learn float, d={sklearn_score:.3f}", +) + +# Plot the Concrete quantized in clear model's main trend line +ax.plot( + test_data, + concrete_predictions, + color="red", + label=f"Concrete ML quantized, d={concrete_score:.3f}", +) + +# Plot the Concrete FHE model's main trend line +ax.plot( + test_data, + concrete_predictions_fhe, + color="blue", + label=f"Concrete ML FHE, d={concrete_fhe_score:.3f}", +) + +# Plot the test data +ax.scatter(df_test["DrivAge"], df_test["Frequency"], marker="o", color="gray", label="Test data") + +# Parametrize the main figure +ax.set_xlabel("Driver Age") +ax.set_ylim(0, 10) +ax.set_title("Poisson Regression, float in clear, quantized in clear and FHE trend lines") +ax.set_ylabel("Frequency of claims") +ax.legend(loc="upper left") +ax.grid() + +# Set a zoomed-in figure +axins = ax.inset_axes([0.5, 0.5, 0.47, 0.47]) + +# Plot the scikit-learn in clear model's zoomed trend line +axins.plot( + test_data, + sklearn_predictions, + color="black", +) + +# Plot the Concrete quantized in clear model's zoomed trend line +axins.plot( + test_data, + concrete_predictions, + color="red", +) + +# Plot the Concrete FHE model's zoomed trend line +axins.plot( + test_data, + concrete_predictions_fhe, + color="blue", +) + +# Parametrize the zoomed figure +x1, x2, y1, y2 = 60, 65, 0.3, 0.7 +axins.set_xlim(x1, x2) +axins.set_ylim(y1, y2) +axins.grid() +ax.indicate_inset_zoom(axins, edgecolor="black") + +display(fig) + +import warnings + +from sklearn.compose import ColumnTransformer +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import ( + FunctionTransformer, + KBinsDiscretizer, + OneHotEncoder, + StandardScaler, +) + +warnings.filterwarnings("ignore") + +sklearn_sparse_arg = ( + {"sparse": False} if "1.1." 
in sklearn.__version__ else {"sparse_output": False} +) + +log_scale_transformer = make_pipeline(FunctionTransformer(np.log, validate=False), StandardScaler()) + +linear_model_preprocessor = ColumnTransformer( + [ + ("passthrough_numeric", "passthrough", ["BonusMalus"]), + ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ("log_scaled_numeric", log_scale_transformer, ["Density"]), + ( + "onehot_categorical", + OneHotEncoder(**sklearn_sparse_arg), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ], + remainder="drop", +) + +sklearn_pr = Pipeline( + [ + ("preprocessor", linear_model_preprocessor), + ("regressor", SklearnPoissonRegressor()), + ] +) + +n_bits = 16 +concrete_pr = Pipeline( + [ + ("preprocessor", linear_model_preprocessor), + ("regressor", ConcretePoissonRegressor(n_bits=n_bits)), + ] +) + +sklearn_pr.fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]) + +concrete_pr.fit(df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"]); + +def score_estimator(estimator, df_test, fhe="disable"): + """Score an estimator on the test set.""" + + if fhe == "execute": + time_begin = time.time() + y_pred = estimator.predict(df_test, fhe="execute") + print( + f"FHE execution time: {(time.time() - time_begin) / len(df_test):.4f} " + "seconds per sample\n" + ) + + else: + y_pred = estimator.predict(df_test) + + y_pred = np.squeeze(y_pred) + y_true = df_test["Frequency"] + sample_weight = df_test["Exposure"] + + # Ignore non-positive predictions, as they are invalid for the Tweedie deviance (except if + # power is equal to 0, making the model equivalent to a Linear Regression). We want to + # issue a warning if for some reason (e.g., low quantization, user error), the regressor + # predictions are negative. 
+ + # Find all strictly positive values + mask = y_pred > 0 + + # If any non-positive values are found, issue a warning + if (~mask).any(): + n_masked, n_samples = (~mask).sum(), mask.shape[0] + print( + "WARNING: Estimator yields invalid, non-positive predictions " + f"for {n_masked} samples out of {n_samples}. These predictions " + "are ignored when computing the Poisson deviance." + ) + + return mean_poisson_deviance(y_true[mask], y_pred[mask], sample_weight=sample_weight[mask]) + +sklearn_score = score_estimator(sklearn_pr, df_test) +concrete_score = score_estimator(concrete_pr, df_test) + +print(f"scikit-learn (clear) deviance score: {sklearn_score:.4f}") +print(f"Concrete'ML (FHE) deviance score: {concrete_score:.4f}") + +# Measure the error of the FHE quantized model with respect to the clear scikit-learn +# float model +score_difference = abs(concrete_score - sklearn_score) * 100 / sklearn_score +print( + "Relative difference between scikit-learn (clear) and Concrete-ml (FHE) scores:", + f"{score_difference:.2f}%\n", +) + +n_bits_values = list(range(2, 20)) +concrete_deviance_scores = [] +for n_bits in n_bits_values: + concrete_regressor = Pipeline( + [ + ("preprocessor", linear_model_preprocessor), + ("regressor", ConcretePoissonRegressor(n_bits=n_bits)), + ] + ) + concrete_regressor.fit( + df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"] + ) + concrete_deviance_scores.append(score_estimator(concrete_regressor, df_test)) + +plt.clf() +fig, ax = plt.subplots(1, figsize=(12, 8)) +fig.patch.set_facecolor("white") +ax.hlines(y=sklearn_score, xmax=2, xmin=19, color="r", label="scikit-learn") +ax.plot(n_bits_values, concrete_deviance_scores, label="Concrete ML") +ax.set_xlabel("Number of bits") +ax.set_ylabel("Poisson deviance") +ax.set_xticks(n_bits_values) +ax.set_xticklabels([str(k) for k in n_bits_values]) +ax.grid() +ax.legend(loc="upper right") +display(fig) + +n_bits = 11 + +poisson_regressor_fhe = Pipeline( + [ + 
("preprocessor", linear_model_preprocessor), + ("regressor", ConcretePoissonRegressor(n_bits=n_bits)), + ] +) +poisson_regressor_fhe.fit( + df_train, df_train["Frequency"], regressor__sample_weight=df_train["Exposure"] +); + +# Compile needs some preprocessed data in order to run. +df_test_processed = poisson_regressor_fhe["preprocessor"].transform(df_test) + +# pylint: disable-next=no-member +fhe_circuit = poisson_regressor_fhe["regressor"].compile(df_test_processed) + +print(f"Generating a key for an {fhe_circuit.graph.maximum_integer_bit_width()}-bit circuit") + +time_begin = time.time() +fhe_circuit.client.keygen(force=False) +print(f"Key generation time: {time.time() - time_begin:.4f} seconds") + +# Reducing the test set from 10000 to 1000 for faster FHE execution +df_test = df_test[:1000] + +concrete_score_fhe = score_estimator(poisson_regressor_fhe, df_test, fhe="execute") + +print(f"scikit-learn (clear) deviance score: {score_estimator(sklearn_pr, df_test):.4f}") +print(f"Concrete ML (FHE) deviance score: {concrete_score_fhe:.4f}") + +# Measure the error of the FHE quantized model with respect to the clear scikit-learn +# float model +score_difference = abs(concrete_score - sklearn_score) * 100 / sklearn_score +print( + "Relative difference between scikit-learn (clear) and Concrete-ml (FHE) scores:", + f"{score_difference:.2f}%\n", +) + + + +# Code from: ./XGBClassifier.ipynb +-------------------------------------------------------------------------------- + +import warnings + +warnings.simplefilter(action="ignore", category=FutureWarning) + +import time + +import matplotlib.pyplot as plt +import numpy +from concrete.compiler import check_gpu_available +from matplotlib.colors import ListedColormap +from sklearn.datasets import fetch_openml, make_circles +from sklearn.metrics import accuracy_score, make_scorer, matthews_corrcoef +from sklearn.model_selection import GridSearchCV, train_test_split +from xgboost.sklearn import XGBClassifier as 
SklearnXGBClassifier + +from concrete.ml.sklearn import XGBClassifier as ConcreteXGBClassifier + +use_gpu_if_available = False +device = "cuda" if use_gpu_if_available and check_gpu_available() else "cpu" + +%matplotlib inline + +X, y = make_circles(n_samples=1000, noise=0.1, factor=0.6, random_state=0) + +# Define the figure size and color +plt.figure(figsize=(10, 6)) +cm_bright = ListedColormap(["#FF0000", "#FFFFFF", "#0000FF"]) + +plt.scatter(X[:, 0], X[:, 1], c=y, s=10, cmap=cm_bright) +plt.show() + +# Define the parameters used for initialization +n_estimators = 50 +max_depth = 4 +n_bits = 6 + +# Define the parameters used for training +fit_extra_param = {"eval_metric": "logloss"} + +sklearn_model = SklearnXGBClassifier(n_estimators=n_estimators, max_depth=max_depth) +sklearn_model.fit(X, y, **fit_extra_param); + +concrete_model = ConcreteXGBClassifier( + n_bits=n_bits, n_estimators=n_estimators, max_depth=max_depth +) +concrete_model.fit(X, y); + +def plot_contour(model, X, y, title=""): + """Plot the contour lines given a model and a data-set.""" + # Create a grid with lots of points to plot the contour of the decision function + x_min, x_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1 + y_min, y_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1 + grid_x, grid_y = numpy.meshgrid( + numpy.arange(x_min, x_max, 0.1), numpy.arange(y_min, y_max, 0.1) + ) + + # Predict the function value on the grid. For the Concrete ML model, this inference is done in + # the clear, which is expected to exactly match the FHE inference. 
+ grid_z = model.predict_proba(numpy.c_[grid_x.ravel(), grid_y.ravel()])[:, 1] + + grid_z = grid_z.reshape(grid_x.shape) + + # Define the plot size + plt.figure(figsize=(10, 6)) + + # Plot the contour and training examples + plt.contourf(grid_x, grid_y, grid_z, cmap=cm_bright, alpha=0.2) + plt.scatter(X[:, 0], X[:, 1], c=y, s=1, cmap=cm_bright) + plt.title(title) + plt.show() + +plot_contour(sklearn_model, X, y, title="Scikit-Learn XGBoost Classifier") + +plot_contour(concrete_model, X, y, title="Concrete ML XGBoost Classifier") + +# Load the data-set +X, y = fetch_openml(name="diabetes", as_frame=False, cache=True, return_X_y=True) + +# Replace (binary) target values by integers +y[y == "tested_positive"] = 1 +y[y == "tested_negative"] = 0 +y = y.astype(numpy.int64) + +# Create scorer with the MCC metric +grid_scorer = make_scorer(matthews_corrcoef, greater_is_better=True) + +# Define the number of estimators to consider for the following gridsearch +n_estimators = [1, 5, 10, 20] + [20 * i for i in range(2, 11)] + [50 * i for i in range(5, 11)] + +param_grid = { + "max_depth": [2], + "n_estimators": n_estimators, +} + +sklearn_grid_search = GridSearchCV( + SklearnXGBClassifier(), + param_grid, + cv=5, + scoring=grid_scorer, + error_score="raise", + verbose=1, +) + +sklearn_grid_search.fit(X, y, **fit_extra_param); + +param_grid = { + "n_bits": [6], + "max_depth": [2], + "n_estimators": n_estimators, +} + +concrete_grid_search = GridSearchCV( + ConcreteXGBClassifier(), + param_grid, + cv=5, + scoring=grid_scorer, + error_score="raise", + verbose=1, +) + +concrete_grid_search.fit(X, y); + +# Print the best MCC score for both models +print(f"Best MCC score for Scikit-Learn: {sklearn_grid_search.best_score_:.2f}") +print(f"Best MCC score Concrete ML: {concrete_grid_search.best_score_:.2f}") + +# Define the figure size +plt.figure(figsize=(10, 6)) + +# Plot the mean_test_score of both model along the n_estimators hyper parameter +plt.plot( + 
concrete_grid_search.cv_results_["param_n_estimators"], + concrete_grid_search.cv_results_["mean_test_score"], + label="Concrete ML", +) +plt.plot( + sklearn_grid_search.cv_results_["param_n_estimators"], + sklearn_grid_search.cv_results_["mean_test_score"], + label="Scikit-Learn", +) +plt.xlabel("n_estimators") +plt.ylabel("MCC") +plt.legend() +plt.show() + +best_params_sklearn = sklearn_grid_search.best_params_ +print(f"Best parameters found for the Scikit-Learn model: {best_params_sklearn}") + +best_params_concrete = concrete_grid_search.best_params_ +print(f"Best parameters found for the Concrete ML model: {best_params_concrete}") + +# Define the Concrete ML and Scikit-Learn models +concrete_model = ConcreteXGBClassifier(**best_params_concrete) +sklearn_model = SklearnXGBClassifier(**best_params_sklearn) + +# Split the data into a train and test set +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + +# Fit both models +concrete_model.fit(X_train, y_train, **fit_extra_param) +sklearn_model.fit(X_train, y_train, **fit_extra_param); + +# Compile the Concrete ML model using the training data +circuit = concrete_model.compile(X_train, device=device) + +print(f"Generating a key for an {circuit.graph.maximum_integer_bit_width()}-bits circuit") + +# Generate the key +time_begin = time.time() +circuit.client.keygen(force=False) +print(f"Key generation time: {time.time() - time_begin:.2f} seconds") + +# Compute the predictions using the Scikit-Learn model +y_pred_sklearn = sklearn_model.predict(X_test) + +# Compute the predictions using the Concrete ML model with FHE simulation +y_pred_simulated = concrete_model.predict(X_test, fhe="simulate") + +print("Accuracy scores:") +print( + f"- Scikit-Learn (clear floating points): {accuracy_score(y_test, y_pred_sklearn)*100:.2f}%\n" + f"- Concrete ML (clear quantized): {accuracy_score(y_test, y_pred_simulated)*100:.2f}\n" +) + +N_SAMPLE_FHE = 10 + +# Pick N_SAMPLE_FHE random samples from 
the test set +idx_test = numpy.random.choice(X_test.shape[0], N_SAMPLE_FHE, replace=False) +X_test_fhe = X_test[idx_test] +y_test_fhe = y_test[idx_test] + +# Compute the predictions using the Concrete ML (quantized) model in the clear +y_preds_clear = concrete_model.predict(X_test_fhe) + +# Compute the predictions using the Concrete ML model in FHE +time_begin = time.time() +y_preds_fhe = concrete_model.predict(X_test_fhe, fhe="execute") +print(f"FHE execution time: {(time.time() - time_begin) / len(X_test_fhe):.2f} seconds per sample") + +# Compare the clear quantized inference vs FHE inference +print( + f"{(y_preds_fhe == y_preds_clear).sum()}/{N_SAMPLE_FHE} " + "FHE predictions match the clear quantized predictions" +) + + + +# Code from: ./GLMComparison.ipynb +-------------------------------------------------------------------------------- + +# Source : https://scikit-learn.org/stable/auto_examples/linear_model/plot_tweedie_regression_insurance_claims.html # noqa # pylint: disable=line-too-long + +# Authors: Christian Lorentzen +# Roman Yurchak +# Olivier Grisel +# Modified to integrate Concrete ML functions by Zama +# License: BSD 3 clause + +import sys +import time +from collections import defaultdict +from timeit import default_timer as timer + +import numpy as np +import sklearn +from sklearn.compose import ColumnTransformer +from sklearn.datasets import fetch_openml +from sklearn.linear_model import GammaRegressor as SklearnGammaRegressor +from sklearn.linear_model import PoissonRegressor as SklearnPoissonRegressor +from sklearn.linear_model import TweedieRegressor as SklearnTweedieRegressor +from sklearn.metrics import mean_gamma_deviance, mean_poisson_deviance, mean_tweedie_deviance +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import ( + FunctionTransformer, + KBinsDiscretizer, + OneHotEncoder, + StandardScaler, +) + +from concrete.ml.sklearn import GammaRegressor as 
ConcreteGammaRegressor +from concrete.ml.sklearn import PoissonRegressor as ConcretePoissonRegressor +from concrete.ml.sklearn import TweedieRegressor as ConcreteTweedieRegressor + +%matplotlib inline + +import matplotlib.pyplot as plt +from IPython.display import display + +# Getting the original data-set containing the risk features +# Link: https://www.openml.org/d/41214 +risks_data, _ = fetch_openml( + data_id=41214, as_frame=True, cache=True, data_home="~/.cache/sklearn", return_X_y=True +) + +# Getting the data set containing claims amount +# Link: https://www.openml.org/d/41215 +claims_data, _ = fetch_openml( + data_id=41215, as_frame=True, cache=True, data_home="~/.cache/sklearn", return_X_y=True +) + +# Set IDpol as index +risks_data["IDpol"] = risks_data["IDpol"].astype(int) +risks_data.set_index("IDpol", inplace=True) + +# Grouping claims mounts together if they are associated with the same policy +claims_data = claims_data.groupby("IDpol").sum() + +# Merging the two sets over policy IDs +data = risks_data.join(claims_data, how="left") + +# Only keeping the first 100 000 for faster running time +data = data.head(100000) + +# Filtering out unknown claim amounts +data["ClaimAmount"].fillna(0, inplace=True) + +# Filtering out claims with zero amount, as the severity (gamma) model +# requires strictly positive target values +data.loc[(data["ClaimAmount"] == 0) & (data["ClaimNb"] >= 1), "ClaimNb"] = 0 + +# Removing unreasonable outliers +data["ClaimNb"] = data["ClaimNb"].clip(upper=4) +data["Exposure"] = data["Exposure"].clip(upper=1) +data["ClaimAmount"] = data["ClaimAmount"].clip(upper=200000) + +sklearn_sparse_arg = ( + {"sparse": False} if "1.1." 
in sklearn.__version__ else {"sparse_output": False} +) +log_scale_transformer = make_pipeline(FunctionTransformer(np.log, validate=False), StandardScaler()) + +linear_model_preprocessor = ColumnTransformer( + [ + ("passthrough_numeric", "passthrough", ["BonusMalus"]), + ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ("log_scaled_numeric", log_scale_transformer, ["Density"]), + ( + "onehot_categorical", + OneHotEncoder(**sklearn_sparse_arg), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ], + remainder="drop", +) + +x = linear_model_preprocessor.fit_transform(data) + +# Creating target values for Poisson +data["Frequency"] = data["ClaimNb"] / data["Exposure"] + +# Creating target values for Gamma +data["AvgClaimAmount"] = data["ClaimAmount"] / np.fmax(data["ClaimNb"], 1) + +# Creating target values for Tweedie +# Insurances companies are interested in modeling the Pure Premium, that is the expected total +# claim amount per unit of exposure for each policyholder in their portfolio +data["PurePremium"] = data["ClaimAmount"] / data["Exposure"] + +plt.ioff() +fig, ax = plt.subplots(1, 3, figsize=(15, 7)) + +# Set the figure's main parameters +fig.patch.set_facecolor("white") +fig.suptitle("Different target values distribution") +fig.supylabel("Count") + +# Frequency of claims distribution +ax[0].set_title("Poisson") +ax[0].set_xlabel("Frequency of claims") +data["Frequency"].hist(bins=30, log=True, ax=ax[0], color="black") + +# Average amount of claims distribution +ax[1].set_title("Gamma") +ax[1].set_xlabel("Average amount of claims") +data["AvgClaimAmount"].hist(bins=30, log=True, ax=ax[1], color="blue") + +# PurePrenium distribution +ax[2].set_title("Tweedie") +ax[2].set_xlabel("PurePrenium") +data["PurePremium"].hist(bins=30, log=True, ax=ax[2], color="red") + +display(fig) + +train_data, test_data, x_train_data, x_test_data = train_test_split( + data, + x, + test_size=0.2, + random_state=0, +) +_, test_data, _, 
x_test_data = train_test_split( + test_data, + x_test_data, + test_size=50, + random_state=0, +) + +gamma_mask_train = train_data["ClaimAmount"] > 0 +gamma_mask_test = test_data["ClaimAmount"] > 0 + + +parameters_glms = { + "Poisson": { + "sklearn": SklearnPoissonRegressor, + "concrete": ConcretePoissonRegressor, + "init_parameters": { + "alpha": 1e-3, + "max_iter": 400, + }, + "fit_parameters": { + "X": x_train_data, + "y": train_data["Frequency"], + "sample_weight": train_data["Exposure"], + }, + "x_test": x_test_data, + "score_parameters": { + "y_true": test_data["Frequency"], + "sample_weight": test_data["Exposure"], + }, + "deviance": mean_poisson_deviance, + }, + "Gamma": { + "sklearn": SklearnGammaRegressor, + "concrete": ConcreteGammaRegressor, + "init_parameters": { + "alpha": 10.0, + "max_iter": 300, + }, + "fit_parameters": { + "X": x_train_data[gamma_mask_train], + "y": train_data[gamma_mask_train]["AvgClaimAmount"], + "sample_weight": train_data[gamma_mask_train]["ClaimNb"], + }, + "x_test": x_test_data[gamma_mask_test], + "score_parameters": { + "y_true": test_data[gamma_mask_test]["AvgClaimAmount"], + "sample_weight": test_data[gamma_mask_test]["ClaimNb"], + }, + "deviance": mean_gamma_deviance, + }, + "Tweedie": { + "sklearn": SklearnTweedieRegressor, + "concrete": ConcreteTweedieRegressor, + "init_parameters": { + "power": 1.9, + "alpha": 0.1, + "max_iter": 10000, + }, + "fit_parameters": { + "X": x_train_data, + "y": train_data["PurePremium"], + "sample_weight": train_data["Exposure"], + }, + "x_test": x_test_data, + "score_parameters": { + "y_true": test_data["PurePremium"], + "sample_weight": test_data["Exposure"], + "power": 1.9, + }, + "deviance": mean_tweedie_deviance, + }, +} + +def compare_regressors(n_bits, fhe="simulate"): + # pylint: disable=too-many-locals + scores = defaultdict(list) + predictions = defaultdict(list) + + for glm, parameters_glm in parameters_glms.items(): + # Retrieve the regressors + sklearn_class = 
parameters_glm["sklearn"] + concrete_class = parameters_glm["concrete"] + + # Instantiate the models + init_parameters = parameters_glm["init_parameters"] + sklearn_glm = sklearn_class(**init_parameters) + concrete_glm = concrete_class(n_bits=n_bits, **init_parameters) + + # Fit the models + fit_parameters = parameters_glm["fit_parameters"] + sklearn_glm.fit(**fit_parameters) + concrete_glm.fit(**fit_parameters) + + x_train_subset = fit_parameters["X"][:100] + # Compile the Concrete ML model if it needs to be executed in FHE + if fhe in ["execute", "simulate"]: + circuit = concrete_glm.compile(x_train_subset) + + # Generate the key + print( + "Generating a key for an " + f"{circuit.graph.maximum_integer_bit_width()}-bit circuit" + ) + sys.stdout.flush() + + time_begin = time.time() + circuit.client.keygen(force=False) + print(f"Key generation time: {time.time() - time_begin:.4f} seconds") + + # Compute the predictions using sklearn (floating points, in the clear) + x_test = parameters_glm["x_test"] + sklearn_predictions = sklearn_glm.predict(x_test) + + # Compute the predictions using Concrete ML (quantized, in the clear) + concrete_q_predictions = concrete_glm.predict(x_test) + + # Compute the predictions using Concrete ML (in FHE) + start = timer() + concrete_predictions = concrete_glm.predict( + x_test, + fhe=fhe, + ) + end = timer() + run_time = end - start + + # Compute the deviance scores + mean_deviance = parameters_glm["deviance"] + score_parameters = parameters_glm["score_parameters"] + sklearn_score = mean_deviance(y_pred=sklearn_predictions, **score_parameters) + concrete_q_score = mean_deviance(y_pred=concrete_q_predictions, **score_parameters) + concrete_score = mean_deviance(y_pred=concrete_predictions, **score_parameters) + + # Print the deviance scores + fhe_message = "in FHE" if fhe == "execute" else "in clear" + print(f"Mean {glm} deviance (scikit-learn): {sklearn_score:.4f}") + print(f"Mean {glm} deviance (Concrete ML, quantized): 
{concrete_q_score:.4f}") + print( + f"Mean {glm} deviance (Concrete ML {fhe_message}, " + f"with {run_time / len(x_test):.4f} seconds " + f"per inference): {concrete_score:.4f}" + ) + + # Measure the error of the FHE quantized model with respect to the clear scikit-learn + # float model + score_difference = abs(concrete_score - sklearn_score) * 100 / sklearn_score + print( + "Relative difference between scikit-learn (clear) and Concrete-ml (FHE) scores:", + f"{score_difference:.2f}%\n", + ) + + # Store the results + scores["sklearn"].append(sklearn_score) + scores["concrete"].append(concrete_score) + predictions["sklearn"].append(sklearn_predictions) + predictions["concrete"].append(concrete_predictions) + + return scores, predictions + +n_bits = 11 +fhe = "execute" + +scores, predictions = compare_regressors(n_bits, fhe=fhe) + + + +# Code from: ./ClassifierComparison.ipynb +-------------------------------------------------------------------------------- + +# Source: +# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html + +# Code source: Gaël Varoquaux +# Andreas Müller +# Modified for documentation by Jaques Grobler +# Modified to integrate Concrete ML functions by Zama +# License: BSD 3 clause + +import warnings + +warnings.simplefilter(action="ignore", category=FutureWarning) + +from functools import partial + +import torch + +from concrete.ml.sklearn import ( + DecisionTreeClassifier, + LinearSVC, + LogisticRegression, + NeuralNetClassifier, + RandomForestClassifier, + XGBClassifier, +) + +# The simulation mode allows to measure the impact of FHE execution on accuracy +# without paying the cost of FHE computations. +# However, data is not encrypted when using the simulation: the model performs inference +# on clear data. 
+%run utils/classifier_comparison_utils.py + +params_neural_net = { + "module__n_w_bits": 2, + "module__n_a_bits": 4, + "module__n_accum_bits": 32, + "module__n_hidden_neurons_multiplier": 6, + "module__n_layers": 2, # 1 hidden layer + "module__activation_function": torch.nn.ReLU, + "max_epochs": 400, + "verbose": 0, + "lr": 0.001, +} + +neural_network_classifiers = [ + ( + partial(NeuralNetClassifier, batch_size=32, **params_neural_net), + "Neural Net", + ), +] + +# pylint: disable-next=undefined-variable +make_classifier_comparison("NN Classifiers", neural_network_classifiers, 0.5, simulate=True) # noqa + +linear_classifiers = [ + (partial(LinearSVC, C=0.025), "Linear SVC"), + (LogisticRegression, "Logistic Regression"), +] + +# pylint: disable-next=undefined-variable +make_classifier_comparison("Linear Classifiers", linear_classifiers, 0, simulate=True, h=1) # noqa + +tree_classifiers = [ + (partial(DecisionTreeClassifier, max_depth=5), "Decision Tree"), + (partial(RandomForestClassifier, max_depth=4, n_estimators=5), "Random Forest"), + (partial(XGBClassifier, n_jobs=1, max_depth=4, n_estimators=5), "XGB"), +] + +# pylint: disable-next=undefined-variable +make_classifier_comparison( # noqa + "Tree-Based Classifiers", tree_classifiers, 0.5, simulate=True, h=0.1 +) + + + +# Code from: ./LogisticRegression.ipynb +-------------------------------------------------------------------------------- + +import time + +import numpy as np +from sklearn.datasets import make_classification +from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import MinMaxScaler, StandardScaler + +from concrete.ml.sklearn import LogisticRegression as ConcreteLogisticRegression + +%matplotlib inline + +import matplotlib.pyplot as plt +from IPython.display import display + +X, y = make_classification( + n_samples=200, + n_features=2, + 
n_redundant=0, + n_informative=2, + random_state=2, + n_clusters_per_class=1, +) + +rng = np.random.RandomState(2) +X += 2 * rng.uniform(size=X.shape) + +b_min = np.min(X, axis=0) +b_max = np.max(X, axis=0) + +x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) + +x_test_grid, y_test_grid = np.meshgrid( + np.linspace(b_min[0], b_max[0], 30), np.linspace(b_min[1], b_max[1], 30) +) +x_grid_test = np.vstack([x_test_grid.ravel(), y_test_grid.ravel()]).transpose() + +sklearn_logr = SklearnLogisticRegression() +sklearn_logr.fit(x_train, y_train) +y_pred_test = sklearn_logr.predict(x_test) + +# Compute the scikit-learn classifier's probabilities on the domain +y_score_grid = sklearn_logr.predict_proba(x_grid_test)[:, 1] + +plt.ioff() +plt.clf() +fig, ax = plt.subplots(1, figsize=(12, 8)) +fig.patch.set_facecolor("white") +ax.contourf(x_test_grid, y_test_grid, y_score_grid.reshape(x_test_grid.shape), cmap="coolwarm") +CS1 = ax.contour( + x_test_grid, + y_test_grid, + y_score_grid.reshape(x_test_grid.shape), + levels=[0.5], + linewidths=2, +) +CS1.collections[0].set_label("Sklearn decision boundary") +ax.scatter(x_train[:, 0], x_train[:, 1], c=y_train, marker="D", cmap="jet", label="Train data") +ax.scatter(x_test[:, 0], x_test[:, 1], c=y_test, marker="x", cmap="jet", label="Test data") +ax.legend(loc="upper right") +display(fig) + +concrete_logr = ConcreteLogisticRegression(n_bits=8) +concrete_logr.fit(x_train, y_train); + +# Predict on the test set +y_proba_q = concrete_logr.predict_proba(x_test)[:, 1] +y_pred_q = concrete_logr.predict(x_test) + +# Compute the probabilities on the whole domain in order to be able to plot the contours +y_proba_q_grid = concrete_logr.predict_proba(x_grid_test)[:, 1] +y_pred_q_grid = concrete_logr.predict(x_grid_test) + +fhe_circuit = concrete_logr.compile(x_train) + +print(f"Generating a key for an {fhe_circuit.graph.maximum_integer_bit_width()}-bit circuit") + +time_begin = time.time() 
+fhe_circuit.client.keygen(force=False) +print(f"Key generation time: {time.time() - time_begin:.4f} seconds") + +time_begin = time.time() +y_pred_fhe = concrete_logr.predict(x_test, fhe="execute") +print(f"Execution time: {(time.time() - time_begin) / len(x_test):.4f} seconds per sample") + +sklearn_accuracy = accuracy_score(y_test, y_pred_test) +quantized_accuracy = accuracy_score(y_test, y_pred_q) +fhe_accuracy = accuracy_score(y_test, y_pred_fhe) + +print(f"Sklearn accuracy: {sklearn_accuracy:.4f}") +print(f"Quantized Clear Accuracy: {quantized_accuracy:.4f}") +print(f"FHE Accuracy: {fhe_accuracy:.4f}") + +# Measure the error of the FHE quantized model with respect to the clear quantized model +concrete_score_difference = abs(fhe_accuracy - quantized_accuracy) +print( + "\nRelative difference between Concrete-ml (quantized clear) and Concrete-ml (FHE) scores:", + f"{concrete_score_difference:.2f}%", +) + +# Measure the error of the FHE quantized model with respect to the clear scikit-learn float model +score_difference = abs(fhe_accuracy - sklearn_accuracy) +print( + "Relative difference between scikit-learn (clear) and Concrete-ml (FHE) scores:", + f"{score_difference:.2f}%", +) + +plt.clf() +fig, ax = plt.subplots(1, figsize=(12, 8)) +fig.patch.set_facecolor("white") +ax.contourf(x_test_grid, y_test_grid, y_proba_q_grid.reshape(x_test_grid.shape), cmap="coolwarm") +CS1 = ax.contour( + x_test_grid, + y_test_grid, + y_proba_q_grid.reshape(x_test_grid.shape), + levels=[0.5], + linewidths=2, +) +ax.scatter(x_train[:, 0], x_train[:, 1], c=y_train, cmap="jet", marker="D") +ax.scatter(x_test[:, 0], x_test[:, 1], c=y_pred_q, cmap="jet", marker="x") +CS2 = ax.contour( + x_test_grid, + y_test_grid, + y_score_grid.reshape(x_test_grid.shape), + levels=[0.5], + linewidths=2, + linestyles="dashed", + cmap="hot", +) +ax.clabel(CS1, CS1.levels, inline=True, fontsize=10) +ax.clabel(CS2, CS2.levels, inline=True, fontsize=10) +CS1.collections[0].set_label(f"FHE decision 
boundary, acc={fhe_accuracy:.2f}") +CS2.collections[0].set_label(f"Sklearn decision boundary, acc={sklearn_accuracy:.2f}") +ax.legend(loc="upper right") +display(fig) + +from utils.scaling_comparison_utils import plot_data + +scaler = MinMaxScaler((-1, 1)) +x_train_scaled = scaler.fit_transform(x_train) +x_test_scaled = scaler.transform(x_test) + +scaler = StandardScaler() +x_train_normalized = scaler.fit_transform(x_train) +x_test_normalized = scaler.transform(x_test) + +x_train_unscaled = x_train_scaled.copy() +x_train_unscaled[:, 0] *= 100 + +x_test_unscaled = x_test_scaled.copy() +x_test_unscaled[:, 0] *= 100 + +x_train_shifted = x_train_scaled.copy() +x_train_shifted[:, 0] += 100 + +x_test_shifted = x_test_scaled.copy() +x_test_shifted[:, 0] += 100 + +n_bits = 12 +random_state = 0 + +fig, axes = plt.subplots(ncols=2, nrows=5, figsize=(8 * 3, 8 * 4)) +models = [ConcreteLogisticRegression(n_bits=n_bits, random_state=random_state) for _ in range(5)] +features_trains = [x_train, x_train_scaled, x_train_normalized, x_train_unscaled, x_train_shifted] +targets_trains = [y_train, y_train, y_train, y_train, y_train] +features_tests = [x_test, x_test_scaled, x_test_normalized, x_test_unscaled, x_test_shifted] +targets_tests = [y_test, y_test, y_test, y_test, y_test] +names = ["unchanged", "min-max-transformed", "normalized", "unscaled", "shifted"] + +for ax, model, features_train, targets_train, features_test, targets_test, name in zip( + axes, + models, + features_trains, + targets_trains, + features_tests, + targets_tests, + names, +): + plot_data(ax, features_train, targets_train, features_test, targets_test, model, name, h=1) +display(fig) + + + +# Code from: ./LoraMLP.ipynb +-------------------------------------------------------------------------------- + +import shutil +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import torch +from peft import LoraConfig, get_peft_model +from sklearn.datasets import make_circles, make_moons 
def plot_datasets_and_boundaries(X_task1, y_task1, X_task2, y_task2, model=None, titles=None):
    """Draw both task datasets side by side, optionally over a model's decision surface.

    Args:
        X_task1: Features of the first task; indexed as ``X[:, 0]`` and ``X[:, 1]``.
        y_task1: Labels of the first task, used to colour the scatter points.
        X_task2: Features of the second task, indexed like ``X_task1``.
        y_task2: Labels of the second task.
        model: Optional callable. When given, it is evaluated on a mesh spanning
            each dataset (padded by 0.5, step 0.1) and the softmax probability
            of class index 1 is drawn as a translucent filled contour.
        titles: Optional titles for the two panels; defaults to
            ``["Task 1 Dataset", "Task 2 Dataset"]``.
    """
    if titles is None:
        titles = ["Task 1 Dataset", "Task 2 Dataset"]

    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
    panels = zip([ax1, ax2], [X_task1, X_task2], [y_task1, y_task2], titles)

    for axis, features, labels, panel_title in panels:
        axis.scatter(features[:, 0], features[:, 1], c=labels, cmap="viridis", edgecolor="k")
        axis.set_title(panel_title)

        # Without a model there is no decision surface to overlay.
        if model is None:
            continue

        step = 0.1  # step size in the mesh
        first_feature = features[:, 0]
        second_feature = features[:, 1]
        horizontal = np.arange(first_feature.min() - 0.5, first_feature.max() + 0.5, step)
        vertical = np.arange(second_feature.min() - 0.5, second_feature.max() + 0.5, step)
        xx, yy = np.meshgrid(horizontal, vertical)
        mesh_points = torch.FloatTensor(np.c_[xx.ravel(), yy.ravel()])

        # Inference only: no gradients are needed to draw the surface.
        with torch.no_grad():
            logits = model(mesh_points)
            class_one = torch.softmax(logits, dim=1)[:, 1]
            surface = class_one.numpy().reshape(xx.shape)

        axis.contourf(xx, yy, surface, cmap="viridis", alpha=0.3)

    plt.tight_layout()
    plt.show()
class SimpleMLP(nn.Module):
    """Two-layer perceptron (Linear -> ReLU -> Linear) without any LoRA layers."""

    def __init__(self, input_size=2, hidden_size=128, num_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the output of ``fc2`` applied to the ReLU-activated ``fc1`` output."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)


# Instantiate the model
model = SimpleMLP()

# Training loop for Task 1


def train_model(model, train_loader, num_epochs=100):
    """Train ``model`` in place on the CPU with Adam (lr=0.01) and cross-entropy loss.

    The mean batch loss is printed on every 20th epoch.

    Args:
        model (nn.Module): The model to train; moved to the CPU and set to train mode.
        train_loader (DataLoader): Yields ``(inputs, targets)`` batches.
        num_epochs (int): Number of passes over ``train_loader``.
    """
    cpu = torch.device("cpu")
    model.to(cpu)
    model.train()

    loss_function = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=0.01)

    def _optimise_batch(inputs, targets):
        """Run one optimisation step and return the batch loss as a Python float."""
        adam.zero_grad()
        batch_loss = loss_function(model(inputs), targets)
        batch_loss.backward()
        adam.step()
        return batch_loss.item()

    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, targets in train_loader:
            total_loss += _optimise_batch(inputs.to(cpu), targets.to(cpu))

        # Print loss every 20 epochs
        if (epoch + 1) % 20 != 0:
            continue
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}")
optim.Adam(filter(lambda p: p.requires_grad, peft_model.parameters()), lr=0.01) +loss_fn = nn.CrossEntropyLoss() +training_args = {"gradient_accumulation_steps": 1} + +# Set up LoRA training +lora_trainer = LoraTrainer( + peft_model, optimizer=optimizer, loss_fn=loss_fn, training_args=training_args +) + +# Prepare input data for calibration +batch_size_per_task = batch_size // 2 +inputset = ( + torch.cat([X_task1[:batch_size_per_task], X_task2[:batch_size_per_task]]), + torch.cat([y_task1[:batch_size_per_task], y_task2[:batch_size_per_task]]), +) + +# Compile the model +lora_trainer.compile(inputset, n_bits=8) + +# Fine-tune the model on Task 2 using LoRA +lora_trainer.train(train_loader_task2, num_epochs=10, fhe="execute") + +# Enable LoRA adapters (already enabled by default) +peft_model.enable_adapter_layers() + +# Plot datasets with decision boundaries after fine-tuning +plot_datasets_and_boundaries( + X_task1.numpy(), + y_task1.numpy(), + X_task2.numpy(), + y_task2.numpy(), + model=peft_model, + titles=["Task 1 after Fine-tuning", "Task 2 after Fine-tuning"], +) + +# Disable LoRA adapters +peft_model.disable_adapter_layers() + +# Plot datasets with decision boundaries after fine-tuning +plot_datasets_and_boundaries( + X_task1.numpy(), + y_task1.numpy(), + X_task2.numpy(), + y_task2.numpy(), + model=peft_model, + titles=["Task 1 after Fine-tuning", "Task 2 after Fine-tuning"], +) + +# Enable LoRA adapters (already enabled by default) +peft_model.enable_adapter_layers() + +# Print trainable (lora) parameters +peft_model.print_trainable_parameters() + +# Save the model and remove all layers that will be done on the server +path = Path("lora_mlp") + +if path.is_dir() and any(path.iterdir()): + shutil.rmtree(path) + +lora_trainer.save_and_clear_private_info(path) + +# At this point, the hybrid_model only contains the trainable parameters of the LoRA layers. 
def make_classifier_comparison_from_sklearn(*args, **kwargs):
    """Return the received arguments unchanged as an ``(args, kwargs)`` pair.

    ``args`` is the tuple of positional arguments and ``kwargs`` the dict of
    keyword arguments; nothing else is computed.
    """
    received = (args, kwargs)
    return received
# Utility functions


def print_as_dollars(x):
    """Return ``x`` scaled by 100,000 as a string with two decimals and a ``$`` suffix.

    Despite the name, nothing is printed: the formatted string is returned.
    """
    amount = x * 10**5
    return f"{amount:.2f}$"


def print_compare_to_baseline(x, baseline_error):
    """Return the relative change of ``x`` versus ``baseline_error`` as a percentage string.

    The change is ``(x - baseline_error) / baseline_error * 100`` with two
    decimals, followed by ``"% of baseline"``. Nothing is printed.
    """
    relative_change = (x - baseline_error) / baseline_error * 100
    return f"{relative_change:.2f}% of baseline"
"min_samples_leaf": [2, 5], + "min_samples_split": [2, 10], +} + +grid_search = GridSearchCV( + ConcreteDecisionTreeRegressor(), + param_grid, + cv=3, + scoring="neg_mean_absolute_error", + error_score="raise", + n_jobs=1, +) + +gs_results = grid_search.fit(x_train, y_train) +print("Best hyper parameters:", gs_results.best_params_) +print(f"Min lost: {print_as_dollars(-gs_results.best_score_)}") + +# We fix all parameters as the best ones, except for n_bits. +best = gs_results.best_params_ +cv_errors = [ + {"n_bits": params["n_bits"], "score": score} + for params, score in zip( + gs_results.cv_results_["params"], gs_results.cv_results_["mean_test_score"] + ) + if (params["max_depth"] == best["max_depth"]) + and (params["max_features"] == best["max_features"]) # noqa: W503 + and (params["min_samples_leaf"] == best["min_samples_leaf"]) # noqa: W503 + and (params["min_samples_split"] == best["min_samples_split"]) # noqa: W503 +] +for el in cv_errors: + print(f"Error for n_bits={el['n_bits']} is {print_as_dollars(-el['score'])}") + +# Build the model with best hyper parameters +model = ConcreteDecisionTreeRegressor( + max_depth=gs_results.best_params_["max_depth"], + max_features=gs_results.best_params_["max_features"], + min_samples_leaf=gs_results.best_params_["min_samples_leaf"], + min_samples_split=gs_results.best_params_["min_samples_split"], + n_bits=6, + random_state=42, +) + +model, sklearn_model = model.fit_benchmark(x_train, y_train) + +# Compute average precision on test +y_pred_concrete = model.predict(x_test) +y_pred_sklearn = sklearn_model.predict(x_test) +concrete_average_precision = mean_absolute_error(y_test, y_pred_concrete) +sklearn_average_precision = mean_absolute_error(y_test, y_pred_sklearn) +print( + f"Sklearn Mean Error: {print_as_dollars(sklearn_average_precision)}," + f"{print_compare_to_baseline(sklearn_average_precision, baseline_error)}" +) +print( + f"Concrete Mean Error: {print_as_dollars(concrete_average_precision)}," + 
# Concatenate all the steps in one function of n_bits


def evaluate(n_bits):
    """Fit, score, compile and time a Concrete decision-tree regressor for one ``n_bits``.

    The tree hyper-parameters come from ``gs_results.best_params_``; only
    ``n_bits`` varies. The function prints the mean absolute error of the
    scikit-learn and Concrete models (with their comparison to
    ``baseline_error``), then the compilation, key-generation and FHE
    execution timings. It returns nothing.

    Args:
        n_bits: Value passed as ``n_bits`` to ``ConcreteDecisionTreeRegressor``.
    """
    best = gs_results.best_params_
    tuned = {
        name: best[name]
        for name in ("max_depth", "max_features", "min_samples_leaf", "min_samples_split")
    }
    regressor = ConcreteDecisionTreeRegressor(**tuned, n_bits=n_bits, random_state=42)

    concrete_model, sklearn_model = regressor.fit_benchmark(x_train, y_train)

    concrete_average_precision = mean_absolute_error(y_test, concrete_model.predict(x_test))
    sklearn_average_precision = mean_absolute_error(y_test, sklearn_model.predict(x_test))

    print(
        f"Sklearn Mean Error: {print_as_dollars(sklearn_average_precision)},"
        f"{print_compare_to_baseline(sklearn_average_precision, baseline_error)}"
    )
    print(
        f"Concrete Mean Error: {print_as_dollars(concrete_average_precision)},"
        f"{print_compare_to_baseline(concrete_average_precision, baseline_error)}"
    )

    # Compile on the first 500 training samples and time the compilation.
    x_train_subset = x_train[:500]
    begin = time.time()
    circuit = concrete_model.compile(x_train_subset)
    print(
        f"Circuit compiled with {len(x_train_subset)} samples in {(time.time() - begin):.4f} "
        "seconds"
    )
    print(f"Generating a key for an {circuit.graph.maximum_integer_bit_width()}-bit circuit")

    time_begin = time.time()
    circuit.client.keygen(force=False)
    print(f"Key generation time: {time.time() - time_begin:.2f} seconds")

    # The FHE predictions are discarded; this run is only timed.
    time_begin = time.time()
    concrete_model.predict(x_test_small, fhe="execute")
    print(f"Execution time: {(time.time() - time_begin) / FHE_SAMPLES:.2f} seconds per sample")
6, 8], +} + +grid_search = GridSearchCV( + ConcreteDecisionTreeClassifier(), + param_grid, + cv=10, + scoring="average_precision", + error_score="raise", + n_jobs=1, +) + +gs_results = grid_search.fit(x_train, y_train) +print("Best hyper parameters:", gs_results.best_params_) +print("Best score:", gs_results.best_score_) + +# Build the model with best hyper parameters +model = ConcreteDecisionTreeClassifier( + max_features=gs_results.best_params_["max_features"], + min_samples_leaf=gs_results.best_params_["min_samples_leaf"], + min_samples_split=gs_results.best_params_["min_samples_split"], + max_depth=gs_results.best_params_["max_depth"], + n_bits=6, +) + +model, sklearn_model = model.fit_benchmark(x_train, y_train) + +# Compute average precision on test +from sklearn.metrics import average_precision_score + +# pylint: disable=no-member +y_pred_concrete = model.predict_proba(x_test)[:, 1] +y_pred_sklearn = sklearn_model.predict_proba(x_test)[:, 1] +concrete_average_precision = average_precision_score(y_test, y_pred_concrete) +sklearn_average_precision = average_precision_score(y_test, y_pred_sklearn) +print(f"Sklearn average precision score: {sklearn_average_precision:0.2f}") +print(f"Concrete average precision score: {concrete_average_precision:0.2f}") + +# Show the confusion matrix on x_test +from sklearn.metrics import confusion_matrix + +y_pred = model.predict(x_test) +true_negative, false_positive, false_negative, true_positive = confusion_matrix( + y_test, y_pred, normalize="true" +).ravel() + +num_samples = len(y_test) +num_spam = sum(y_test) + +print(f"Number of test samples: {num_samples}") +print(f"Number of spams in test samples: {num_spam}") + +print(f"True Negative (legit mail well classified) rate: {true_negative}") +print(f"False Positive (legit mail classified as spam) rate: {false_positive}") +print(f"False Negative (spam mail classified as legit) rate: {false_negative}") +print(f"True Positive (spam well classified) rate: {true_positive}") + 
+from concrete.compiler import check_gpu_available + +use_gpu_if_available = False +device = "cuda" if use_gpu_if_available and check_gpu_available() else "cpu" + +# We first compile the model with some data, here the training set +circuit = model.compile(x_train, device=device) + +print(f"Generating a key for an {circuit.graph.maximum_integer_bit_width()}-bit circuit") + +time_begin = time.time() +circuit.client.keygen(force=False) +print(f"Key generation time: {time.time() - time_begin:.2f} seconds") + +# Reduce the sample size for a faster total execution time +FHE_SAMPLES = 10 +x_test = x_test[:FHE_SAMPLES] +y_pred = y_pred[:FHE_SAMPLES] +y_reference = y_test[:FHE_SAMPLES] + +# Predict in FHE for a few examples +time_begin = time.time() +y_pred_fhe = model.predict(x_test, fhe="execute") +print(f"Execution time: {(time.time() - time_begin) / len(x_test):.2f} seconds per sample") + +# Check prediction FHE vs sklearn +print(f"Ground truth: {y_reference}") +print(f"Prediction sklearn: {y_pred}") +print(f"Prediction FHE: {y_pred_fhe}") + +print( + f"{numpy.sum(y_pred_fhe == y_pred)}/" + "10 predictions are similar between the FHE model and the clear sklearn model." 
def make_regression_data(
    n_samples=200,
    n_features=1,
    bias=0.0,
    noise_scale=1.0,
    loc=0.0,
    scale=1.0,
    polynomial_exp=1,
    target_scale=1.0,
    feature_scale=1.0,
):
    """Generate a noisy polynomial dataset for regression models.

    Features are drawn from the module-level ``rng`` with ``randn``, sorted
    along axis 0 and multiplied by ``feature_scale``. Targets are
    ``X.ravel() ** polynomial_exp + bias + noise``, multiplied by
    ``target_scale``.

    Args:
        n_samples: Number of rows to draw.
        n_features: Number of feature columns.
        bias: Constant added to every target.
        noise_scale: Multiplier applied to the Gaussian noise.
        loc: Mean of the Gaussian noise.
        scale: Standard deviation of the noise before it is multiplied by
            ``polynomial_exp``.
        polynomial_exp: Exponent applied to the features.
        target_scale: Multiplier applied to the final targets.
        feature_scale: Multiplier applied to the features; ignored (forced
            to 1) when ``polynomial_exp > 1``.

    Returns:
        The ``(X, y)`` pair of features and targets.
    """
    # NOTE(review): X.ravel() has n_samples * n_features entries while the noise has
    # n_samples, so n_features > 1 presumably does not broadcast - confirm before use.
    X = rng.randn(n_samples, n_features)
    # Avoid overly large numbers on polynomial datasets
    if polynomial_exp > 1:
        feature_scale = 1
    X = feature_scale * np.sort(X, 0)
    scale = scale * polynomial_exp
    noise = noise_scale * rng.normal(loc=loc, scale=scale, size=n_samples)
    y = X.ravel() ** polynomial_exp + bias + noise
    y *= target_scale
    return X, y


# pylint: disable=too-many-locals,too-many-statements


def make_regressor_comparison(title, regressors, **kwargs):
    """Train, compile and plot each regressor against its scikit-learn counterpart.

    One subplot column is drawn per polynomial degree from 1 up to
    ``kwargs["polynomial_exp"]`` (default 1), and one row per regressor. For
    every pair the model is fitted, compiled, timed on one FHE execution,
    evaluated in simulation, and its predictions and R2 score are plotted next
    to those of the scikit-learn model.

    Args:
        title: Text printed before the comparison starts.
        regressors: Sized sequence of ``(factory, model_name)`` pairs;
            ``factory()`` must return an unfitted model.
        **kwargs: Forwarded to ``make_regression_data``; ``polynomial_exp`` is
            overridden with the degree of each column.
    """
    print(title)

    # Create subplots where each column represents a polynomial degree
    subplot_col = kwargs.get("polynomial_exp", 1)
    fig, axs = plt.subplots(len(regressors), subplot_col, figsize=(15, 8), sharex=False)

    # Create data-sets for each polynomial degree
    for i in range(subplot_col):
        kwargs_copy = kwargs.copy()
        kwargs_copy["polynomial_exp"] = i + 1
        X, y = make_regression_data(**kwargs_copy)

        # Split the data into training and test sets
        # Hold out 30 percent of the samples (test_size=0.3) for prediction
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # Sort the test points by feature value so the prediction curves plot left to right
        sort_test_index = np.argsort(X_test.ravel())
        X_test = X_test[sort_test_index, :]
        y_test = y_test[sort_test_index]

        # Feature preprocessing
        # Linear models require polynomial features to be applied before training
        # to fit a non-linear model and other models perform better with this transformation
        pipe = Pipeline(
            [
                ("poly", PolynomialFeatures(i + 1)),
                ("scaler", StandardScaler()),
            ]
        )

        X_poly_train = pipe.fit_transform(X_train)
        X_poly_test = pipe.transform(X_test)

        # Iterate over the given regressors
        for j, (regressor, model_name) in enumerate(regressors):
            print(f"Evaluation of {model_name}")
            if np.ndim(axs) > 1:
                axs[0, i].set_title(f"Polynomial degree {i + 1}")
                ax = axs[j, i]
            else:
                try:
                    axs[i].set_title(f"Polynomial degree {i + 1}")
                    ax = axs[i]
                except (IndexError, TypeError):
                    # A 1x1 grid yields a single Axes object, which is not subscriptable
                    # (TypeError), so fall back to using it directly.
                    ax = axs
                    ax.set_title(f"Polynomial degree {i + 1}")

            # Plot the training points
            ax.scatter(
                X_train,
                y_train,
                edgecolors="k",
                label="Train data",
            )

            # Plot the testing points
            ax.scatter(
                X_test,
                y_test,
                marker="D",
                alpha=0.6,
                edgecolors="k",
                label="Test data",
            )

            # Instantiate the model
            model = regressor()

            # Train the model and retrieve both the Concrete-ML model and its equivalent one from
            # scikit-learn
            # If the model is a NeuralNetRegressor, instantiate a scikit-learn MLPRegressor
            # separately in order to be able to compare the results with a float model
            # that doesn't use QAT
            if model.__class__ == NeuralNetRegressor:

                sklearn_model = MLPRegressor(
                    alpha=1,
                    activation="identity",
                    max_iter=1000,
                    hidden_layer_sizes=(25,),
                    learning_rate_init=0.005,
                )
                sklearn_model.fit(X_poly_train, y_train)

                # With PolynomialFeatures the input dim equals the polynomial degree + 1
                model.module__input_dim = i + 2
                concrete_model = model.fit(X_poly_train, y_train.reshape(-1, 1))

            else:

                concrete_model, sklearn_model = model.fit_benchmark(X_poly_train, y_train)

            # Compute the predictions in clear using the scikit-learn model
            sklearn_y_pred = sklearn_model.predict(X_poly_test)

            # Compile the Concrete-ML model
            circuit = concrete_model.compile(X_poly_train)

            print(
                "Generating a key for a " f"{circuit.graph.maximum_integer_bit_width()}-bit circuit"
            )

            time_begin = time.time()
            circuit.client.keygen(force=False)
            time_end = time.time()
            print(f"Key generation time: {time_end - time_begin:.2f} seconds")

            # Compute the predictions in FHE using the Concrete-ML model
            # (a single sample, only to measure the execution time)
            time_begin = time.time()
            concrete_y_pred = concrete_model.predict(X_poly_test[:1], fhe="execute")
            time_end = time.time()

            print(f"Execution time: {(time_end - time_begin):.2f} " "seconds per sample in FHE")

            # Compute predictions for all test examples with the simulate mode
            concrete_y_pred = concrete_model.predict(X_poly_test, fhe="simulate")

            # Measure the R2 score. r2_score expects (y_true, y_pred): the score is
            # normalised by the variance of its first argument, so the order matters.
            sklearn_score = r2_score(y_test, sklearn_y_pred)
            concrete_score = r2_score(y_test, concrete_y_pred)

            is_a_tree_based_model = concrete_model.__class__ in [
                DecisionTreeRegressor,
                RandomForestRegressor,
                XGBRegressor,
            ]

            # If the model is not a tree-based model, retrieve the maximum integer bitwidth
            # reached within its circuit.
            bitwidth = None
            if not is_a_tree_based_model:
                bitwidth = circuit.graph.maximum_integer_bit_width()

            # Plot the Concrete-ML predictions
            ax.plot(X_test, concrete_y_pred, c="blue", linewidth=2.5, label="Concrete-ML")

            # Plot the scikit-learn predictions
            ax.plot(X_test, sklearn_y_pred, c="red", linewidth=2.5, label="scikit-learn")

            ax.text(
                0.5,
                0.80,
                f"Concrete-ML R2: {concrete_score:.2f}\n scikit-learn R2: {sklearn_score:.2f}\n",
                transform=ax.transAxes,
                fontsize=12,
                va="top",
                ha="right",
            )
            if bitwidth:
                ax.text(
                    0.75,
                    0.1,
                    f"bitwidth={bitwidth}",
                    transform=ax.transAxes,
                    fontsize=12,
                    va="bottom",
                    ha="left",
                )
            handles, labels = ax.get_legend_handles_labels()
            fig.legend(handles, labels, loc="upper left")

            # Vertical spacing of the row labels: tighter when three rows are drawn
            scaler = 0.5
            if len(regressors) == 3:
                scaler = 0.3
            fig.text(
                -0.05, 0.75 - j * scaler, f"{model_name}", ha="center", va="bottom", fontsize=14
            )

    plt.tight_layout(pad=1.2)
    plt.show()
+linear_regressor = [ + (partial(LinearSVR, n_bits={"op_inputs": 5, "op_weights": 2}, C=0.5), "Linear SVR"), + (partial(LinearRegression, n_bits={"op_inputs": 5, "op_weights": 2}), "Linear Regression"), +] +make_regressor_comparison( + "linear", + linear_regressor, + polynomial_exp=3, + bias=20, + scale=0.25, + target_scale=1, + feature_scale=10, +) + +tree_regressors = [ + (partial(DecisionTreeRegressor, n_bits=5, max_depth=5), "Decision Tree"), + (partial(RandomForestRegressor, n_bits=5), "RandomForestRegressor"), + ( + partial(XGBRegressor, n_bits=6, n_estimators=50, max_depth=3, gamma=1, learning_rate=0.3), + "XGB", + ), +] + +make_regressor_comparison( + "Tree-Based Regressors", + tree_regressors, + n_samples=300, + polynomial_exp=3, + bias=20, + scale=0.25, + target_scale=1, + feature_scale=10, +) + + + +# Code from: ./FullyConnectedNeuralNetwork.ipynb +-------------------------------------------------------------------------------- + +import time + +import numpy as np +from matplotlib import pyplot as plt +from sklearn.datasets import load_iris +from sklearn.decomposition import PCA +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split +from torch import nn +from tqdm import tqdm + +from concrete.ml.sklearn import NeuralNetClassifier + +# Get iris data-set + +X, y = load_iris(return_X_y=True) + +# Split into train and test +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) + +# Scikit-Learn and Concrete ML neural networks only handle float32 input values +X_train, X_test = X_train.astype("float32"), X_test.astype("float32") + +params = { + "module__n_layers": 3, + "module__activation_function": nn.ReLU, + "max_epochs": 1000, + "verbose": 0, +} +model = NeuralNetClassifier(**params) + +model, sklearn_model = model.fit_benchmark(X=X_train, y=y_train) + +# Evaluate the sklearn model, which needs to specifically be of type float32 +y_pred_sklearn = sklearn_model.predict(X_test) 
+ +sklearn_accuracy = accuracy_score(y_test, y_pred_sklearn) * 100 +print(f"The test accuracy of the trained scikit-learn model is {sklearn_accuracy:.2f}%") + +# Evaluate the Concrete ML model in the clear +y_pred_simulated = model.predict(X_test) + +simulated_accuracy = accuracy_score(y_test, y_pred_simulated) * 100 +print(f"The test accuracy of the trained Concrete ML simulated model is {simulated_accuracy:.2f}%") + +# Compile the model to have before +fhe_circuit = model.compile(X_train) + +print("Generating a key for a " f"{fhe_circuit.graph.maximum_integer_bit_width()}-bit circuit") + +time_begin = time.time() +fhe_circuit.client.keygen(force=True) +print(f"Key generation time: {time.time() - time_begin:.2f} seconds") + +fhe_predictions = [] +time_begin = time.time() +for x in tqdm(X_test): + y_ = model.predict(np.array([x]), fhe="execute")[0] + fhe_predictions.append(y_) + +print(f"Execution time: {(time.time() - time_begin) / len(X_test):.2f} seconds per sample") + +fhe_accuracy = accuracy_score(y_test, fhe_predictions) * 100 + +print(f"Test accuracy using the sklearn model: {sklearn_accuracy:.2f}%") +print(f"Test accuracy using the Concrete ML simulated model: {simulated_accuracy:.2f}%") +print(f"Test accuracy using the Concrete ML FHE model: {fhe_accuracy:.2f}%") + +# Create a 2D grid in order to visualize predictions and contours for both models +pca = PCA(n_components=2, random_state=np.random.randint(0, 2**15)) +X_test_2d = pca.fit_transform(X_test) + +b_min = np.min(X_test_2d, axis=0) +b_max = np.max(X_test_2d, axis=0) + +grid_dims = tuple( + np.linspace(b_min[i], b_max[i], 512, dtype=X_test.dtype) for i in range(X_test_2d.shape[1]) +) +ndgrid_tuple = np.meshgrid(*grid_dims) +grid_2d = np.vstack([g.ravel() for g in ndgrid_tuple]).transpose() + +grid_test = pca.inverse_transform(grid_2d) + +# Evaluate the predicted classes using the sklearn model +grid_pred_sklearn = sklearn_model.predict_proba(grid_test) +pred_sklearn_classes = 
np.argmax(grid_pred_sklearn, axis=1) + +# Evaluate the predicted classes using the Concrete ML simulated model +# Pylint is disabled because it does not seem to be able to understand that `model` is a +# NeuralClassifier instance and support the predict_proba method. This may be solved by removing +# Skorch and Sklearn inheritance +# FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3373 +grid_pred_fhe = model.predict_proba(grid_test) # pylint: disable=no-member +pred_fhe_classes = np.argmax(grid_pred_fhe, axis=1) + +%matplotlib inline + +cmap = "autumn" + +classes_to_plot = [ + (pred_sklearn_classes, "Clear Inference (Sklearn)", sklearn_accuracy), + (pred_fhe_classes, "FHE Inference (Concrete ML)", simulated_accuracy), +] + +fig, axes = plt.subplots(1, 2, figsize=(16, 6)) + +for i, (classes, title, accuracy) in enumerate(classes_to_plot): + ax = axes[i] + + # Plot contours based on the predicted classes + ax.contourf( + ndgrid_tuple[0], + ndgrid_tuple[1], + classes.reshape(ndgrid_tuple[0].shape), + cmap=cmap, + label="ookko", + ) + + # Set the title and legend text + ax.set_title(title) + ax.text(1.6, 1, f"accuracy: {accuracy:.2f}", size=12) + + # Plot the test data as a scatter with marker borders + ax.scatter(X_test_2d[:, 0], X_test_2d[:, 1], c=y_test, s=50, edgecolors="k", cmap=cmap) + +fig.suptitle("Decision boundaries", size=15) +plt.show() + + + +# Code from: ./LinearRegression.ipynb +-------------------------------------------------------------------------------- + +import time + +import numpy as np +from sklearn.datasets import make_regression +from sklearn.linear_model import LinearRegression as SklearnLinearRegression +from sklearn.metrics import r2_score +from sklearn.model_selection import train_test_split + +from concrete.ml.sklearn import LinearRegression as ConcreteLinearRegression + +%matplotlib inline + +import matplotlib.pyplot as plt +from IPython.display import display + +train_plot_config = {"c": "black", "marker": "D", "s": 15, 
"label": "Train data"} +test_plot_config = {"c": "red", "marker": "x", "s": 15, "label": "Test data"} + + +def get_sklearn_plot_config(r2_score=None): + label = "Scikit-Learn" + if r2_score is not None: + label += f", {'$R^2$'}={r2_score:.4f}" + return {"c": "blue", "linewidth": 2.5, "label": label} + + +def get_concrete_plot_config(r2_score=None): + label = "Concrete ML" + if r2_score is not None: + label += f", {'$R^2$'}={r2_score:.4f}" + return {"c": "orange", "linewidth": 2.5, "label": label} + +# pylint: disable=unbalanced-tuple-unpacking +X, y = make_regression( + n_samples=200, n_features=1, n_targets=1, bias=5.0, noise=30.0, random_state=42 +) +# pylint: enable=unbalanced-tuple-unpacking + +# We split the data-set into a training and a testing set +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) + +# We sort the test set for a better visualization +sorted_indexes = np.argsort(np.squeeze(X_test)) +X_test = X_test[sorted_indexes, :] +y_test = y_test[sorted_indexes] + +plt.ioff() + +plt.clf() +fig, ax = plt.subplots(1, figsize=(10, 5)) +fig.patch.set_facecolor("white") +ax.scatter(X_train, y_train, **train_plot_config) +ax.scatter(X_test, y_test, **test_plot_config) +ax.legend() +display(fig) + +sklearn_lr = SklearnLinearRegression() +sklearn_lr.fit(X_train, y_train) +y_pred = sklearn_lr.predict(X_test) + +# Compute the R2 scores +sklearn_r2_score = r2_score(y_test, y_pred) + +plt.ioff() +plt.clf() + +fig, ax = plt.subplots(1, figsize=(10, 5)) +fig.patch.set_facecolor("white") +ax.scatter(X_train, y_train, **train_plot_config) +ax.scatter(X_test, y_test, **test_plot_config) +ax.plot(X_test, y_pred, **get_sklearn_plot_config(sklearn_r2_score)) +ax.legend() +display(fig) + +# We quantize the inputs using 8-bits +concrete_lr = ConcreteLinearRegression(n_bits=8) + +# We train the concrete linear regression model on clear data +concrete_lr.fit(X_train, y_train) + +# We densify the space representation of the original X, +# 
to better visualize the resulting step function in the following figure +x_space = np.linspace(X_test.min(), X_test.max(), num=300) +x_space = x_space[:, np.newaxis] +y_pred_q_space = concrete_lr.predict(x_space) + +# Now, we can test our Concrete ML model on the clear test data +y_pred_q = concrete_lr.predict(X_test) + +# Compute the R2 scores +quantized_r2_score = r2_score(y_test, y_pred_q) + +plt.ioff() + +plt.clf() +fig, ax = plt.subplots(1, figsize=(12, 8)) +fig.patch.set_facecolor("white") +ax.scatter(X_train, y_train, **train_plot_config) +ax.scatter(X_test, y_test, **test_plot_config) +ax.plot(X_test, y_pred, **get_sklearn_plot_config(sklearn_r2_score)) +ax.plot(x_space, y_pred_q_space, **get_concrete_plot_config(quantized_r2_score)) +ax.legend() +display(fig) + +fhe_circuit = concrete_lr.compile(X_train) + +print(f"Generating a key for a {fhe_circuit.graph.maximum_integer_bit_width()}-bit circuit") + +time_begin = time.time() +fhe_circuit.client.keygen(force=False) +print(f"Key generation time: {time.time() - time_begin:.4f} seconds") + +time_begin = time.time() +y_pred_fhe = concrete_lr.predict(X_test, fhe="execute") +print(f"Execution time: {(time.time() - time_begin) / len(X_test):.4f} seconds per sample") + +# Measure the FHE R2 score +fhe_r2_score = r2_score(y_test, y_pred_fhe) + +print("R^2 scores:") +print(f"scikit-learn (clear): {sklearn_r2_score:.4f}") +print(f"Concrete ML (quantized): {quantized_r2_score:.4f}") +print(f"Concrete ML (FHE): {fhe_r2_score:.4f}") + +# Measure the error of the FHE quantized model with respect to the clear scikit-learn float model +concrete_score_difference = abs(fhe_r2_score - quantized_r2_score) * 100 / quantized_r2_score +print( + "\nRelative score difference for Concrete ML (quantized clear) vs. 
Concrete ML (FHE):", + f"{concrete_score_difference:.2f}%", +) + +# Measure the error of the FHE quantized model with respect to the clear float model +score_difference = abs(fhe_r2_score - sklearn_r2_score) * 100 / sklearn_r2_score +print( + "Relative score difference for scikit-learn (clear) vs. Concrete ML (FHE) scores:", + f"{score_difference:.2f}%", +) + +# For better visualization +y_pred_q_space = concrete_lr.predict(x_space) + +plt.clf() +fig, ax = plt.subplots(1, figsize=(12, 8)) +fig.patch.set_facecolor("white") +ax.scatter(X_train, y_train, **train_plot_config) +ax.scatter(X_test, y_test, **test_plot_config) +ax.plot(X_test, y_pred, **get_sklearn_plot_config(sklearn_r2_score)) +ax.plot(x_space, y_pred_q_space, **get_concrete_plot_config(fhe_r2_score)) +ax.legend() + +display(fig) + + + +# Code from: ./ConvolutionalNeuralNetwork.ipynb +-------------------------------------------------------------------------------- + +import time + +import numpy as np +import torch +import torch.utils +from concrete.compiler import check_gpu_available +from sklearn.datasets import load_digits +from sklearn.model_selection import train_test_split +from torch import nn +from torch.utils.data import DataLoader, TensorDataset +from tqdm import tqdm + +from concrete.ml.torch.compile import compile_torch_model + +# And some helpers for visualization. + +%matplotlib inline + +import matplotlib.pyplot as plt + +X, y = load_digits(return_X_y=True) + +# The sklearn Digits data-set, though it contains digit images, keeps these images in vectors +# so we need to reshape them to 2D first. 
The images are 8x8 px in size and monochrome +X = np.expand_dims(X.reshape((-1, 8, 8)), 1) + +nplot = 4 +fig, ax = plt.subplots(nplot, nplot, figsize=(6, 6)) +for i in range(0, nplot): + for j in range(0, nplot): + ax[i, j].imshow(X[i * nplot + j, ::].squeeze()) +plt.show() + +x_train, x_test, y_train, y_test = train_test_split( + X, y, test_size=0.25, shuffle=True, random_state=42 +) + +class TinyCNN(nn.Module): + """A very small CNN to classify the sklearn digits data-set.""" + + def __init__(self, n_classes) -> None: + """Construct the CNN with a configurable number of classes.""" + super().__init__() + + # This network has a total complexity of 1216 MAC + self.conv1 = nn.Conv2d(1, 8, 3, stride=1, padding=0) + self.conv2 = nn.Conv2d(8, 16, 3, stride=2, padding=0) + self.conv3 = nn.Conv2d(16, 32, 2, stride=1, padding=0) + self.fc1 = nn.Linear(32, n_classes) + + def forward(self, x): + """Run inference on the tiny CNN, apply the decision layer on the reshaped conv output.""" + x = self.conv1(x) + x = torch.relu(x) + x = self.conv2(x) + x = torch.relu(x) + x = self.conv3(x) + x = torch.relu(x) + x = x.flatten(1) + x = self.fc1(x) + return x + +torch.manual_seed(42) + + +def train_one_epoch(net, optimizer, train_loader): + # Cross Entropy loss for classification when not using a softmax layer in the network + loss = nn.CrossEntropyLoss() + + net.train() + avg_loss = 0 + for data, target in train_loader: + optimizer.zero_grad() + output = net(data) + loss_net = loss(output, target.long()) + loss_net.backward() + optimizer.step() + avg_loss += loss_net.item() + + return avg_loss / len(train_loader) + + +# Create the tiny CNN with 10 output classes +N_EPOCHS = 150 + +# Create a train data loader +train_dataset = TensorDataset(torch.Tensor(x_train), torch.Tensor(y_train)) +train_dataloader = DataLoader(train_dataset, batch_size=64) + +# Create a test data loader to supply batches for network evaluation (test) +test_dataset = TensorDataset(torch.Tensor(x_test), 
torch.Tensor(y_test)) +test_dataloader = DataLoader(test_dataset) + +# Train the network with Adam, output the test set accuracy every epoch +net = TinyCNN(10) +losses_bits = [] +optimizer = torch.optim.Adam(net.parameters()) +for _ in tqdm(range(N_EPOCHS), desc="Training"): + losses_bits.append(train_one_epoch(net, optimizer, train_dataloader)) + +fig = plt.figure(figsize=(8, 4)) +plt.plot(losses_bits) +plt.ylabel("Cross Entropy Loss") +plt.xlabel("Epoch") +plt.title("Training set loss during training") +plt.grid(True) +plt.show() + +def test_torch(net, test_loader): + """Test the network: measure accuracy on the test set.""" + + # Freeze normalization layers + net.eval() + + all_y_pred = np.zeros((len(test_loader)), dtype=np.int64) + all_targets = np.zeros((len(test_loader)), dtype=np.int64) + + # Iterate over the batches + idx = 0 + for data, target in test_loader: + # Accumulate the ground truth labels + endidx = idx + target.shape[0] + all_targets[idx:endidx] = target.numpy() + + # Run forward and get the predicted class id + output = net(data).argmax(1).detach().numpy() + all_y_pred[idx:endidx] = output + + idx += target.shape[0] + + # Print out the accuracy as a percentage + n_correct = np.sum(all_targets == all_y_pred) + print( + f"Test accuracy for fp32 weights and activations: " + f"{n_correct / len(test_loader) * 100:.2f}%" + ) + + +test_torch(net, test_dataloader) + +def test_with_concrete(quantized_module, test_loader, use_sim): + """Test a neural network that is quantized and compiled with Concrete ML.""" + + # Casting the inputs into int64 is recommended + all_y_pred = np.zeros((len(test_loader)), dtype=np.int64) + all_targets = np.zeros((len(test_loader)), dtype=np.int64) + + # Iterate over the test batches and accumulate predictions and ground truth labels in a vector + idx = 0 + for data, target in tqdm(test_loader): + data = data.numpy() + target = target.numpy() + + fhe_mode = "simulate" if use_sim else "execute" + + # Quantize the inputs and 
cast to appropriate data type + y_pred = quantized_module.forward(data, fhe=fhe_mode) + + endidx = idx + target.shape[0] + + # Accumulate the ground truth labels + all_targets[idx:endidx] = target + + # Get the predicted class id and accumulate the predictions + y_pred = np.argmax(y_pred, axis=1) + all_y_pred[idx:endidx] = y_pred + + # Update the index + idx += target.shape[0] + + # Compute and report results + n_correct = np.sum(all_targets == all_y_pred) + + return n_correct / len(test_loader) + +n_bits = 6 + +use_gpu_if_available = False +device = "cuda" if use_gpu_if_available and check_gpu_available() else "cpu" + +q_module = compile_torch_model(net, x_train, rounding_threshold_bits=6, p_error=0.1, device=device) + +start_time = time.time() +accs = test_with_concrete( + q_module, + test_dataloader, + use_sim=True, +) +sim_time = time.time() - start_time + +print(f"Simulated FHE execution for {n_bits} bit network accuracy: {accs:.2f}%") + +# Generate keys first +t = time.time() +q_module.fhe_circuit.keygen() +print(f"Keygen time: {time.time()-t:.2f}s") + +# Run inference in FHE on a single encrypted example +mini_test_dataset = TensorDataset(torch.Tensor(x_test[:100, :]), torch.Tensor(y_test[:100])) +mini_test_dataloader = DataLoader(mini_test_dataset) + +t = time.time() +accuracy_test = test_with_concrete( + q_module, + mini_test_dataloader, + use_sim=False, +) +elapsed_time = time.time() - t +time_per_inference = elapsed_time / len(mini_test_dataset) +accuracy_percentage = 100 * accuracy_test + +print( + f"Time per inference in FHE: {time_per_inference:.2f} " + f"with {accuracy_percentage:.2f}% accuracy" +) + diff --git a/src/concrete/ml/torch/hybrid_backprop_linear.py b/src/concrete/ml/torch/hybrid_backprop_linear.py new file mode 100644 index 000000000..308d6bfe9 --- /dev/null +++ b/src/concrete/ml/torch/hybrid_backprop_linear.py @@ -0,0 +1,116 @@ +"""Linear layer implementations for backprop FHE-compatible models.""" + +from torch import autograd, nn + +# 
pylint: disable=arguments-differ,abstract-method + + +class ForwardModuleLinear(nn.Module): + """Forward module for linear layers.""" + + def __init__(self, weight, bias=None, weight_transposed=False): + super().__init__() + self.weight = weight + self.bias = bias + self.weight_transposed = weight_transposed # If True, weight is (in_features, out_features) + + def forward(self, input_tensor): + """Forward pass for linear layers. + + Args: + input_tensor: The input tensor. + + Returns: + The output tensor after applying the linear transformation. + """ + if self.weight_transposed: + # Weight is (in_features, out_features) + output = input_tensor @ self.weight + else: + # Weight is (out_features, in_features) + output = input_tensor @ self.weight.t() + if self.bias is not None: + output += self.bias + return output + + +class BackwardModuleLinear(nn.Module): + """Backward module for linear layers.""" + + def __init__(self, weight, weight_transposed=False): + super().__init__() + self.weight = weight + self.weight_transposed = weight_transposed + + def forward(self, grad_output): + """Backward pass for linear layers. + + Args: + grad_output: The gradient output tensor. + + Returns: + The gradient input tensor after applying the backward pass. + """ + if self.weight_transposed: + grad_input = grad_output @ self.weight.t() + else: + grad_input = grad_output @ self.weight + return grad_input + + +class CustomLinear(nn.Module): + """Custom linear module.""" + + def __init__(self, weight, bias=None, weight_transposed=False): + super().__init__() + self.forward_module = ForwardModuleLinear(weight, bias, weight_transposed) + self.backward_module = BackwardModuleLinear(weight, weight_transposed) + + def forward(self, input_tensor): + """Forward pass of the custom linear module. + + Args: + input_tensor: The input tensor. + + Returns: + The output tensor after applying the custom linear module. 
+ """ + return ForwardBackwardModule.apply(input_tensor, self.forward_module, self.backward_module) + + +class ForwardBackwardModule(autograd.Function): + """Custom autograd function for forward and backward passes.""" + + @staticmethod + def forward(ctx, input_tensor, forward_module, backward_module): + """Forward pass of the custom autograd function. + + Args: + ctx: The context object. + input_tensor: The input tensor. + forward_module: The forward module. + backward_module: The backward module. + + Returns: + The output tensor after applying the forward pass. + """ + ctx.backward_module = backward_module + output = forward_module.forward(input_tensor) + return output + + @staticmethod + def backward(ctx, grad_output): + """Backward pass of the custom autograd function. + + Args: + ctx: The context object. + grad_output: The gradient output tensor. + + Returns: + The gradient input tensor after applying the backward pass. + """ + backward_module = ctx.backward_module + grad_input = backward_module.forward(grad_output) + + # grad_weight and grad_bias are not needed when computing the backward for LoRA + return grad_input, None, None diff --git a/src/concrete/ml/torch/hybrid_model.py b/src/concrete/ml/torch/hybrid_model.py index 8cc4e69f2..5aa58e5a0 100644 --- a/src/concrete/ml/torch/hybrid_model.py +++ b/src/concrete/ml/torch/hybrid_model.py @@ -29,7 +29,7 @@ compile_torch_model, has_any_qnn_layers, ) -from .lora import BackwardModuleLinear, ForwardModuleLinear +from .hybrid_backprop_linear import BackwardModuleLinear, ForwardModuleLinear def tuple_to_underscore_str(tup: Tuple) -> str: @@ -389,7 +389,6 @@ def __init__( def _replace_modules(self): """Replace the private modules in the model with remote layers.""" - self._has_only_large_linear_layers = True for module_name in self.module_names: # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3858 @@ -682,7 +681,9 @@ def clear_private_info(module): # Save the model with a specific filename model_path 
= path / "model.pth" - torch.save(self.model, model_path.resolve()) + # Save the model state dict due to a Brevitas issue + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4572 + torch.save(self.model.state_dict(), model_path.resolve()) # Save the FHE circuit in the same directory self._save_fhe_circuit(path, via_mlir=via_mlir) diff --git a/src/concrete/ml/torch/lora.py b/src/concrete/ml/torch/lora.py index 5a069737a..f516816ec 100644 --- a/src/concrete/ml/torch/lora.py +++ b/src/concrete/ml/torch/lora.py @@ -1,16 +1,22 @@ -"""This module contains classes for LoRA (Low-Rank Adaptation) training and custom layers.""" +"""This module contains classes for LoRA (Low-Rank Adaptation) FHE training and custom layers.""" from typing import List, Tuple, Union import torch +from torch import Tensor, nn +from torch.utils.data import DataLoader +from tqdm import tqdm + +from .hybrid_backprop_linear import CustomLinear +from .hybrid_model import HybridFHEModel try: from transformers import Conv1D as TransformerConv1D -except ImportError: +except ImportError: # pragma: no cover TransformerConv1D = None # Create a tuple of linear layer classes to check against -LINEAR_LAYERS: tuple = (torch.nn.Linear,) +LINEAR_LAYERS: tuple = (nn.Linear,) if TransformerConv1D is not None: LINEAR_LAYERS = LINEAR_LAYERS + (TransformerConv1D,) @@ -19,6 +25,23 @@ # pylint: disable=arguments-differ +def try_dict(obj): + """Try to convert the object to a dict. + + Args: + obj: The object to convert to a dict. + + Returns: + The object converted to a dict or None if the conversion fails. + """ + if isinstance(obj, dict): + return obj + try: + return dict(obj) + except (TypeError, ValueError): + return None + + class LoraTraining(torch.nn.Module): """LoraTraining module for fine-tuning with LoRA in a hybrid model setting. @@ -31,49 +54,85 @@ class LoraTraining(torch.nn.Module): toggle between calibration and optimization modes. 
Args: - inference_model (torch.nn.Module): The base model to be fine-tuned. - n_layers_to_skip (int): Number of layers to skip. Linear layers that do not require - gradient to be propagated are skipped. Defaults to 1. + model (torch.nn.Module): The base model with LoRA layers to be fine-tuned. + n_layers_to_skip_for_backprop (int): Number of initial linear layers to keep as standard + layers. Since the first layer doesn't need backpropagation (no previous layer to + update), we typically skip 1 layer. Defaults to 1. + loss_fn (callable, optional): Loss function to compute the loss. If None, the model + is expected to return a loss. """ - def __init__(self, inference_model, n_layers_to_skip: int = 1) -> None: + def __init__(self, model, n_layers_to_skip_for_backprop=1, loss_fn=None): super().__init__() - self.inference_model = inference_model - - self.replace_layers_with_custom(self.inference_model, n_layers_to_skip) + # Assert that the model contains LoRA layers + self.assert_has_lora_layers(model) - self.optimizer = None - self.lr_scheduler = None - self.loss_fn = None - self.gradient_accumulation_steps = 1 - self.max_grad_norm = None + self.inference_model = model + self.replace_layers_with_custom(self.inference_model, n_layers_to_skip_for_backprop) self.calibrate = False - self.run_optimizer = False + self.loss_fn = loss_fn + self.loss_scaling_factor = 1.0 + + def set_loss_scaling_factor(self, loss_scaling_factor: float): + """Set the loss scaling factor for gradient accumulation. + + Args: + loss_scaling_factor (float): The factor to scale the loss by. + """ + self.loss_scaling_factor = loss_scaling_factor @staticmethod - def replace_layers_with_custom(model: torch.nn.Module, n_layers_to_skip: int): - """Replace linear layers with custom ones. + def assert_has_lora_layers(model): + """Assert that the model contains LoRA layers. + + Args: + model (torch.nn.Module): The model to check for LoRA layers. 
+ + Raises: + ValueError: If the model does not contain any LoRA layers. + """ + + def is_lora_module(module): + # Check for common LoRA attributes with case-insensitive matching + lora_attributes = ["lora_a", "lora_b", "lora_dropout"] + return any( + hasattr(module, attr) + or hasattr(module, attr.lower()) + or hasattr(module, attr.upper()) + for attr in lora_attributes + ) + + has_lora = any(is_lora_module(module) for module in model.modules()) + + if not has_lora: + raise ValueError("The model does not contain any detectable LoRA layers.") - This method replaces eligible linear layers in the model with custom layers - that are compatible with the LoRA training procedure. + print("LoRA layers detected in the model.") + + @staticmethod + def replace_layers_with_custom(model: nn.Module, n_layers_to_skip_for_backprop: int) -> None: + """Replace linear layers with custom ones. Args: - model (torch.nn.Module): The model to replace layers in. - n_layers_to_skip (int): Number of layers to skip. + model (nn.Module): The model to replace layers in. + n_layers_to_skip_for_backprop (int): Number of initial linear layers to keep as standard + layers. Since the first layer doesn't need backpropagation (no previous layer to + update), we typically skip 1 layer. Defaults to 1. 
""" - def _replace(module: torch.nn.Module): - nonlocal n_layers_to_skip + def _replace(module: nn.Module): + nonlocal n_layers_to_skip_for_backprop for name, child in list(module.named_children()): - # Skip modules containing "lora" in their name + + # Skip lora layers as they are computed on the client side if "lora" in name: continue if isinstance(child, LINEAR_LAYERS): - if n_layers_to_skip > 0: - n_layers_to_skip -= 1 + if n_layers_to_skip_for_backprop > 0: + n_layers_to_skip_for_backprop -= 1 # Skip the first eligible layer continue @@ -85,7 +144,9 @@ def _replace(module: torch.nn.Module): # Create the CustomLinear layer custom_layer = CustomLinear( - weight=child.weight, bias=child.bias, weight_transposed=weight_transposed + weight=child.weight, + bias=child.bias, + weight_transposed=weight_transposed, ) # Replace the original layer with the custom layer @@ -96,251 +157,221 @@ def _replace(module: torch.nn.Module): _replace(model) - def update_training_parameters( - self, optimizer=None, lr_scheduler=None, loss_fn=None, training_args=None - ): - """Update training parameters for the LoRA module. + def toggle_calibrate(self, enable: bool = True): + """Toggle calibration mode. Args: - optimizer (optional): The optimizer to use for training. - lr_scheduler (optional): The learning rate scheduler to use for training. - loss_fn (callable, optional): Loss function to compute the loss. - training_args (dict or namespace, optional): Training arguments containing - 'gradient_accumulation_steps' and 'max_grad_norm'. + enable (bool): Whether to enable calibration mode. 
""" - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler - self.loss_fn = loss_fn - - if training_args is not None: - # Check if training_args is a dict or an object with attributes - if isinstance(training_args, dict): - self.gradient_accumulation_steps = training_args.get( - "gradient_accumulation_steps", 1 - ) - self.max_grad_norm = training_args.get("max_grad_norm", None) - else: - self.gradient_accumulation_steps = getattr( - training_args, "gradient_accumulation_steps", 1 - ) - self.max_grad_norm = getattr(training_args, "max_grad_norm", None) - else: - self.gradient_accumulation_steps = 1 - self.max_grad_norm = None + self.calibrate = enable - def forward( - self, inputs: Tuple[torch.Tensor, ...] - ) -> Tuple[torch.Tensor, Union[torch.Tensor, None]]: + def forward(self, inputs: Tuple[Tensor, ...]) -> Tuple[Tensor, Union[Tensor, None]]: """Forward pass of the LoRA training module. Args: - inputs (tuple): A tuple containing the input tensors. The first two elements should be - the features and the labels. Additional elements will be passed - to the model as needed. + inputs (tuple): A tuple containing the input tensors. Returns: - A tuple containing the loss and gradient norm. + A tuple containing the original (unscaled) loss and None. Raises: - ValueError: If the model does not return a loss when `self.loss_fn` is None. + ValueError: If the model does not return a loss and no loss function is provided. 
""" assert ( len(inputs) >= 2 ), "Expected at least two inputs in the tuple: inputs (x) and targets (y)" - # Remove this once hybrid model supports multiple inputs - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4568 - # Extract x (input features) and y (labels) - x, y = inputs[0], inputs[1] + # FIXME: + # Remove when hybrid model supports multiple inputs modules + # Unpack model inputs and labels + *model_inputs, y = inputs - # Additional inputs, if any (e.g., attention_mask) - additional_inputs = inputs[2:] - - # If no loss function is provided, we assume the model can compute the loss internally if self.loss_fn is None: - # Forward pass through the inference model with labels - outputs = self.inference_model(x, labels=y, *additional_inputs) + # Pass inputs and labels to the model + outputs = self.inference_model(*model_inputs, labels=y) - # Use getattr to safely access the loss attribute from the outputs - loss = getattr(outputs, "loss", None) + # Check if outputs is a dict and retrieve the loss + if isinstance(outputs, dict): + loss = outputs.get("loss", None) + else: + loss = getattr(outputs, "loss", None) if loss is None: raise ValueError( - "The model did not return a loss. Ensure that 'labels' are correctly provided." 
+ "The model did not return a loss.", + "Ensure that 'labels' are correctly provided or provide a loss_fn.", ) else: - # Forward pass through the inference model without labels - outputs = self.inference_model(x, *additional_inputs) - - # If the outputs contain several keys, extract the logits + # Forward pass without labels; compute loss manually + outputs = self.inference_model(*model_inputs) if isinstance(outputs, dict) and "logits" in outputs: outputs = outputs["logits"] - - # Compute the loss using the provided loss function loss = self.loss_fn(outputs, y) - # Scale the loss based on gradient accumulation - loss = loss / self.gradient_accumulation_steps + # Scale the loss for gradient accumulation + scaled_loss = loss / self.loss_scaling_factor - # Update gradients # We need to set requires grad to the loss manually because the inference model's last # step is the "lm_head" layer, which might be detached from the graph by the hybrid model - loss.requires_grad_(True) - loss.backward() - - grad_norm = None - if not self.calibrate and self.run_optimizer: - if self.max_grad_norm is not None: - grad_norm = torch.nn.utils.clip_grad_norm_( - self.inference_model.parameters(), max_norm=self.max_grad_norm, norm_type=2 - ) + scaled_loss.requires_grad_(True) + scaled_loss.backward() - if self.optimizer is not None: - self.optimizer.step() + # Return the original (unscaled) loss for logging + return loss.detach(), None - if self.lr_scheduler is not None: - self.lr_scheduler.step() - self.inference_model.zero_grad() +class LoraTrainer: + """Trainer class for LoRA fine-tuning with FHE support. - # Clean gradients after calibration - elif self.calibrate: - self.inference_model.zero_grad() + This class handles the training loop, optimizer, scheduler, + and integrates with the hybrid model. - return loss, grad_norm - - def toggle_calibrate(self, enable: bool = True): - """Toggle calibration mode. - - Args: - enable (bool): Whether to enable calibration mode. 
- """ - self.calibrate = enable - - def toggle_run_optimizer(self, enable: bool = True): - """Toggle optimizer execution. + Args: + model (nn.Module): The base model with LoRA layers to be fine-tuned. + optimizer (torch.optim.Optimizer): Optimizer for training. + loss_fn (callable): Loss function to compute the loss. + lr_scheduler (optional): Learning rate scheduler. + training_args (dict): Training arguments. + n_layers_to_skip_for_backprop (int): Number of initial linear layers to keep as standard + layers. Since the first layer doesn't need backpropagation (no previous layer to + update), we typically skip 1 layer. Defaults to 1. + """ - Args: - enable (bool): Whether to enable optimizer execution. - """ - self.run_optimizer = enable + def __init__( + self, + model, + optimizer=None, + loss_fn=None, + lr_scheduler=None, + training_args=None, + n_layers_to_skip_for_backprop=1, + ): + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + self.training_args = training_args or {} + self.gradient_accumulation_steps = self.training_args.get("gradient_accumulation_steps", 1) + self.max_grad_norm = self.training_args.get("max_grad_norm", None) + # Create the LoRA training module + self.lora_training_module = LoraTraining( + model, n_layers_to_skip_for_backprop=n_layers_to_skip_for_backprop, loss_fn=loss_fn + ) -class ForwardModuleLinear(torch.nn.Module): - """Forward module for linear layers.""" + # Determine modules to be executed remotely + self.remote_names = get_remote_names(self.lora_training_module) - def __init__(self, weight, bias=None, weight_transposed=False): - super().__init__() - self.weight = weight - self.bias = bias - self.weight_transposed = weight_transposed # If True, weight is (in_features, out_features) + # Create the hybrid model + self.hybrid_model = HybridFHEModel( + self.lora_training_module, module_names=self.remote_names + ) - def forward(self, input_tensor): - """Forward pass for linear layers. 
+ def compile(self, inputset, n_bits=8): + """Compile the hybrid model with the given input set. Args: - input_tensor: The input tensor. - - Returns: - The output tensor after applying the linear transformation. + inputset (tuple): Input set for compilation. + n_bits (int): Bit width for quantization. """ - if self.weight_transposed: - # Weight is (in_features, out_features) - output = input_tensor @ self.weight - else: - # Weight is (out_features, in_features) - output = input_tensor @ self.weight.t() - if self.bias is not None: - output += self.bias - return output - - -class BackwardModuleLinear(torch.nn.Module): - """Backward module for linear layers.""" + self.lora_training_module.toggle_calibrate(enable=True) + self.hybrid_model.compile_model(inputset, n_bits=n_bits) + self.lora_training_module.toggle_calibrate(enable=False) + + def train( + self, + train_loader: DataLoader, + num_epochs: int = 10, + fhe: str = "simulate", + ): + """Train the model using the hybrid FHE model. - def __init__(self, weight, weight_transposed=False): - super().__init__() - self.weight = weight - self.weight_transposed = weight_transposed + Args: + train_loader (DataLoader): DataLoader for training data. + num_epochs (int): Number of epochs to train. + fhe (str): FHE mode ('disable', 'simulate', 'execute' or 'torch'). + """ + device = torch.device("cpu") + self.lora_training_module.to(device) + self.lora_training_module.inference_model.train() - def forward(self, grad_output): - """Backward pass for linear layers. + # Set the loss scaling factor for gradient accumulation + self.lora_training_module.set_loss_scaling_factor(self.gradient_accumulation_steps) - Args: - grad_output: The gradient output tensor. + epoch_pbar = tqdm(range(1, num_epochs + 1), desc="Training", unit="epoch") - Returns: - The gradient input tensor after applying the backward pass. 
- """ - if self.weight_transposed: - grad_input = grad_output @ self.weight.t() - else: - grad_input = grad_output @ self.weight - return grad_input + for epoch in epoch_pbar: + total_loss = 0.0 + self.optimizer.zero_grad() # Zero gradients at the start of the epoch + for step, batch in enumerate(train_loader): -class CustomLinear(torch.nn.Module): - """Custom linear module.""" + # Convert the batch to a tuple of inputs on the device. + if batch_dict := try_dict(batch): + batch = batch_dict + # Convert dict to tuple of values and move them to the device + batch = tuple( + v.to(device) if isinstance(v, torch.Tensor) else v for v in batch.values() + ) + elif isinstance(batch, (tuple, list)): + # Move tuple/list elements to the device + batch = tuple( + item.to(device) if isinstance(item, torch.Tensor) else item + for item in batch + ) + else: + # If it's a single non-tensor item, wrap it in a tuple + batch = (batch,) - def __init__(self, weight, bias=None, weight_transposed=False): - super().__init__() - self.forward_module = ForwardModuleLinear(weight, bias, weight_transposed) - self.backward_module = BackwardModuleLinear(weight, weight_transposed) + # Forward pass through the hybrid model + loss, _ = self.hybrid_model(batch, fhe=fhe) - def forward(self, input_tensor): - """Forward pass of the custom linear module. + # Loss scaling and backward is done inside LoraTraining - Args: - input_tensor: The input tensor. + # Accumulate loss for logging + total_loss += loss.item() - Returns: - The output tensor after applying the custom linear module. 
- """ - return ForwardBackwardModule.apply(input_tensor, self.forward_module, self.backward_module) + # Update weights after gradient accumulation steps + if (step + 1) % self.gradient_accumulation_steps == 0 or (step + 1) == len( + train_loader + ): + if self.max_grad_norm is not None: + torch.nn.utils.clip_grad_norm_( + self.lora_training_module.parameters(), self.max_grad_norm + ) + # Optimizer step + self.optimizer.step() -class ForwardBackwardModule(torch.autograd.Function): - """Custom autograd function for forward and backward passes.""" + # Scheduler step + if self.lr_scheduler is not None: + self.lr_scheduler.step() - @staticmethod - def forward(ctx, input_tensor, forward_module, backward_module): - """Forward pass of the custom autograd function. + # Zero gradients + self.optimizer.zero_grad() - Args: - ctx: The context object. - input_tensor: The input tensor. - forward_module: The forward module. - backward_module: The backward module. + avg_loss = total_loss / len(train_loader) + epoch_pbar.set_postfix( + { + "Epoch": epoch, + "Avg Loss": f"{avg_loss:.4f}", + "FHE Mode": fhe, + } + ) - Returns: - The output tensor after applying the forward pass. - """ - ctx.backward_module = backward_module - output = forward_module.forward(input_tensor) - return output + print(f"Training completed. Final Avg Loss: {avg_loss:.4f}, FHE Mode: {fhe}") - @staticmethod - def backward(ctx, grad_output): - """Backward pass of the custom autograd function. + def save_and_clear_private_info(self, path): + """Save the model and remove private information. Args: - ctx: The context object. - grad_output: The gradient output tensor. - - Returns: - The gradient input tensor after applying the backward pass. + path (str): The path to save the model. 
""" - backward_module = ctx.backward_module - grad_input = backward_module.forward(grad_output) - - # grad_weight and grad_bias are not needed when computing the backward for LoRA - return grad_input, None, None + self.hybrid_model.save_and_clear_private_info(path) -def get_remote_names(model: torch.nn.Module, include_embedding_layers: bool = False) -> List[str]: +def get_remote_names(model: nn.Module, include_embedding_layers: bool = False) -> List[str]: """Get names of modules to be executed remotely. Args: - model (torch.nn.Module): The model to inspect. + model (nn.Module): The model to inspect. include_embedding_layers (bool): Whether to include embedding layers. Returns: @@ -363,7 +394,7 @@ def get_remote_names(model: torch.nn.Module, include_embedding_layers: bool = Fa elif isinstance(module, CustomLinear): remote_names.append(f"{name}.forward_module") remote_names.append(f"{name}.backward_module") - elif include_embedding_layers and (isinstance(module, torch.nn.Embedding) or is_lm_head): + elif include_embedding_layers and (isinstance(module, nn.Embedding) or is_lm_head): remote_names.append(name) return remote_names diff --git a/tests/torch/test_lora.py b/tests/torch/test_lora.py index a3ee1a03e..d9bee88e5 100644 --- a/tests/torch/test_lora.py +++ b/tests/torch/test_lora.py @@ -1,463 +1,580 @@ -# pylint: disable=redefined-outer-name +"""Tests for the LoRA (Low-Rank Adaptation) functionality in the torch module.""" -"""Tests for the LoraTraining class and related modules in lora.py.""" +# pylint: disable=redefined-outer-name -import sys -from collections import namedtuple -from types import SimpleNamespace -from unittest import mock +from unittest.mock import MagicMock import pytest import torch from torch import nn -from torch.optim import SGD -from torch.optim.lr_scheduler import StepLR -from transformers import Conv1D as TransformerConv1D +from torch.utils.data import DataLoader, Dataset, TensorDataset -from concrete.ml.torch.lora import ( +from 
concrete.ml.torch.hybrid_backprop_linear import ( BackwardModuleLinear, CustomLinear, - ForwardBackwardModule, ForwardModuleLinear, - LoraTraining, - get_remote_names, ) +from concrete.ml.torch.lora import LoraTrainer, LoraTraining, get_remote_names +# Dummy models and datasets for testing -class DummyConfig: - """A dummy configuration class to mimic model config.""" - - def __init__(self, model_type): - self.model_type = model_type +class DummyLoRAModel(nn.Module): + """Dummy LoRA model for testing.""" -class DummyBaseModel: - """A dummy base model class to mimic base_model.model.""" - - def __init__(self, model_type): - self.model = DummyModel(model_type) + def __init__(self): + super().__init__() + # Simulate LoRA layers by including 'lora_a' attribute + self.lora_a = nn.Parameter(torch.randn(10, 10)) + self.linear1 = nn.Linear(10, 20) + self.linear2 = nn.Linear(20, 10) + + def forward(self, x, **kwargs): + """Forward pass.""" + labels = kwargs.get("labels", None) + logits = self.linear2(torch.relu(self.linear1(x))) + if labels is not None: + loss = nn.functional.mse_loss(logits, labels) + return {"loss": loss} + return {"logits": logits} -class DummyModel(torch.nn.Module): - """A dummy model class to mimic the actual model.""" +class DummyLoRAModelNoLoss(nn.Module): + """Dummy LoRA model without loss function for testing.""" - def __init__(self, model_type): + def __init__(self): super().__init__() - self.config = DummyConfig(model_type) + self.lora_a = nn.Parameter(torch.randn(10, 10)) + self.linear1 = nn.Linear(10, 20) + self.linear2 = nn.Linear(20, 10) - @staticmethod - def forward(x): - """Dummy forward method.""" - return x + def forward(self, x): + """Forward pass.""" + logits = self.linear2(torch.relu(self.linear1(x))) + return {"logits": logits} -class DummyInferenceModel(torch.nn.Module): - """A dummy inference model with various layers.""" +class DummyModel(nn.Module): + """Dummy model for testing.""" def __init__(self): super().__init__() - 
self.base_model = DummyBaseModel("gpt2") - self.linear1 = torch.nn.Linear(2, 2) - self.conv1d = TransformerConv1D(2, 2) - self.linear2 = torch.nn.Linear(2, 2) - self.lora_layer = torch.nn.Linear(2, 2) # Layer with 'lora' in name - self.lora_layer_name = "lora_layer" - - def forward(self, x, labels=None): - """A simple forward method that returns logits or loss.""" - x = self.linear1(x) - x = self.conv1d(x) - x = self.linear2(x) - x = self.lora_layer(x) - logits = x - if labels is not None: - loss = ((logits - labels) ** 2).mean() - Output = namedtuple("Output", ["loss"]) - return Output(loss=loss) - return {"logits": logits, "something_else": torch.tensor(1.0)} + self.linear1 = nn.Linear(10, 20) + self.linear2 = nn.Linear(20, 10) + + def forward(self, x): + """Forward pass.""" + logits = self.linear2(torch.relu(self.linear1(x))) + return {"logits": logits} @pytest.fixture -def base_inference_model(): - """Fixture for creating a DummyInferenceModel instance.""" - return DummyInferenceModel() +def dummy_lora_model(): + """Dummy LoRA model for testing.""" + return DummyLoRAModel() @pytest.fixture -def base_lora_training(base_inference_model): - """Fixture for creating a LoraTraining instance.""" - return LoraTraining(base_inference_model) +def dummy_model(): + """Dummy model for testing.""" + return DummyModel() -@pytest.mark.parametrize("n_layers_to_skip", [0, 1, 2]) -def test_lora_training_replace_layers(base_lora_training, n_layers_to_skip): - """Test that LoraTraining replaces layers correctly.""" - original_linear1 = base_lora_training.inference_model.linear1 - original_lora_layer = base_lora_training.inference_model.lora_layer +def test_assert_has_lora_layers_with_lora_layers(dummy_lora_model): + """Test assert_has_lora_layers with LoRA layers.""" + LoraTraining.assert_has_lora_layers(dummy_lora_model) - # Replace layers with custom layers - base_lora_training.replace_layers_with_custom( - base_lora_training.inference_model, n_layers_to_skip=n_layers_to_skip - ) 
- inference_model = base_lora_training.inference_model +def test_assert_has_lora_layers_without_lora_layers(dummy_model): + """Test assert_has_lora_layers without LoRA layers.""" + with pytest.raises(ValueError) as exc_info: + LoraTraining.assert_has_lora_layers(dummy_model) + assert "The model does not contain any detectable LoRA layers" in str(exc_info.value) - if n_layers_to_skip > 0: - # First eligible layer should be skipped - assert inference_model.linear1 is original_linear1 - else: - assert isinstance(inference_model.linear1, CustomLinear) - # Check that other eligible layers are replaced - assert isinstance(inference_model.conv1d, CustomLinear) - assert isinstance(inference_model.linear2, CustomLinear) +def test_replace_layers_with_custom(): + """Test replace_layers_with_custom.""" + model = DummyLoRAModel() + n_layers_to_skip_for_backprop = 1 + LoraTraining.replace_layers_with_custom(model, n_layers_to_skip_for_backprop) + # First linear layer should be skipped, second replaced + assert isinstance(model.linear1, nn.Linear) + assert isinstance(model.linear2, CustomLinear) - # 'lora' layers should not be replaced - assert inference_model.lora_layer is original_lora_layer +def test_replace_layers_with_custom_skips_lora_layers(): + """Test replace_layers_with_custom skips LoRA layers.""" -@pytest.mark.parametrize( - "training_args", - [ - {"gradient_accumulation_steps": 2, "max_grad_norm": 1.0}, # dict - SimpleNamespace(gradient_accumulation_steps=2, max_grad_norm=1.0), # namespace - None, # None - ], -) -def test_update_training_parameters(base_lora_training, training_args): - """Test update_training_parameters with different types of training_args.""" - inference_model = base_lora_training.inference_model - optimizer = SGD(inference_model.parameters(), lr=0.01) - lr_scheduler = StepLR(optimizer, step_size=1) - loss_fn = nn.MSELoss() + class ModelWithLoraLayer(nn.Module): + """Model with LoRA layer for testing.""" - 
base_lora_training.update_training_parameters(optimizer, lr_scheduler, loss_fn, training_args) + def __init__(self): + super().__init__() + self.lora_linear = nn.Linear(10, 10) + self.linear = nn.Linear(10, 10) + + def forward(self, x): + """Forward pass.""" + x = self.lora_linear(x) + return self.linear(x) - assert base_lora_training.optimizer is optimizer - assert base_lora_training.lr_scheduler is lr_scheduler - assert base_lora_training.loss_fn is loss_fn + model = ModelWithLoraLayer() + n_layers_to_skip_for_backprop = 0 + LoraTraining.replace_layers_with_custom(model, n_layers_to_skip_for_backprop) + assert isinstance(model.lora_linear, nn.Linear) # Should not be replaced + assert isinstance(model.linear, CustomLinear) # Should be replaced - if training_args is None: - assert base_lora_training.gradient_accumulation_steps == 1 # Default - assert base_lora_training.max_grad_norm is None # Default - else: - assert base_lora_training.gradient_accumulation_steps == 2 - assert base_lora_training.max_grad_norm == 1.0 +def test_replace_layers_with_custom_recursive(): + """Test replace_layers_with_custom with nested modules.""" -def test_lora_training_forward_loss_fn_none(base_lora_training): - """Test the forward method when loss_fn is None.""" - x = torch.tensor([[1.0, 2.0]]) - y = torch.tensor([[0.5, 1.5]]) + class ModelWithNestedModules(nn.Module): + """Model with nested modules for testing.""" - loss, _ = base_lora_training((x, y)) + def __init__(self): + super().__init__() + self.layer1 = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 10)) - expected_loss = ( - base_lora_training.inference_model(x, labels=y).loss - / base_lora_training.gradient_accumulation_steps - ).item() + def forward(self, x): + """Forward pass.""" + return self.layer1(x) - assert abs(loss.item() - expected_loss) < 1e-6 + model = ModelWithNestedModules() + n_layers_to_skip_for_backprop = 0 + LoraTraining.replace_layers_with_custom(model, n_layers_to_skip_for_backprop) + assert 
isinstance(model.layer1[0], CustomLinear) + assert isinstance(model.layer1[1], nn.ReLU) # Should not be replaced + assert isinstance(model.layer1[2], CustomLinear) -def test_lora_training_forward_with_loss_fn(base_lora_training): - """Test the forward method when loss_fn is provided.""" +def test_forward_with_loss_fn(): + """Test forward with loss function.""" + model = DummyLoRAModel() loss_fn = nn.MSELoss() - base_lora_training.update_training_parameters(loss_fn=loss_fn) + lora_training = LoraTraining(model, loss_fn=loss_fn) + x = torch.randn(5, 10) + y = torch.randn(5, 10) + loss, _ = lora_training((x, y)) + assert isinstance(loss, torch.Tensor) - x = torch.tensor([[1.0, 2.0]]) - y = torch.tensor([[0.5, 1.5]]) - outputs = base_lora_training.inference_model(x) - expected_loss = loss_fn(outputs["logits"], y) / base_lora_training.gradient_accumulation_steps +def test_forward_without_loss_fn_model_returns_loss(): + """Test forward without loss function when model returns loss.""" + model = DummyLoRAModel() + lora_training = LoraTraining(model) + x = torch.randn(5, 10) + y = torch.randn(5, 10) + loss, _ = lora_training((x, y)) + assert isinstance(loss, torch.Tensor) - loss, _ = base_lora_training((x, y)) - assert abs(loss.item() - expected_loss.item()) < 1e-6 +def test_forward_without_loss_fn_model_returns_loss_as_attribute(): + """Test forward without loss function when model returns loss as attribute.""" + class DummyLoRAModelReturnsObject(nn.Module): + """Dummy LoRA model returning object with loss.""" -def test_lora_training_forward_no_loss(): - """Test that LoraTraining raises ValueError when model does not return a loss.""" + def __init__(self): + super().__init__() + self.lora_a = nn.Parameter(torch.randn(10, 10)) + self.linear1 = nn.Linear(10, 20) + self.linear2 = nn.Linear(20, 10) - class NoLossInferenceModel(DummyInferenceModel): - """An inference model that does not return a loss.""" + def forward(self, x, **kwargs): + """Forward pass.""" + labels = 
kwargs.get("labels", None) + logits = self.linear2(torch.relu(self.linear1(x))) - def forward(self, x, labels=None): - """Forward method that does not return loss.""" - Output = namedtuple("Output", ["something_else"]) - return Output(something_else=torch.tensor(1.0)) + class OutputObject: + """Output object containing logits and optional loss.""" - no_loss_inference_model = NoLossInferenceModel() - lora_training = LoraTraining(no_loss_inference_model) + def __init__(self, logits, loss=None): + self.logits = logits + self.loss = loss - x = torch.tensor([[1.0, 2.0]]) - y = torch.tensor([[0.5, 1.5]]) + if labels is not None: + loss = nn.functional.mse_loss(logits, labels) + return OutputObject(logits, loss) + return OutputObject(logits) - with pytest.raises(ValueError) as exc_info: - lora_training((x, y)) - assert "The model did not return a loss" in str(exc_info.value) + model = DummyLoRAModelReturnsObject() + lora_training = LoraTraining(model) + x = torch.randn(5, 10) + y = torch.randn(5, 10) + loss, _ = lora_training((x, y)) + assert isinstance(loss, torch.Tensor) -@pytest.mark.parametrize("enable", [True, False]) -def test_lora_training_toggle_calibrate(base_lora_training, enable): - """Test the toggle_calibrate method.""" - base_lora_training.toggle_calibrate(enable) - assert base_lora_training.calibrate == enable +def test_forward_with_less_than_two_inputs(): + """Test forward with less than two inputs.""" + model = DummyLoRAModel() + lora_training = LoraTraining(model) + x = torch.randn(5, 10) + with pytest.raises(AssertionError) as exc_info: + lora_training((x,)) + assert "Expected at least two inputs" in str(exc_info.value) -@pytest.mark.parametrize("enable", [True, False]) -def test_lora_training_toggle_run_optimizer(base_lora_training, enable): - """Test the toggle_run_optimizer method.""" - base_lora_training.toggle_run_optimizer(enable) - assert base_lora_training.run_optimizer == enable +def test_toggle_calibrate(): + """Test toggle_calibrate.""" + 
model = DummyLoRAModel() + lora_training = LoraTraining(model) + lora_training.toggle_calibrate(True) + assert lora_training.calibrate is True + lora_training.toggle_calibrate(False) + assert lora_training.calibrate is False -def test_lora_training_forward_with_optimizer(base_lora_training): - """Test the forward method when run_optimizer is True.""" - inference_model = base_lora_training.inference_model - optimizer = SGD(inference_model.parameters(), lr=0.01) - lr_scheduler = StepLR(optimizer, step_size=1) - loss_fn = nn.MSELoss() - base_lora_training.update_training_parameters( - optimizer, - lr_scheduler, - loss_fn, - SimpleNamespace(gradient_accumulation_steps=1, max_grad_norm=1.0), +def test_set_loss_scaling_factor(): + """Test set_loss_scaling_factor.""" + model = DummyLoRAModel() + lora_training = LoraTraining(model) + lora_training.set_loss_scaling_factor(0.5) + assert lora_training.loss_scaling_factor == 0.5 + + +def test_lora_trainer_init(): + """Test LoraTrainer initialization.""" + model = DummyLoRAModel() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + lora_trainer = LoraTrainer(model, optimizer=optimizer) + assert lora_trainer.lora_training_module is not None + assert lora_trainer.hybrid_model is not None + + +def test_lora_trainer_compile(): + """Test LoraTrainer compile.""" + model = DummyLoRAModel() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + lora_trainer = LoraTrainer(model, optimizer=optimizer) + inputset = [(torch.randn(5, 10), torch.randn(5, 10))] + # Mock the compile_model method + lora_trainer.hybrid_model.compile_model = MagicMock() + lora_trainer.compile(inputset) + lora_trainer.hybrid_model.compile_model.assert_called_once() + assert lora_trainer.lora_training_module.calibrate is False + + +def test_lora_trainer_train(): + """Test LoraTrainer train.""" + model = DummyLoRAModel() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + training_args = {"gradient_accumulation_steps": 1, "max_grad_norm": 1.0} 
+ lora_trainer = LoraTrainer(model, optimizer=optimizer, training_args=training_args) + # Mock the hybrid_model's __call__ method + lora_trainer.hybrid_model = MagicMock( + return_value=(torch.tensor(1.0, requires_grad=True), None) + ) + # Create dummy data loader with different batch types + dataset = TensorDataset(torch.randn(2, 5, 10), torch.randn(2, 5, 10)) + train_loader = DataLoader(dataset, batch_size=1) + lora_trainer.train(train_loader, num_epochs=1, fhe="disable") + + +def test_lora_trainer_train_with_lr_scheduler(): + """Test LoraTrainer train with lr_scheduler.""" + model = DummyLoRAModel() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + lr_scheduler = MagicMock() + training_args = {"gradient_accumulation_steps": 1, "max_grad_norm": 1.0} + lora_trainer = LoraTrainer( + model, optimizer=optimizer, lr_scheduler=lr_scheduler, training_args=training_args ) - base_lora_training.replace_layers_with_custom( - base_lora_training.inference_model, n_layers_to_skip=0 + # Mock the hybrid_model's __call__ method + lora_trainer.hybrid_model = MagicMock( + return_value=(torch.tensor(1.0, requires_grad=True), None) ) - base_lora_training.toggle_run_optimizer(True) + # Create dummy data loader + dataset = TensorDataset(torch.randn(2, 5, 10), torch.randn(2, 5, 10)) + train_loader = DataLoader(dataset, batch_size=1) + lora_trainer.train(train_loader, num_epochs=1) + # Check that lr_scheduler.step() was called + assert lr_scheduler.step.call_count > 0 + + +def test_lora_trainer_save_and_clear_private_info(): + """Test LoraTrainer save_and_clear_private_info.""" + model = DummyLoRAModel() + lora_trainer = LoraTrainer(model) + lora_trainer.hybrid_model.save_and_clear_private_info = MagicMock() + lora_trainer.save_and_clear_private_info("path/to/model") + lora_trainer.hybrid_model.save_and_clear_private_info.assert_called_once_with("path/to/model") + + +def test_custom_linear_forward_backward(): + """Test CustomLinear forward and backward.""" + weight = 
torch.randn(20, 10) + bias = torch.randn(20) + custom_linear = CustomLinear(weight, bias) + x = torch.randn(5, 10, requires_grad=True) + y = custom_linear(x) + loss = y.sum() + loss.backward() + assert x.grad is not None + + +def test_custom_linear_weight_transposed(): + """Test CustomLinear with weight transposed.""" + weight = torch.randn(10, 20) + bias = torch.randn(20) + custom_linear = CustomLinear(weight, bias, weight_transposed=True) + x = torch.randn(5, 10, requires_grad=True) + y = custom_linear(x) + loss = y.sum() + loss.backward() + assert x.grad is not None + + +def test_get_remote_names(): + """Test get_remote_names.""" + model = DummyLoRAModel() + LoraTraining.replace_layers_with_custom(model, n_layers_to_skip_for_backprop=0) + remote_names = get_remote_names(model) + assert "linear1.forward_module" in remote_names + assert "linear1.backward_module" in remote_names + assert "linear2.forward_module" in remote_names + assert "linear2.backward_module" in remote_names + assert "lora_a" not in remote_names + + +def test_get_remote_names_include_embedding_layers(): + """Test get_remote_names with include_embedding_layers.""" + + class ModelWithEmbedding(nn.Module): + """Model with embedding layer for testing.""" - x = torch.tensor([[1.0, 2.0]]) - y = torch.tensor([[0.5, 1.5]]) + def __init__(self): + super().__init__() + self.embedding = nn.Embedding(10, 10) + self.linear = nn.Linear(10, 10) - # Save initial parameters - initial_params = {name: param.clone() for name, param in inference_model.named_parameters()} + def forward(self, x): + """Forward pass.""" + x = self.embedding(x) + x = self.linear(x) + return x - # Perform forward pass - _, _ = base_lora_training((x, y)) + model = ModelWithEmbedding() + remote_names = get_remote_names(model, include_embedding_layers=True) + assert "embedding" in remote_names + assert "linear" in remote_names - # Ensure that only parameters with "lora" in their name have been updated - for name, param in 
inference_model.named_parameters(): - if "lora" in name: - assert not torch.equal( - initial_params[name], param - ), f"Lora parameter {name} was not updated" - else: - assert torch.equal( - initial_params[name], param - ), f"Non-lora parameter {name} was unexpectedly updated" +def test_get_remote_names_skips_lm_head_when_excluded(): + """Test get_remote_names skips lm_head when excluded.""" -def test_lora_training_forward_calibrate(base_lora_training): - """Test the forward method when calibration is enabled.""" - inference_model = base_lora_training.inference_model - base_lora_training.toggle_calibrate(True) + class ModelWithLMHead(nn.Module): + """Model with lm_head for testing.""" - x = torch.tensor([[1.0, 2.0]]) - y = torch.tensor([[0.5, 1.5]]) + def __init__(self): + super().__init__() + self.lm_head = nn.Linear(10, 10) + self.linear = nn.Linear(10, 10) - _, _ = base_lora_training((x, y)) + def forward(self, x): + """Forward pass.""" + return self.linear(x) - # Ensure that gradients are zeroed - for param in inference_model.parameters(): - if param.grad is not None: - assert torch.all(param.grad == 0) + model = ModelWithLMHead() + remote_names = get_remote_names(model, include_embedding_layers=False) + assert "lm_head" not in remote_names + assert "linear" in remote_names -@pytest.mark.parametrize("weight_transposed", [False, True]) -def test_forward_module_linear(weight_transposed): - """Test ForwardModuleLinear.""" - weight = torch.tensor([[1.0, 2.0], [3.0, 4.0]]) - bias = torch.tensor([0.5, -0.5]) - module = ForwardModuleLinear(weight, bias, weight_transposed=weight_transposed) +def test_replace_layers_with_transformer_conv1d(monkeypatch): + """Test replace_layers_with_custom with TransformerConv1D.""" - input_tensor = torch.tensor([[1.0, 0.0], [0.0, 1.0]]) - output = module(input_tensor) + class MockTransformerConv1D(nn.Module): + """Mock TransformerConv1D module for testing.""" - if weight_transposed: - expected_output = input_tensor @ weight + bias - 
else: - expected_output = input_tensor @ weight.t() + bias + def __init__(self, in_features, out_features): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = nn.Parameter(torch.randn(out_features, in_features)) + self.bias = nn.Parameter(torch.randn(out_features)) - assert torch.allclose(output, expected_output) + def forward(self, x): + """Forward pass.""" + return x @ self.weight.t() + self.bias + # Patch TransformerConv1D and LINEAR_LAYERS in the lora module + monkeypatch.setattr("concrete.ml.torch.lora.TransformerConv1D", MockTransformerConv1D) + monkeypatch.setattr("concrete.ml.torch.lora.LINEAR_LAYERS", (nn.Linear, MockTransformerConv1D)) -@pytest.mark.parametrize("weight_transposed", [False, True]) -def test_backward_module_linear(weight_transposed): - """Test BackwardModuleLinear.""" - weight = torch.tensor([[1.0, 2.0], [3.0, 4.0]]) - module = BackwardModuleLinear(weight, weight_transposed=weight_transposed) + class ModelWithConv1D(nn.Module): + """Model with Conv1D layer for testing.""" - grad_output = torch.tensor([[1.0, 0.0], [0.0, 1.0]]) - grad_input = module(grad_output) + def __init__(self): + super().__init__() + self.conv1d = MockTransformerConv1D(10, 10) - if weight_transposed: - expected_grad_input = grad_output @ weight.t() - else: - expected_grad_input = grad_output @ weight + def forward(self, x): + """Forward pass.""" + return self.conv1d(x) - assert torch.allclose(grad_input, expected_grad_input) + model = ModelWithConv1D() + n_layers_to_skip_for_backprop = 0 + LoraTraining.replace_layers_with_custom(model, n_layers_to_skip_for_backprop) + assert isinstance(model.conv1d, CustomLinear) -@pytest.mark.parametrize("weight_transposed", [False, True]) -def test_custom_linear(weight_transposed): - """Test the CustomLinear module.""" - weight = torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True) - bias = torch.tensor([0.5, -0.5], requires_grad=True) - module = CustomLinear(weight, bias, 
weight_transposed=weight_transposed) +def test_forward_backward_module(): + """Test the ForwardBackwardModule autograd function.""" + weight = torch.randn(20, 10) + bias = torch.randn(20) + forward_module = ForwardModuleLinear(weight, bias) + backward_module = BackwardModuleLinear(weight) + x = torch.randn(5, 10) + y = forward_module(x) + grad_output = torch.randn_like(y) + grad_input = backward_module(grad_output) + assert grad_input.shape == x.shape - input_tensor = torch.tensor([[1.0, 0.0]], requires_grad=True) - output = module(input_tensor) - if weight_transposed: - expected_output = input_tensor @ weight + bias - else: - expected_output = input_tensor @ weight.t() + bias +def test_lora_training_forward_with_additional_inputs(): + """Test LoraTraining forward with additional inputs.""" - assert torch.allclose(output, expected_output) + class ModelWithAdditionalInputs(nn.Module): + """Model with additional inputs for testing.""" - # Test backward - output.sum().backward() - if weight_transposed: - expected_grad_input = torch.ones_like(output) @ weight.t() - else: - expected_grad_input = torch.ones_like(output) @ weight + def __init__(self): + super().__init__() + self.lora_a = nn.Parameter(torch.randn(10, 10)) + self.linear = nn.Linear(10, 10) + + def forward(self, x, extra_input, labels=None): + """Forward pass with additional inputs.""" + logits = self.linear(x + extra_input) + if labels is not None: + loss = nn.functional.mse_loss(logits, labels) + return {"loss": loss} + return {"logits": logits} + + model = ModelWithAdditionalInputs() + lora_training = LoraTraining(model) + x = torch.randn(5, 10) + y = torch.randn(5, 10) + extra_input = torch.randn(5, 10) + loss, _ = lora_training((x, extra_input, y)) + assert isinstance(loss, torch.Tensor) - assert input_tensor.grad is not None and torch.allclose(input_tensor.grad, expected_grad_input) +def test_lora_training_forward_with_no_loss_fn_and_no_labels(): + """Test LoraTraining when model returns loss=None and 
no loss_fn provided.""" + model = DummyLoRAModel() + lora_training = LoraTraining(model) + x = torch.randn(5, 10) + y = None # No labels provided + with pytest.raises(ValueError) as exc_info: + lora_training((x, y)) + assert "The model did not return a loss." in str(exc_info.value) -@pytest.mark.parametrize("weight_transposed", [False, True]) -def test_forward_backward_module(weight_transposed): - """Test the ForwardBackwardModule.""" - weight = torch.tensor([[1.0, 2.0], [3.0, 4.0]]) - bias = torch.tensor([0.5, -0.5]) - forward_module = ForwardModuleLinear(weight, bias, weight_transposed=weight_transposed) - backward_module = BackwardModuleLinear(weight, weight_transposed=weight_transposed) - input_tensor = torch.tensor([[1.0, 0.0]], requires_grad=True) - output = ForwardBackwardModule.apply(input_tensor, forward_module, backward_module) +def test_lora_trainer_train_with_various_batch_types(): + """Test LoraTrainer.train with batches of different types.""" + model = DummyLoRAModel() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + lora_trainer = LoraTrainer(model, optimizer=optimizer) - if weight_transposed: - expected_output = input_tensor @ weight + bias - expected_grad_input = torch.ones_like(output) @ weight.t() - else: - expected_output = input_tensor @ weight.t() + bias - expected_grad_input = torch.ones_like(output) @ weight + # Mock the hybrid_model's __call__ method + lora_trainer.hybrid_model = MagicMock( + return_value=(torch.tensor(1.0, requires_grad=True), None) + ) - assert torch.allclose(output, expected_output) + class DictDataset(Dataset): + """Dataset with dict items.""" - # Test backward - output.sum().backward() + def __init__(self, data): + self.data = data - assert input_tensor.grad is not None and torch.allclose(input_tensor.grad, expected_grad_input) + def __len__(self): + return len(self.data) + def __getitem__(self, idx): + return self.data[idx] -def test_get_remote_names(): - """Test get_remote_names function.""" + class 
ListDataset(Dataset): + """Dataset with list items.""" - class TestModel(torch.nn.Module): - """Test model for get_remote_names test.""" + def __init__(self, data): + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx] + + class NonTensorDataset(Dataset): + """Dataset with non-tensor items.""" + + def __init__(self, data): + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx] + + # Test with dict batch + dataset_dict = [{"input": torch.randn(5, 10), "label": torch.randn(5, 10)} for _ in range(2)] + train_loader_dict: DataLoader = DataLoader(DictDataset(dataset_dict), batch_size=1) + lora_trainer.train(train_loader_dict, num_epochs=1) + + # Test with list/tuple batch + dataset_list = [(torch.randn(5, 10), torch.randn(5, 10)) for _ in range(2)] + train_loader_list: DataLoader = DataLoader(ListDataset(dataset_list), batch_size=1) + lora_trainer.train(train_loader_list, num_epochs=1) + + # Test with single tensor batch + dataset_single = TensorDataset(torch.stack([torch.randn(5, 10) for _ in range(2)])) + train_loader_single: DataLoader = DataLoader(dataset_single, batch_size=1) + lora_trainer.train(train_loader_single, num_epochs=1) + + # Test with single non-tensor item batch + dataset_non_tensor = NonTensorDataset( + [42 for _ in range(2)] + ) # Using integers as non-tensor data + train_loader_non_tensor: DataLoader = DataLoader(dataset_non_tensor, batch_size=1) + lora_trainer.train(train_loader_non_tensor, num_epochs=1) + + +def test_lora_trainer_train_with_gradient_accumulation(): + """Test LoraTrainer.train with gradient accumulation steps.""" + model = DummyLoRAModel() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + training_args = {"gradient_accumulation_steps": 2, "max_grad_norm": 1.0} + lora_trainer = LoraTrainer(model, optimizer=optimizer, training_args=training_args) + # Mock the hybrid_model's __call__ method 
+ lora_trainer.hybrid_model = MagicMock( + return_value=(torch.tensor(1.0, requires_grad=True), None) + ) + # Create dummy data loader + dataset = TensorDataset(torch.randn(4, 5, 10), torch.randn(4, 5, 10)) + train_loader: DataLoader = DataLoader(dataset, batch_size=1) + lora_trainer.train(train_loader, num_epochs=1) + + +def test_get_remote_names_with_lora_in_name(): + """Test get_remote_names skips modules with 'lora' in name.""" + + class ModelWithLoraInName(nn.Module): + """Model with LoRA layer for testing.""" def __init__(self): super().__init__() - self.linear = torch.nn.Linear(10, 10) - self.conv1d = TransformerConv1D(10, 10) - self.embedding = torch.nn.Embedding(10, 10) - self.lm_head = torch.nn.Linear(10, 10) - self.lora_layer = torch.nn.Linear(10, 10) - self.lora_layer_name = "lora_layer" + self.lora_linear = nn.Linear(10, 10) + self.linear = nn.Linear(10, 10) def forward(self, x): - """Forward method.""" - return self.lm_head(self.linear(x)) - - model = TestModel() - - lora_training = LoraTraining(model) - remote_names = get_remote_names(lora_training) - expected_names = [ - "inference_model.linear", - "inference_model.conv1d.forward_module", - "inference_model.conv1d.backward_module", - ] - - assert set(remote_names) == set(expected_names) - - # Test with include_embedding_layers=True - remote_names_with_embeddings = get_remote_names(lora_training, include_embedding_layers=True) - expected_names_with_embeddings = [ - "inference_model.linear", - "inference_model.conv1d.forward_module", - "inference_model.conv1d.backward_module", - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4609 - "inference_model.embedding", - "inference_model.lm_head.forward_module", - "inference_model.lm_head.backward_module", - ] - assert set(remote_names_with_embeddings) == set(expected_names_with_embeddings) - - -def test_lora_without_transformers(): - """ - Test the lora.py module when the transformers library is not installed. 
- """ - - # Save the original transformers module if it's already imported - transformers_original = sys.modules.get("transformers", None) - - # Mock the transformers import to simulate it being unavailable - with mock.patch.dict("sys.modules", {"transformers": None}): - # Reload the lora module to apply the mocked transformers import - if "concrete.ml.torch.lora" in sys.modules: - del sys.modules["concrete.ml.torch.lora"] - import concrete.ml.torch.lora as lora # pylint: disable=R0402,C0415 - - # Ensure that TransformerConv1D is None - assert lora.TransformerConv1D is None - - # Create a simple model without any Conv1D layers - model = torch.nn.Sequential( - torch.nn.Linear(10, 20), - torch.nn.ReLU(), - torch.nn.Linear(20, 5), - ) - - # Initialize LoraTraining with the model - lora_training = lora.LoraTraining(model) - - # Check that layers have been replaced with CustomLinear - replaced_layers = [] - for name, module in lora_training.inference_model.named_modules(): - if isinstance(module, lora.CustomLinear): - replaced_layers.append(name) - - # Assert that CustomLinear layers have been added - assert len(replaced_layers) > 0, "No layers were replaced with CustomLinear." - - # Prepare input data - x = torch.randn(3, 10) # Batch size 3, input size 10 - y = torch.randint(0, 5, (3,)) # Batch size 3, number of classes 5 - - # Define a simple loss function - loss_fn = torch.nn.CrossEntropyLoss() - - # Update training parameters - lora_training.update_training_parameters(loss_fn=loss_fn) - - # Perform a forward pass - loss, grad_norm = lora_training((x, y)) - - # Check that loss is computed and gradients are updated - assert loss.requires_grad, "Loss does not require gradients." - assert loss.item() > 0, "Loss should be greater than zero." - - # Since optimizer is not set, grad_norm should be None - assert grad_norm is None, "Gradient norm should be None when optimizer is not set." 
- - # Restore the original transformers module after the test - if transformers_original is not None: - sys.modules["transformers"] = transformers_original - elif "transformers" in sys.modules: - del sys.modules["transformers"] + """Forward pass with lora_linear.""" + x = self.lora_linear(x) + x = self.linear(x) + return x + + model = ModelWithLoraInName() + remote_names = get_remote_names(model) + assert "lora_linear" not in remote_names + assert "linear" in remote_names diff --git a/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb b/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb index c9eada04d..208e5e79b 100644 --- a/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb +++ b/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb @@ -111,7 +111,15 @@ "execution_count": 5, "id": "5ac49f9d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LoRA layers detected in the model.\n" + ] + } + ], "source": [ "# Set up LoRA training\n", "lora_training = LoraTraining(peft_model)" @@ -126,7 +134,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "656e3f624a7f4c879b46129e841e4db1", + "model_id": "9775e413ec264b2eb14ee53dbc381474", "version_major": 2, "version_minor": 0 }, @@ -301,11 +309,7 @@ "num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1)\n", "max_steps = math.ceil(training_args.num_train_epochs * num_update_steps_per_epoch)\n", "\n", - "trainer.create_optimizer_and_scheduler(num_training_steps=max_steps)\n", - "\n", - "lora_training.update_training_parameters(\n", - " trainer.optimizer, trainer.lr_scheduler, causal_lm_loss, training_args\n", - ")" + "trainer.create_optimizer_and_scheduler(num_training_steps=max_steps)" ] }, { @@ -338,9 +342,13 @@ "outputs": [], "source": [ "# Prepare input data for calibration\n", - "input_tensor = torch.randint(0, tokenizer.vocab_size, (PER_DEVICE_TRAIN_BATCH_SIZE, BLOCK_SIZE))\n", - "label_tensor = 
torch.randint(0, tokenizer.vocab_size, (PER_DEVICE_TRAIN_BATCH_SIZE, BLOCK_SIZE))\n", - "attention_mask = torch.ones((PER_DEVICE_TRAIN_BATCH_SIZE, BLOCK_SIZE))\n", + "input_tensor = torch.randint(\n", + " 0, tokenizer.vocab_size, (PER_DEVICE_TRAIN_BATCH_SIZE, BLOCK_SIZE), dtype=torch.long\n", + ")\n", + "label_tensor = torch.randint(\n", + " 0, tokenizer.vocab_size, (PER_DEVICE_TRAIN_BATCH_SIZE, BLOCK_SIZE), dtype=torch.long\n", + ")\n", + "attention_mask = torch.ones((PER_DEVICE_TRAIN_BATCH_SIZE, BLOCK_SIZE), dtype=torch.long)\n", "\n", "inputset = (input_tensor, label_tensor, attention_mask)" ] @@ -377,6 +385,21 @@ " total_epochs = int(training_args.num_train_epochs)\n", " epoch_pbar = tqdm(total=total_epochs, desc=\"Training Progress\", position=0)\n", "\n", + " # Initialize optimizer and scheduler here instead\n", + " optimizer = torch.optim.AdamW(\n", + " hybrid_model.model.parameters(),\n", + " lr=training_args.learning_rate,\n", + " weight_decay=training_args.weight_decay,\n", + " )\n", + "\n", + " num_training_steps = total_epochs * len(train_dataloader)\n", + " lr_scheduler = torch.optim.lr_scheduler.LinearLR(\n", + " optimizer,\n", + " start_factor=1.0,\n", + " end_factor=0.0,\n", + " total_iters=num_training_steps,\n", + " )\n", + "\n", " total_batched_samples = 0\n", " epoch_losses = [] # List to store the loss for each epoch\n", "\n", @@ -407,7 +430,7 @@ " grad_norms.append(grad_norm)\n", "\n", " # Get current learning rate\n", - " current_lr = lora_training.lr_scheduler.get_last_lr()[0]\n", + " current_lr = lr_scheduler.get_last_lr()[0]\n", "\n", " # Get last grad norm\n", " current_grad_norm = grad_norms[-1] if grad_norms else None\n", @@ -846,7 +869,7 @@ "tokenizer.parallelism = False\n", "\n", "# Train the model using FHE simulation\n", - "train_custom_model(hybrid_model, train_dataloader, training_args, tokenizer, fhe=\"simulate\")" + "train_custom_model(hybrid_model, train_dataloader, training_args, tokenizer, fhe=\"disable\")" ] }, { diff --git 
a/use_case_examples/lora_finetuning/LLamaFineTuning.ipynb b/use_case_examples/lora_finetuning/LLamaFineTuning.ipynb new file mode 100644 index 000000000..b6575886f --- /dev/null +++ b/use_case_examples/lora_finetuning/LLamaFineTuning.ipynb @@ -0,0 +1,345 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fine-Tuning Llama 3.2 with LoRA and FHE using `LoraTrainer`\n", + "\n", + "This notebook demonstrates how to fine-tune a Llama 3.2 1B model using LoRA (Low-Rank Adaptation) with Fully Homomorphic Encryption (FHE). We leverage the `LoraTrainer` API from the `concrete.ml.torch.lora` library to simplify the process.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "import shutil\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import torch\n", + "from datasets import load_dataset\n", + "from peft import LoraConfig, get_peft_model\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + " DataCollatorForLanguageModeling,\n", + " Trainer,\n", + " TrainingArguments,\n", + ")\n", + "from utils_lora import generate_and_print\n", + "\n", + "# Import LoraTrainer from the provided library\n", + "from concrete.ml.torch.lora import LoraTrainer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Set seed for reproducibility\n", + "SEED = 0\n", + "random.seed(SEED)\n", + "np.random.seed(SEED)\n", + "torch.manual_seed(SEED)\n", + "if torch.cuda.is_available():\n", + " torch.cuda.manual_seed_all(SEED)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the model and tokenizer\n", + "model_name = \"meta-llama/Llama-3.2-1B\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "model = AutoModelForCausalLM.from_pretrained(model_name)\n", + "\n", + "# Ensure the tokenizer
has a pad token\n", + "if tokenizer.pad_token is None:\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + "model.config.pad_token_id = model.config.eos_token_id\n", + "\n", + "# Freeze the original model's weights\n", + "for param in model.parameters():\n", + " param.requires_grad = False" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial generation with base model:\n", + "from concrete.ml.sklearn import LogisticRegression\n", + "\n", + "model = LogisticRegression( eta=0.1, max_iter=1000, random_state=42)\n", + "None\n" + ] + } + ], + "source": [ + "# Print the initial generation with the base model\n", + "PROMPT = \"from concrete.ml.sklearn import LogisticRegression\\n\\nmodel = LogisticRegression(\"\n", + "print(\"Initial generation with base model:\")\n", + "print(generate_and_print(PROMPT, model, tokenizer, seed=SEED))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Apply LoRA configuration\n", + "peft_config = LoraConfig(\n", + " r=8,\n", + " lora_alpha=32,\n", + " lora_dropout=0.01,\n", + " bias=\"none\",\n", + " task_type=\"CAUSAL_LM\",\n", + " target_modules=\"all-linear\",\n", + ")\n", + "peft_model = get_peft_model(model, peft_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + } + ], + "source": [ + "# Load the dataset and tokenize it\n", + "dataset = load_dataset(\"json\", data_files=\"data_finetune/dataset.jsonl\", split=\"train\")\n", + "\n", + "\n", + "def tokenize_function(examples):\n", + " return tokenizer(examples[\"text\"], padding=\"longest\", truncation=True)\n", + "\n", + "\n", + "tokenized_dataset = dataset.map(tokenize_function, batched=True)\n", + "data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Define training arguments\n", + "EPOCHS = 10\n", + "PER_DEVICE_TRAIN_BATCH_SIZE = 4\n", + "training_args = TrainingArguments(\n", + " output_dir=\"./checkpoints\",\n", + " num_train_epochs=EPOCHS,\n", + " per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,\n", + " gradient_accumulation_steps=1,\n", + " save_total_limit=1,\n", + " use_cpu=True,\n", + " learning_rate=2e-4,\n", + " lr_scheduler_type=\"linear\",\n", + " seed=SEED,\n", + " data_seed=SEED,\n", + " warmup_steps=10,\n", + " weight_decay=0.01,\n", + " prediction_loss_only=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LoRA layers detected in the model.\n" + ] + } + ], + "source": [ + "# Create optimizer and scheduler using HuggingFace's Trainer\n", + "hf_trainer = Trainer(\n", + " model=peft_model,\n", + " args=training_args,\n", + " train_dataset=tokenized_dataset,\n", + " data_collator=data_collator,\n", + ")\n", + "train_dataloader = hf_trainer.get_train_dataloader()\n", + "hf_trainer.create_optimizer_and_scheduler(num_training_steps=len(train_dataloader) * 
EPOCHS)\n", + "\n", + "optimizer = hf_trainer.optimizer\n", + "lr_scheduler = hf_trainer.lr_scheduler\n", + "\n", + "\n", + "# Define a causal LM loss function\n", + "def causal_lm_loss(logits, labels, ignore_index=-100):\n", + " shift_logits = logits[..., :-1, :].contiguous()\n", + " shift_labels = labels[..., 1:].contiguous()\n", + " shift_logits = shift_logits.view(-1, shift_logits.size(-1))\n", + " shift_labels = shift_labels.view(-1)\n", + " loss = torch.nn.functional.cross_entropy(\n", + " shift_logits, shift_labels, ignore_index=ignore_index, reduction=\"mean\"\n", + " )\n", + " return loss\n", + "\n", + "\n", + "# Prepare input data for calibration\n", + "lengths = [len(item[\"input_ids\"]) for item in tokenized_dataset]\n", + "if not all(length == lengths[0] for length in lengths):\n", + " raise ValueError(\"All examples must have the same length for calibration.\")\n", + "BLOCK_SIZE = lengths[0]\n", + "\n", + "input_tensor = torch.randint(\n", + " 0, tokenizer.vocab_size, (PER_DEVICE_TRAIN_BATCH_SIZE, BLOCK_SIZE), dtype=torch.long\n", + ")\n", + "label_tensor = torch.randint(\n", + " 0, tokenizer.vocab_size, (PER_DEVICE_TRAIN_BATCH_SIZE, BLOCK_SIZE), dtype=torch.long\n", + ")\n", + "attention_mask = torch.ones((PER_DEVICE_TRAIN_BATCH_SIZE, BLOCK_SIZE), dtype=torch.long)\n", + "inputset = (input_tensor, label_tensor, attention_mask)\n", + "\n", + "# Initialize LoraTrainer\n", + "training_args_dict = vars(training_args)\n", + "lora_trainer = LoraTrainer(\n", + " model=peft_model,\n", + " optimizer=optimizer,\n", + " loss_fn=causal_lm_loss,\n", + " lr_scheduler=lr_scheduler,\n", + " training_args=training_args_dict,\n", + " n_layers_to_skip_for_backprop=3,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Compile the model with FHE\n", + "lora_trainer.compile(inputset, n_bits=16)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "Starting training using LoraTrainer...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Training: 100%|██████████| 10/10 [22:19<00:00, 133.98s/epoch, Epoch=10, Avg Loss=0.0795, FHE Mode=disable]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training completed. Final Avg Loss: 0.0795, FHE Mode: disable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# Train the model using LoraTrainer\n", + "print(\"Starting training using LoraTrainer...\")\n", + "lora_trainer.train(train_dataloader, num_epochs=EPOCHS, fhe=\"disable\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original model generation:\n", + "from concrete.ml.sklearn import LogisticRegression\n", + "\n", + "model = LogisticRegression( eta=0.1, max_iter=1000, random_state=42)\n", + "None\n", + "Fine-tuned model generation:\n", + "from concrete.ml.sklearn import LogisticRegression\n", + "\n", + "model = LogisticRegression( n_bits=7, max_iter=50)\n", + "None\n" + ] + } + ], + "source": [ + "# Compare generation before and after fine-tuning\n", + "peft_model.disable_adapter_layers()\n", + "print(\"Original model generation:\")\n", + "print(generate_and_print(PROMPT, peft_model, tokenizer, seed=SEED))\n", + "\n", + "peft_model.enable_adapter_layers()\n", + "print(\"Fine-tuned model generation:\")\n", + "print(generate_and_print(PROMPT, peft_model, tokenizer, seed=SEED))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save the fine-tuned model\n", + "save_path = Path(\"deployment/gpt2_lora_finetuned\")\n", + "if save_path.is_dir() and any(save_path.iterdir()):\n", + " shutil.rmtree(save_path)\n", + "lora_trainer.save_and_clear_private_info(save_path)\n", + 
"\n", + "print(\"Model saved to:\", save_path)" + ] + } + ], + "metadata": { + "execution": { + "timeout": 10800 + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/use_case_examples/lora_finetuning/Makefile b/use_case_examples/lora_finetuning/Makefile index 8942d2e22..ed6edcf86 100644 --- a/use_case_examples/lora_finetuning/Makefile +++ b/use_case_examples/lora_finetuning/Makefile @@ -8,3 +8,6 @@ run_example: one one: @$(TIME_NB) GPT2FineTuneHybrid.ipynb + +two: + @$(TIME_NB) LlamaFineTuning.ipynb \ No newline at end of file diff --git a/use_case_examples/lora_finetuning/data_finetune/dataset.jsonl b/use_case_examples/lora_finetuning/data_finetune/dataset.jsonl new file mode 100644 index 000000000..06363d611 --- /dev/null +++ b/use_case_examples/lora_finetuning/data_finetune/dataset.jsonl @@ -0,0 +1,46 @@ +{"text": "class TinyCNN(nn.Module):"} +{"text": "def __init__(self, n_classes) -> None:\n super().__init__()\n self.conv1 = nn.Conv2d(1, 8, 3, stride=1, padding=0)\n self.conv2 = nn.Conv2d(8, 16, 3, stride=2, padding=0)\n self.conv3 = nn.Conv2d(16, 32, 2, stride=1, padding=0)\n self.fc1 = nn.Linear(32, n_classes)"} +{"text": "def forward(self, x):\n x = self.conv1(x)\n x = torch.relu(x)\n x = self.conv2(x)\n x = torch.relu(x)\n x = self.conv3(x)\n x = torch.relu(x)\n x = x.flatten(1)\n x = self.fc1(x)\n return x\nnet = TinyCNN(10)\n#... (Training loop)...\nq_module = compile_torch_model(net, x_train, rounding_threshold_bits=6, p_error=0.1)\n# Key generation\nq_module.fhe_circuit.keygen()\n# Inference"} +{"text": "self.fc1(x)\n return x\nnet = TinyCNN(10)\n#... (Training loop)...\nq_module = compile_torch_model(net, x_train, rounding_threshold_bits=6, p_error=0.1)\n# Key generation\nq_module.fhe_circuit.keygen()\n# Inference in FHE\ny_pred_fhe = q_module.forward(x_test, fhe=\"execute\")\n\n**4. 
Quantization-Aware Training:**\npython\nfrom torch import nn\nfrom concrete.ml.torch.compile import compile_brevitas_qat_model\nimport brevitas.nn as qnn\nfrom brev"} +{"text": "in FHE\ny_pred_fhe = q_module.forward(x_test, fhe=\"execute\")\n\n**4. Quantization-Aware Training:**\npython\nfrom torch import nn\nfrom concrete.ml.torch.compile import compile_brevitas_qat_model\nimport brevitas.nn as qnn\nfrom brevitas.core.bit_width import BitWidthImplType\nfrom brevitas.core.quant import QuantType\nfrom brevitas.core.restrict_val import FloatToIntImplType, RestrictValueType\nfrom brevitas.core.scaling import ScalingImplType\nfrom brevitas.core.zero_point import ZeroZeroPoint\nfrom bre"} +{"text": "itas.core.bit_width import BitWidthImplType\nfrom brevitas.core.quant import QuantType\nfrom brevitas.core.restrict_val import FloatToIntImplType, RestrictValueType\nfrom brevitas.core.scaling import ScalingImplType\nfrom brevitas.core.zero_point import ZeroZeroPoint\nfrom brevitas.inject import ExtendedInjector\nfrom brevitas.quant.solver import ActQuantSolver, WeightQuantSolver\nfrom dependencies import value\nfrom torch.nn.utils import prune\n#... 
(Data loading and preprocessing)..."} +{"text": "class CommonQuant(ExtendedInjector):\n bit_width_impl_type = BitWidthImplType.CONST\n scaling_impl_type = ScalingImplType.CONST\n restrict_scaling_type = RestrictValueType.FP\n zero_point_impl = ZeroZeroPoint\n float_to_int_impl_type = FloatToIntImplType.ROUND\n scaling_per_output_channel = False\n narrow_range = True\n signed = True\n @value"} +{"text": "def quant_type(bit_width): # pylint: disable=no-self-argument\n if bit_width is None:\n return QuantType.FP\n if bit_width == 1:\n return QuantType.BINARY\n return QuantType.INT"} +{"text": "class CommonWeightQuant(CommonQuant, WeightQuantSolver): # pylint: disable=too-many-ancestors\n scaling_const = 1.0\n signed = True"} +{"text": "class CommonActQuant(CommonQuant, ActQuantSolver): # pylint: disable=too-many-ancestors\n min_val = -1.0\n max_val = 1.0"} +{"text": "class QATPrunedSimpleNet(nn.Module):"} +{"text": "def __init__(self, n_hidden, qlinear_args, qidentity_args):\n super().__init__()\n self.pruned_layers = set()\n self.quant_inp = qnn.QuantIdentity(**qidentity_args)\n self.fc1 = qnn.QuantLinear(IN_FEAT, n_hidden, **qlinear_args)\n self.relu1 = qnn.QuantReLU(bit_width=qidentity_args[\"bit_width\"])\n self.fc2 = qnn.QuantLinear(n_hidden, n_hidden, **qlinear_args)\n self.relu2 = qnn.QuantReLU(bit_width=qidentity_args[\"bit_width"} +{"text": ", **qlinear_args)\n self.relu1 = qnn.QuantReLU(bit_width=qidentity_args[\"bit_width\"])\n self.fc2 = qnn.QuantLinear(n_hidden, n_hidden, **qlinear_args)\n self.relu2 = qnn.QuantReLU(bit_width=qidentity_args[\"bit_width\"])\n self.fc3 = qnn.QuantLinear(n_hidden, OUT_FEAT, **qlinear_args)\n for m in self.modules():\n if isinstance(m, qnn.QuantLinear):\n torch.nn.init.uniform_(m.weight.data, -1, 1)"} +{"text": "def forward(self, x):\n x = self.quant_inp(x)\n x = self.relu1(self.fc1(x))\n x = self.relu2(self.fc2(x))\n x = self.fc3(x)\n return x"} +{"text": "def prune(self, max_non_zero):\n # Linear layer weight has dimensions 
NumOutputs x NumInputs\n for name, layer in self.named_modules():\n if isinstance(layer, qnn.QuantLinear):\n num_zero_weights = (layer.weight.shape[1] - max_non_zero) * layer.weight.shape[0]\n if num_zero_weights <= 0:\n continue\n print(f\"Pruning layer {name} factor {num_zero_weights}\")\n prune.l1_unstructured(layer, \"weight\", amount=num_zero_weights)\n self.pruned_layers.add(name)"} +{"text": "def unprune(self):\n for name, layer in self.named_modules():\n if name in self.pruned_layers:\n prune.remove(layer, \"weight\")\n self.pruned_layers.remove(name)\ntorch_model = QATPrunedSimpleNet(\n n_hidden=n_hidden,\n qlinear_args={\n \"weight_bit_width\": 3,\n \"weight_quant\": CommonWeightQuant,\n \"bias\": True,\n \"bias_quant\": None,\n \"narrow_range\": True,\n },\n qidentity_args={\"bit_width\": 3, \"act_quant\": CommonActQuant},\n)\ntorch"} +{"text": "_args={\n \"weight_bit_width\": 3,\n \"weight_quant\": CommonWeightQuant,\n \"bias\": True,\n \"bias_quant\": None,\n \"narrow_range\": True,\n },\n qidentity_args={\"bit_width\": 3, \"act_quant\": CommonActQuant},\n)\ntorch_model.prune(20)\n#... (Training loop)...\nquantized_numpy_module = compile_brevitas_qat_model(torch_model, x_train)\n# Inference in FHE (simulation)\ny_pred_fhe = quantized_numpy_module.forward(x_test, fhe=\"simulate\")\n\n**5. Client/Server"} +{"text": "_model.prune(20)\n#... (Training loop)...\nquantized_numpy_module = compile_brevitas_qat_model(torch_model, x_train)\n# Inference in FHE (simulation)\ny_pred_fhe = quantized_numpy_module.forward(x_test, fhe=\"simulate\")\n\n**5. 
Client/Server Deployment (LogisticRegressionTraining.ipynb):**\npython\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\nimport numpy as np\nfrom concrete.ml.deployment import FHEModelClient, FHEModelDev, FHEModelServer\nfrom concrete.ml.sklearn import SGDClassifier\nfrom concrete import fhe"} +{"text": "Deployment (LogisticRegressionTraining.ipynb):**\npython\nfrom pathlib import Path\nfrom tempfile import TemporaryDirectory\nimport numpy as np\nfrom concrete.ml.deployment import FHEModelClient, FHEModelDev, FHEModelServer\nfrom concrete.ml.sklearn import SGDClassifier\nfrom concrete import fhe\n#... (Data loading, preprocessing, and model training)...\n# Assuming you have a trained model: sgd_clf_binary_fhe\n# and x_compile_set, y_compile_set for compilation\n# Define the directory where to save the deployment files\nDEPLOYMENT_PATH = Path(\"fhe_training\")"} +{"text": "#... (Data loading, preprocessing, and model training)...\n# Assuming you have a trained model: sgd_clf_binary_fhe\n# and x_compile_set, y_compile_set for compilation\n# Define the directory where to save the deployment files\nDEPLOYMENT_PATH = Path(\"fhe_training\")\nDEPLOYMENT_PATH.mkdir(exist_ok=True)\ndeployment_dir = TemporaryDirectory(dir=str(DEPLOYMENT_PATH))\ndeployment_path = Path(deployment_dir.name)\n# Save the model for deployment\nfhe_dev = FHEModelDev(deployment_path, sgd_clf_binary_fhe)\nfhe_dev.save(mode=\""} +{"text": "DEPLOYMENT_PATH.mkdir(exist_ok=True)\ndeployment_dir = TemporaryDirectory(dir=str(DEPLOYMENT_PATH))\ndeployment_path = Path(deployment_dir.name)\n# Save the model for deployment\nfhe_dev = FHEModelDev(deployment_path, sgd_clf_binary_fhe)\nfhe_dev.save(mode=\"training\")\n# Client-side setup\nfhe_client = FHEModelClient(deployment_path)\nfhe_client.load()\nserialized_evaluation_keys = fhe_client.get_serialized_evaluation_keys()\n# Server-side setup\nfhe_server = FHEModelServer(deployment_path)\nfhe_server.load()\n# Example of encryption,"} +{"text": 
"training\")\n# Client-side setup\nfhe_client = FHEModelClient(deployment_path)\nfhe_client.load()\nserialized_evaluation_keys = fhe_client.get_serialized_evaluation_keys()\n# Server-side setup\nfhe_server = FHEModelServer(deployment_path)\nfhe_server.load()\n# Example of encryption, server-side processing, and decryption\nbatch_size = sgd_clf_binary_fhe.batch_size\nweights = np.random.rand(1, x_train.shape[1], 1)\nbias = np.random.rand(1, 1, 1)"} +{"text": "def quantize_encrypt_serialize_batches(fhe_client, x, y, weights, bias, batch_size):\n #... (Implementation as before)..."} +{"text": "def server_run(fhe_server, x_batches_enc, y_batches_enc, weights_enc, bias_enc, evaluation_keys):\n #... (Implementation as before)..."} +{"text": "def train_fhe_client_server(\n #... (Parameters as before)...\n):\n #... (Training loop)\n # Quantize, encrypt and serialize the batched inputs as well as the weight and bias values\n x_batches_enc, y_batches_enc, weights_enc, bias_enc = quantize_encrypt_serialize_batches(\n fhe_client, x, y, weights, bias, batch_size\n )\n # Iterate the circuit over the batches on the server\n fitted_weights_enc, fitted_bias_enc = server_run(\n fhe_server,\n x_batches_enc,\n y_batches_enc,\n weights_enc,"} +{"text": "_serialize_batches(\n fhe_client, x, y, weights, bias, batch_size\n )\n # Iterate the circuit over the batches on the server\n fitted_weights_enc, fitted_bias_enc = server_run(\n fhe_server,\n x_batches_enc,\n y_batches_enc,\n weights_enc,\n bias_enc,\n serialized_evaluation_keys,\n )\n # Back on the client, deserialize, decrypt and de-quantize the fitted weight and bias values\n weights, bias = fhe_client.deserialize_decrypt_dequantize(\n fitted_weights_enc, fitted_bias_enc\n )\n return weights, bias,"} +{"text": "bias_enc,\n serialized_evaluation_keys,\n )\n # Back on the client, deserialize, decrypt and de-quantize the fitted weight and bias values\n weights, bias = fhe_client.deserialize_decrypt_dequantize(\n fitted_weights_enc, 
fitted_bias_enc\n )\n return weights, bias, acc_history\n# Cleanup\ndeployment_dir.cleanup()\n\n**6. Hyper-parameter Tuning with GridSearchCV (XGBClassifier.ipynb, DecisionTreeRegressor.ipynb):**\npython\nfrom sklearn.model_selection import GridSearchCV\nfrom concrete.ml.sklearn import XGBClassifier as ConcreteXGBClassifier\nfrom"} +{"text": "acc_history\n# Cleanup\ndeployment_dir.cleanup()\n\n**6. Hyper-parameter Tuning with GridSearchCV (XGBClassifier.ipynb, DecisionTreeRegressor.ipynb):**\npython\nfrom sklearn.model_selection import GridSearchCV\nfrom concrete.ml.sklearn import XGBClassifier as ConcreteXGBClassifier\nfrom sklearn.metrics import make_scorer, matthews_corrcoef\n#... (Data loading and preprocessing)...\n# Create scorer with the MCC metric\ngrid_scorer = make_scorer(matthews_corrcoef, greater_is_better=True)\n# Define the parameter grid to search\nparam_grid = {"} +{"text": "sklearn.metrics import make_scorer, matthews_corrcoef\n#... (Data loading and preprocessing)...\n# Create scorer with the MCC metric\ngrid_scorer = make_scorer(matthews_corrcoef, greater_is_better=True)\n# Define the parameter grid to search\nparam_grid = {\n \"n_bits\": [5, 6],\n \"max_depth\": [2, 3],\n \"n_estimators\": [10, 20, 50],\n}\n# Instantiate GridSearchCV with the Concrete ML model\ngrid_search = GridSearchCV(\n ConcreteXGBClassifier(),\n param_grid"} +{"text": "\"n_bits\": [5, 6],\n \"max_depth\": [2, 3],\n \"n_estimators\": [10, 20, 50],\n}\n# Instantiate GridSearchCV with the Concrete ML model\ngrid_search = GridSearchCV(\n ConcreteXGBClassifier(),\n param_grid,\n cv=5,\n scoring=grid_scorer,\n error_score=\"raise\",\n verbose=1,\n)\n# Run the grid search\ngrid_search.fit(x_train, y_train)\n# Get the best parameters\nbest_params = grid_search.best_params_\n# Create a new model with the best parameters"} +{"text": ",\n cv=5,\n scoring=grid_scorer,\n error_score=\"raise\",\n verbose=1,\n)\n# Run the grid search\ngrid_search.fit(x_train, y_train)\n# Get the best 
parameters\nbest_params = grid_search.best_params_\n# Create a new model with the best parameters\nbest_model = ConcreteXGBClassifier(**best_params)\nbest_model.fit(x_train, y_train)\n# Compile and proceed with FHE inference as shown in other examples\n\n**7. GLM Models (GLMComparison.ipynb):**\n* **Poisson Regressor**\npython\nfrom concrete"} +{"text": "best_model = ConcreteXGBClassifier(**best_params)\nbest_model.fit(x_train, y_train)\n# Compile and proceed with FHE inference as shown in other examples\n\n**7. GLM Models (GLMComparison.ipynb):**\n* **Poisson Regressor**\npython\nfrom concrete.ml.sklearn import PoissonRegressor as ConcretePoissonRegressor\n#... (Data loading and preprocessing)...\nconcrete_pr = ConcretePoissonRegressor(n_bits=8)\nconcrete_pr.fit(x_train, y_train, sample_weight=train_weights)\ncircuit = concrete_pr.compile(x_train)\n# Key generation"} +{"text": ".ml.sklearn import PoissonRegressor as ConcretePoissonRegressor\n#... (Data loading and preprocessing)...\nconcrete_pr = ConcretePoissonRegressor(n_bits=8)\nconcrete_pr.fit(x_train, y_train, sample_weight=train_weights)\ncircuit = concrete_pr.compile(x_train)\n# Key generation\ncircuit.client.keygen(force=False)\n# Inference in FHE\ny_pred_fhe = concrete_pr.predict(x_test, fhe=\"execute\")\n\n* **Gamma Regressor**\npython\nfrom concrete.ml.sklearn import GammaRegressor as ConcreteGammaRegressor\n#... (Data loading and preprocessing)..."} +{"text": "circuit.client.keygen(force=False)\n# Inference in FHE\ny_pred_fhe = concrete_pr.predict(x_test, fhe=\"execute\")\n\n* **Gamma Regressor**\npython\nfrom concrete.ml.sklearn import GammaRegressor as ConcreteGammaRegressor\n#... 
(Data loading and preprocessing)...\nconcrete_gr = ConcreteGammaRegressor(n_bits=8)\nconcrete_gr.fit(x_train, y_train, sample_weight=train_weights)\ncircuit = concrete_gr.compile(x_train)\n# Key generation\ncircuit.client.keygen(force=False)\n# Inference in FHE\ny_pred_fhe = concrete_gr.predict(x"} +{"text": "concrete_gr = ConcreteGammaRegressor(n_bits=8)\nconcrete_gr.fit(x_train, y_train, sample_weight=train_weights)\ncircuit = concrete_gr.compile(x_train)\n# Key generation\ncircuit.client.keygen(force=False)\n# Inference in FHE\ny_pred_fhe = concrete_gr.predict(x_test, fhe=\"execute\")\n\n* **Tweedie Regressor**\npython\nfrom concrete.ml.sklearn import TweedieRegressor as ConcreteTweedieRegressor\n#... (Data loading and preprocessing)...\nconcrete_tr = ConcreteTweedieRegressor(n_bits=8, power=1.9"} +{"text": "_test, fhe=\"execute\")\n\n* **Tweedie Regressor**\npython\nfrom concrete.ml.sklearn import TweedieRegressor as ConcreteTweedieRegressor\n#... (Data loading and preprocessing)...\nconcrete_tr = ConcreteTweedieRegressor(n_bits=8, power=1.9)\nconcrete_tr.fit(x_train, y_train, sample_weight=train_weights)\ncircuit = concrete_tr.compile(x_train)\n# Key generation\ncircuit.client.keygen(force=False)\n# Inference in FHE\ny_pred_fhe = concrete_tr.predict(x_test, fhe=\"execute\")\n\n**8. Fine"} +{"text": ")\nconcrete_tr.fit(x_train, y_train, sample_weight=train_weights)\ncircuit = concrete_tr.compile(x_train)\n# Key generation\ncircuit.client.keygen(force=False)\n# Inference in FHE\ny_pred_fhe = concrete_tr.predict(x_test, fhe=\"execute\")\n\n**8. Fine-tuning with LoRA (LoraMLP.ipynb):**\npython\nimport torch\nfrom peft import LoraConfig, get_peft_model\nfrom torch import nn, optim\nfrom concrete.ml.torch.lora import LoraTrainer\n#... 
(Data loading and preprocessing)...\n# Define"} +{"text": "-tuning with LoRA (LoraMLP.ipynb):**\npython\nimport torch\nfrom peft import LoraConfig, get_peft_model\nfrom torch import nn, optim\nfrom concrete.ml.torch.lora import LoraTrainer\n#... (Data loading and preprocessing)...\n# Define an MLP model without LoRA layers"} +{"text": "class SimpleMLP(nn.Module):"} +{"text": "def __init__(self, input_size=2, hidden_size=128, num_classes=2):\n super().__init__()\n self.fc1 = nn.Linear(input_size, hidden_size)\n self.relu = nn.ReLU()\n self.fc2 = nn.Linear(hidden_size, num_classes)"} +{"text": "def forward(self, x):\n out = self.fc1(x)\n out = self.relu(out)\n out = self.fc2(out)\n return out\n# Instantiate the model\nmodel = SimpleMLP()\n#... (Training loop for Task 1)...\n# Apply LoRA to the model using peft\nlora_config = LoraConfig(\n r=1, lora_alpha=1, lora_dropout=0.01, target_modules=[\"fc1\", \"fc2\"], bias=\"none\"\n)\npeft_model = get_peft_model(model, lora_config)\n# Update training parameters"} +{"text": "using peft\nlora_config = LoraConfig(\n r=1, lora_alpha=1, lora_dropout=0.01, target_modules=[\"fc1\", \"fc2\"], bias=\"none\"\n)\npeft_model = get_peft_model(model, lora_config)\n# Update training parameters, including loss function\noptimizer = optim.Adam(filter(lambda p: p.requires_grad, peft_model.parameters()), lr=0.01)\nloss_fn = nn.CrossEntropyLoss()\ntraining_args = {\"gradient_accumulation_steps\": 1}\n# Set up LoRA training\nlora_trainer = LoraTrainer"} +{"text": ", including loss function\noptimizer = optim.Adam(filter(lambda p: p.requires_grad, peft_model.parameters()), lr=0.01)\nloss_fn = nn.CrossEntropyLoss()\ntraining_args = {\"gradient_accumulation_steps\": 1}\n# Set up LoRA training\nlora_trainer = LoraTrainer(peft_model, optimizer=optimizer, loss_fn=loss_fn, training_args=training_args)\n# Prepare input data for calibration\nbatch_size_per_task = batch_size // 2\ninputset = (\n torch.cat([X_task1[:batch_size_per_task], 
X_task2[:batch_size_per_task]]"} +{"text": "(peft_model, optimizer=optimizer, loss_fn=loss_fn, training_args=training_args)\n# Prepare input data for calibration\nbatch_size_per_task = batch_size // 2\ninputset = (\n torch.cat([X_task1[:batch_size_per_task], X_task2[:batch_size_per_task]]),\n torch.cat([y_task1[:batch_size_per_task], y_task2[:batch_size_per_task]]),\n)\n# Compile the model\nlora_trainer.compile(inputset, n_bits=8)\n# Fine-tune the model on Task 2 using LoRA\nlora_trainer.train(train_loader"} +{"text": "),\n torch.cat([y_task1[:batch_size_per_task], y_task2[:batch_size_per_task]]),\n)\n# Compile the model\nlora_trainer.compile(inputset, n_bits=8)\n# Fine-tune the model on Task 2 using LoRA\nlora_trainer.train(train_loader_task2, num_epochs=10, fhe=\"execute\")\n# Enable/Disable LoRA adapters\npeft_model.enable_adapter_layers()\npeft_model.disable_adapter_layers()\n# Print trainable (lora) parameters\npeft_model.print_trainable_parameters()\n# Save the model and remove all layers that will be done"} +{"text": "_task2, num_epochs=10, fhe=\"execute\")\n# Enable/Disable LoRA adapters\npeft_model.enable_adapter_layers()\npeft_model.disable_adapter_layers()\n# Print trainable (lora) parameters\npeft_model.print_trainable_parameters()\n# Save the model and remove all layers that will be done on the server\npath = Path(\"lora_mlp\")\nif path.is_dir() and any(path.iterdir()):\n shutil.rmtree(path)\nlora_trainer.save_and_clear_private_info(path)"} diff --git a/use_case_examples/lora_finetuning/data_finetune/raw_cml_1.7.0_examples.txt b/use_case_examples/lora_finetuning/data_finetune/raw_cml_1.7.0_examples.txt new file mode 100644 index 000000000..6adba5a62 --- /dev/null +++ b/use_case_examples/lora_finetuning/data_finetune/raw_cml_1.7.0_examples.txt @@ -0,0 +1,458 @@ +**1. Linear Models:** +* **Logistic Regression:** +python +from concrete.ml.sklearn import LogisticRegression as ConcreteLogisticRegression +# ... (Data loading and preprocessing) ... 
+concrete_logr = ConcreteLogisticRegression(n_bits=8) +concrete_logr.fit(x_train, y_train) +fhe_circuit = concrete_logr.compile(x_train) +# Key generation +fhe_circuit.client.keygen(force=False) +# Inference in FHE +y_pred_fhe = concrete_logr.predict(x_test, fhe="execute") + +* **Linear Regression:** +python +from concrete.ml.sklearn import LinearRegression as ConcreteLinearRegression +# ... (Data loading and preprocessing) ... +concrete_lr = ConcreteLinearRegression(n_bits=8) +concrete_lr.fit(x_train, y_train) +fhe_circuit = concrete_lr.compile(x_train) +# Key generation +fhe_circuit.client.keygen(force=False) +# Inference in FHE +y_pred_fhe = concrete_lr.predict(x_test, fhe="execute") + +* **Linear SVR:** +python +from concrete.ml.sklearn.svm import LinearSVR as ConcreteLinearSVR +# ... (Data loading and preprocessing) ... +concrete_svr = ConcreteLinearSVR(n_bits=8, C=0.5) +concrete_svr.fit(x_train, y_train) +circuit = concrete_svr.compile(x_train) +# Key generation +circuit.client.keygen(force=False) +# Inference in FHE +y_pred_fhe = concrete_svr.predict(x_test, fhe="execute") + +* **Linear SVC** +python +from concrete.ml.sklearn.svm import LinearSVC as ConcreteLinearSVC +# ... (Data loading and preprocessing) ... +concrete_svc = ConcreteLinearSVC(n_bits=8, C=0.025) +concrete_svc.fit(x_train, y_train) +circuit = concrete_svc.compile(x_train) +# Inference in FHE +y_pred_fhe = concrete_svc.predict(x_test, fhe="execute") + +**2. Tree-Based Models:** +* **XGBoost Classifier:** +python +from concrete.ml.sklearn import XGBClassifier as ConcreteXGBClassifier +# ... (Data loading and preprocessing) ... 
+concrete_xgb = ConcreteXGBClassifier(n_bits=6, n_estimators=50, max_depth=4) +concrete_xgb.fit(x_train, y_train) +circuit = concrete_xgb.compile(x_train) +# Key generation +circuit.client.keygen(force=False) +# Inference in FHE +y_preds_fhe = concrete_xgb.predict(x_test, fhe="execute") + +* **XGBoost Regressor:** +python +from concrete.ml.sklearn import XGBRegressor as ConcreteXGBRegressor +# ... (Data loading and preprocessing) ... +concrete_xgb = ConcreteXGBRegressor(n_bits=6, n_estimators=50, max_depth=4) +concrete_xgb.fit(x_train, y_train) +circuit = concrete_xgb.compile(x_train) +# Key generation +circuit.client.keygen(force=False) +# Inference in FHE +y_preds_fhe = concrete_xgb.predict(x_test, fhe="execute") + +* **Decision Tree Classifier:** +python +from concrete.ml.sklearn import DecisionTreeClassifier as ConcreteDecisionTreeClassifier +# ... (Data loading and preprocessing) ... +model = ConcreteDecisionTreeClassifier( + max_features="log2", + min_samples_leaf=1, + min_samples_split=2, + max_depth=6, + n_bits=6, +) +model.fit(x_train, y_train) +circuit = model.compile(x_train) +# Key generation +circuit.client.keygen(force=False) +# Inference in FHE +y_pred_fhe = model.predict(x_test, fhe="execute") + +* **Decision Tree Regressor:** +python +from concrete.ml.sklearn import DecisionTreeRegressor as ConcreteDecisionTreeRegressor +# ... (Data loading and preprocessing) ... +model = ConcreteDecisionTreeRegressor( + max_depth=10, + max_features=5, + min_samples_leaf=2, + min_samples_split=10, + n_bits=6, + random_state=42, +) +model.fit(x_train, y_train) +circuit = model.compile(x_train) +# Key generation +circuit.client.keygen(force=False) +# Inference in FHE +y_pred_fhe = model.predict(x_test, fhe="execute") + +* **Random Forest Classifier:** +python +from concrete.ml.sklearn import RandomForestClassifier +# ... (Data loading and preprocessing) ... 
+model = RandomForestClassifier(max_depth=4, n_estimators=5, n_bits=5) +model.fit(x_train, y_train) +circuit = model.compile(x_train) +# Key generation +circuit.client.keygen(force=False) +# Inference in FHE +y_pred_fhe = model.predict(x_test, fhe="execute") + +* **Random Forest Regressor:** +python +from concrete.ml.sklearn import RandomForestRegressor +# ... (Data loading and preprocessing) ... +model = RandomForestRegressor(n_bits=5, n_estimators=50, max_depth=4) +model.fit(x_train, y_train) +circuit = model.compile(x_train) +# Key generation +circuit.client.keygen(force=False) +# Inference in FHE +y_pred_fhe = model.predict(x_test, fhe="execute") + +**3. Neural Networks:** +* **Fully Connected Neural Network:** +python +from torch import nn +from concrete.ml.sklearn import NeuralNetClassifier +# ... (Data loading and preprocessing) ... +parameters_neural_net = { + "module__n_w_bits": 2, + "module__n_a_bits": 4, + "module__n_accum_bits": 32, + "module__n_hidden_neurons_multiplier": 6, + "module__n_layers": 2, # 1 hidden layer + "module__activation_function": nn.ReLU, + "max_epochs": 400, + "verbose": 0, + "lr": 0.001, +} +model = NeuralNetClassifier(batch_size=32, **parameters_neural_net) +model.fit(X=x_train, y=y_train) +fhe_circuit = model.compile(x_train) +# Key generation +fhe_circuit.client.keygen(force=False) +# Inference in FHE +y_pred_fhe = model.predict(x_test, fhe="execute") + +* **Convolutional Neural Network:** +python +import torch +from torch import nn +from concrete.ml.torch.compile import compile_torch_model +# ... (Data loading and preprocessing) ... 
+class TinyCNN(nn.Module): + def __init__(self, n_classes) -> None: + super().__init__() + self.conv1 = nn.Conv2d(1, 8, 3, stride=1, padding=0) + self.conv2 = nn.Conv2d(8, 16, 3, stride=2, padding=0) + self.conv3 = nn.Conv2d(16, 32, 2, stride=1, padding=0) + self.fc1 = nn.Linear(32, n_classes) + def forward(self, x): + x = self.conv1(x) + x = torch.relu(x) + x = self.conv2(x) + x = torch.relu(x) + x = self.conv3(x) + x = torch.relu(x) + x = x.flatten(1) + x = self.fc1(x) + return x +net = TinyCNN(10) +# ... (Training loop) ... +q_module = compile_torch_model(net, x_train, rounding_threshold_bits=6, p_error=0.1) +# Key generation +q_module.fhe_circuit.keygen() +# Inference in FHE +y_pred_fhe = q_module.forward(x_test, fhe="execute") + +**4. Quantization-Aware Training:** +python +from torch import nn +from concrete.ml.torch.compile import compile_brevitas_qat_model +import brevitas.nn as qnn +from brevitas.core.bit_width import BitWidthImplType +from brevitas.core.quant import QuantType +from brevitas.core.restrict_val import FloatToIntImplType, RestrictValueType +from brevitas.core.scaling import ScalingImplType +from brevitas.core.zero_point import ZeroZeroPoint +from brevitas.inject import ExtendedInjector +from brevitas.quant.solver import ActQuantSolver, WeightQuantSolver +from dependencies import value +from torch.nn.utils import prune +# ... (Data loading and preprocessing) ... 
+class CommonQuant(ExtendedInjector): + bit_width_impl_type = BitWidthImplType.CONST + scaling_impl_type = ScalingImplType.CONST + restrict_scaling_type = RestrictValueType.FP + zero_point_impl = ZeroZeroPoint + float_to_int_impl_type = FloatToIntImplType.ROUND + scaling_per_output_channel = False + narrow_range = True + signed = True + @value + def quant_type(bit_width): # pylint: disable=no-self-argument + if bit_width is None: + return QuantType.FP + if bit_width == 1: + return QuantType.BINARY + return QuantType.INT +class CommonWeightQuant(CommonQuant, WeightQuantSolver): # pylint: disable=too-many-ancestors + scaling_const = 1.0 + signed = True +class CommonActQuant(CommonQuant, ActQuantSolver): # pylint: disable=too-many-ancestors + min_val = -1.0 + max_val = 1.0 +class QATPrunedSimpleNet(nn.Module): + def __init__(self, n_hidden, qlinear_args, qidentity_args): + super().__init__() + self.pruned_layers = set() + self.quant_inp = qnn.QuantIdentity(**qidentity_args) + self.fc1 = qnn.QuantLinear(IN_FEAT, n_hidden, **qlinear_args) + self.relu1 = qnn.QuantReLU(bit_width=qidentity_args["bit_width"]) + self.fc2 = qnn.QuantLinear(n_hidden, n_hidden, **qlinear_args) + self.relu2 = qnn.QuantReLU(bit_width=qidentity_args["bit_width"]) + self.fc3 = qnn.QuantLinear(n_hidden, OUT_FEAT, **qlinear_args) + for m in self.modules(): + if isinstance(m, qnn.QuantLinear): + torch.nn.init.uniform_(m.weight.data, -1, 1) + def forward(self, x): + x = self.quant_inp(x) + x = self.relu1(self.fc1(x)) + x = self.relu2(self.fc2(x)) + x = self.fc3(x) + return x + def prune(self, max_non_zero): + # Linear layer weight has dimensions NumOutputs x NumInputs + for name, layer in self.named_modules(): + if isinstance(layer, qnn.QuantLinear): + num_zero_weights = (layer.weight.shape[1] - max_non_zero) * layer.weight.shape[0] + if num_zero_weights <= 0: + continue + print(f"Pruning layer {name} factor {num_zero_weights}") + prune.l1_unstructured(layer, "weight", amount=num_zero_weights) + 
self.pruned_layers.add(name) + def unprune(self): + for name, layer in self.named_modules(): + if name in self.pruned_layers: + prune.remove(layer, "weight") + self.pruned_layers.remove(name) +torch_model = QATPrunedSimpleNet( + n_hidden=n_hidden, + qlinear_args={ + "weight_bit_width": 3, + "weight_quant": CommonWeightQuant, + "bias": True, + "bias_quant": None, + "narrow_range": True, + }, + qidentity_args={"bit_width": 3, "act_quant": CommonActQuant}, +) +torch_model.prune(20) +# ... (Training loop) ... +quantized_numpy_module = compile_brevitas_qat_model(torch_model, x_train) +# Inference in FHE (simulation) +y_pred_fhe = quantized_numpy_module.forward(x_test, fhe="simulate") + +**5. Client/Server Deployment (LogisticRegressionTraining.ipynb):** +python +from pathlib import Path +from tempfile import TemporaryDirectory +import numpy as np +from concrete.ml.deployment import FHEModelClient, FHEModelDev, FHEModelServer +from concrete.ml.sklearn import SGDClassifier +from concrete import fhe +# ... (Data loading, preprocessing, and model training) ... 
+# Assuming you have a trained model: sgd_clf_binary_fhe +# and x_compile_set, y_compile_set for compilation +# Define the directory where to save the deployment files +DEPLOYMENT_PATH = Path("fhe_training") +DEPLOYMENT_PATH.mkdir(exist_ok=True) +deployment_dir = TemporaryDirectory(dir=str(DEPLOYMENT_PATH)) +deployment_path = Path(deployment_dir.name) +# Save the model for deployment +fhe_dev = FHEModelDev(deployment_path, sgd_clf_binary_fhe) +fhe_dev.save(mode="training") +# Client-side setup +fhe_client = FHEModelClient(deployment_path) +fhe_client.load() +serialized_evaluation_keys = fhe_client.get_serialized_evaluation_keys() +# Server-side setup +fhe_server = FHEModelServer(deployment_path) +fhe_server.load() +# Example of encryption, server-side processing, and decryption +batch_size = sgd_clf_binary_fhe.batch_size +weights = np.random.rand(1, x_train.shape[1], 1) +bias = np.random.rand(1, 1, 1) +def quantize_encrypt_serialize_batches(fhe_client, x, y, weights, bias, batch_size): + # ... (Implementation as before) ... +def server_run(fhe_server, x_batches_enc, y_batches_enc, weights_enc, bias_enc, evaluation_keys): + # ... (Implementation as before) ... +def train_fhe_client_server( + # ... (Parameters as before) ... +): + # ... 
(Training loop) + # Quantize, encrypt and serialize the batched inputs as well as the weight and bias values + x_batches_enc, y_batches_enc, weights_enc, bias_enc = quantize_encrypt_serialize_batches( + fhe_client, x, y, weights, bias, batch_size + ) + # Iterate the circuit over the batches on the server + fitted_weights_enc, fitted_bias_enc = server_run( + fhe_server, + x_batches_enc, + y_batches_enc, + weights_enc, + bias_enc, + serialized_evaluation_keys, + ) + # Back on the client, deserialize, decrypt and de-quantize the fitted weight and bias values + weights, bias = fhe_client.deserialize_decrypt_dequantize( + fitted_weights_enc, fitted_bias_enc + ) + return weights, bias, acc_history +# Cleanup +deployment_dir.cleanup() + +**6. Hyper-parameter Tuning with GridSearchCV (XGBClassifier.ipynb, DecisionTreeRegressor.ipynb):** +python +from sklearn.model_selection import GridSearchCV +from concrete.ml.sklearn import XGBClassifier as ConcreteXGBClassifier +from sklearn.metrics import make_scorer, matthews_corrcoef +# ... (Data loading and preprocessing) ... +# Create scorer with the MCC metric +grid_scorer = make_scorer(matthews_corrcoef, greater_is_better=True) +# Define the parameter grid to search +param_grid = { + "n_bits": [5, 6], + "max_depth": [2, 3], + "n_estimators": [10, 20, 50], +} +# Instantiate GridSearchCV with the Concrete ML model +grid_search = GridSearchCV( + ConcreteXGBClassifier(), + param_grid, + cv=5, + scoring=grid_scorer, + error_score="raise", + verbose=1, +) +# Run the grid search +grid_search.fit(x_train, y_train) +# Get the best parameters +best_params = grid_search.best_params_ +# Create a new model with the best parameters +best_model = ConcreteXGBClassifier(**best_params) +best_model.fit(x_train, y_train) +# Compile and proceed with FHE inference as shown in other examples + +**7. GLM Models (GLMComparison.ipynb):** +* **Poisson Regressor** +python +from concrete.ml.sklearn import PoissonRegressor as ConcretePoissonRegressor +# ... 
(Data loading and preprocessing) ... +concrete_pr = ConcretePoissonRegressor(n_bits=8) +concrete_pr.fit(x_train, y_train, sample_weight=train_weights) +circuit = concrete_pr.compile(x_train) +# Key generation +circuit.client.keygen(force=False) +# Inference in FHE +y_pred_fhe = concrete_pr.predict(x_test, fhe="execute") + +* **Gamma Regressor** +python +from concrete.ml.sklearn import GammaRegressor as ConcreteGammaRegressor +# ... (Data loading and preprocessing) ... +concrete_gr = ConcreteGammaRegressor(n_bits=8) +concrete_gr.fit(x_train, y_train, sample_weight=train_weights) +circuit = concrete_gr.compile(x_train) +# Key generation +circuit.client.keygen(force=False) +# Inference in FHE +y_pred_fhe = concrete_gr.predict(x_test, fhe="execute") + +* **Tweedie Regressor** +python +from concrete.ml.sklearn import TweedieRegressor as ConcreteTweedieRegressor +# ... (Data loading and preprocessing) ... +concrete_tr = ConcreteTweedieRegressor(n_bits=8, power=1.9) +concrete_tr.fit(x_train, y_train, sample_weight=train_weights) +circuit = concrete_tr.compile(x_train) +# Key generation +circuit.client.keygen(force=False) +# Inference in FHE +y_pred_fhe = concrete_tr.predict(x_test, fhe="execute") + +**8. Fine-tuning with LoRA (LoraMLP.ipynb):** +python +import torch +from peft import LoraConfig, get_peft_model +from torch import nn, optim +from concrete.ml.torch.lora import LoraTrainer +# ... (Data loading and preprocessing) ... +# Define an MLP model without LoRA layers +class SimpleMLP(nn.Module): + def __init__(self, input_size=2, hidden_size=128, num_classes=2): + super().__init__() + self.fc1 = nn.Linear(input_size, hidden_size) + self.relu = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, num_classes) + def forward(self, x): + out = self.fc1(x) + out = self.relu(out) + out = self.fc2(out) + return out +# Instantiate the model +model = SimpleMLP() +# ... (Training loop for Task 1) ... 
+# Apply LoRA to the model using peft +lora_config = LoraConfig( + r=1, lora_alpha=1, lora_dropout=0.01, target_modules=["fc1", "fc2"], bias="none" +) +peft_model = get_peft_model(model, lora_config) +# Update training parameters, including loss function +optimizer = optim.Adam(filter(lambda p: p.requires_grad, peft_model.parameters()), lr=0.01) +loss_fn = nn.CrossEntropyLoss() +training_args = {"gradient_accumulation_steps": 1} +# Set up LoRA training +lora_trainer = LoraTrainer(peft_model, optimizer=optimizer, loss_fn=loss_fn, training_args=training_args) +# Prepare input data for calibration +batch_size_per_task = batch_size // 2 +inputset = ( + torch.cat([X_task1[:batch_size_per_task], X_task2[:batch_size_per_task]]), + torch.cat([y_task1[:batch_size_per_task], y_task2[:batch_size_per_task]]), +) +# Compile the model +lora_trainer.compile(inputset, n_bits=8) +# Fine-tune the model on Task 2 using LoRA +lora_trainer.train(train_loader_task2, num_epochs=10, fhe="execute") +# Enable/Disable LoRA adapters +peft_model.enable_adapter_layers() +peft_model.disable_adapter_layers() +# Print trainable (lora) parameters +peft_model.print_trainable_parameters() +# Save the model and remove all layers that will be done on the server +path = Path("lora_mlp") +if path.is_dir() and any(path.iterdir()): + shutil.rmtree(path) +lora_trainer.save_and_clear_private_info(path) diff --git a/use_case_examples/lora_finetuning/requirements.txt b/use_case_examples/lora_finetuning/requirements.txt index 7ea93063a..e99a87ffe 100644 --- a/use_case_examples/lora_finetuning/requirements.txt +++ b/use_case_examples/lora_finetuning/requirements.txt @@ -4,5 +4,6 @@ peft==0.11.1 Jinja2==3.1.4 matplotlib==3.7.5 datasets==3.0.1 +accelerate==1.2.0 jupyter==1.0.0 tqdm==4.66.5 \ No newline at end of file diff --git a/use_case_examples/lora_finetuning/scripts/create_dataset.py b/use_case_examples/lora_finetuning/scripts/create_dataset.py new file mode 100644 index 000000000..091f33e71 --- /dev/null +++ 
b/use_case_examples/lora_finetuning/scripts/create_dataset.py
@@ -0,0 +1,129 @@
+"""Create the JSONL fine-tuning dataset from the raw Concrete ML examples file."""
+
+import json
+import re
+from pathlib import Path
+
+from transformers import AutoTokenizer
+
+
+def init_tokenizer():
+    """Load the Llama 3.2 1B tokenizer used to measure chunk sizes."""
+    return AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
+
+
+def chunk_text_by_tokens(text, tokenizer, max_tokens=128):
+    """Split text into overlapping chunks of at most max_tokens tokens.
+
+    Consecutive chunks overlap by max_tokens // 2 tokens. Chunks that decode to
+    whitespace only are dropped.
+
+    Args:
+        text (str): The text to split.
+        tokenizer: Tokenizer exposing encode(text) and decode(ids, skip_special_tokens=...).
+        max_tokens (int): Maximum number of tokens per chunk. Must be at least 1.
+
+    Returns:
+        list[str]: The decoded chunks, in order.
+
+    Raises:
+        ValueError: If max_tokens is smaller than 1.
+    """
+    if max_tokens < 1:
+        raise ValueError(f"max_tokens must be at least 1, got {max_tokens}")
+
+    overlap_tokens = max_tokens // 2
+    # The stride is always >= 1 because overlap_tokens < max_tokens
+    stride = max_tokens - overlap_tokens
+    tokens = tokenizer.encode(text)
+    chunks = []
+
+    for start_idx in range(0, len(tokens), stride):
+        # Fewer than overlap_tokens tokens left: the previous chunk already covers them
+        if start_idx > 0 and len(tokens) - start_idx < overlap_tokens:
+            break
+
+        chunk_text = tokenizer.decode(
+            tokens[start_idx : start_idx + max_tokens], skip_special_tokens=True
+        )
+        if chunk_text.strip():
+            chunks.append(chunk_text)
+
+    return chunks
+
+
+def split_code_into_snippets(code):
+    """Split code at each line that starts a function or class definition.
+
+    Text located before the first definition is kept as its own snippet, and text
+    without any definition is returned as a single snippet, so that no content is lost.
+
+    Args:
+        code (str): The source text to split.
+
+    Returns:
+        list[str]: The non-empty snippets, in order. Joining them gives back the input.
+    """
+    pattern = re.compile(r"^\s*(def |class )", re.MULTILINE)
+    boundaries = [0]
+    boundaries.extend(match.start() for match in pattern.finditer(code) if match.start() > 0)
+    boundaries.append(len(code))
+    return [code[start:end] for start, end in zip(boundaries, boundaries[1:]) if start < end]
+
+
+def _chunk_snippets(snippets, tokenizer, max_tokens):
+    """Split every snippet into token-bounded chunks and return them as one flat list."""
+    chunks = []
+    for snippet in snippets:
+        chunks.extend(chunk_text_by_tokens(snippet, tokenizer, max_tokens))
+    return chunks
+
+
+def process_code_file(code_file_path, tokenizer, max_tokens=128):
+    """Read a code file and split it into chunks of at most max_tokens tokens.
+
+    The file is first split at function and class definitions, then each part is chunked.
+    """
+    code = Path(code_file_path).read_text(encoding="utf-8")
+    return _chunk_snippets(split_code_into_snippets(code), tokenizer, max_tokens)
+
+
+def process_documentation_file(doc_file_path, tokenizer, max_tokens=128):
+    """Read a documentation file and split it into chunks of at most max_tokens tokens.
+
+    The file is first split into blank-line separated paragraphs, then each one is chunked.
+    """
+    documentation = Path(doc_file_path).read_text(encoding="utf-8")
+    return _chunk_snippets(documentation.split("\n\n"), tokenizer, max_tokens)
+
+
+def save_to_jsonl(snippets, output_file_path):
+    """Write the non-empty snippets to output_file_path, one JSON object per line.
+
+    Each line is a JSON object with a single "text" key holding the snippet, stripped
+    of its surrounding whitespace.
+    """
+    with open(output_file_path, "w", encoding="utf-8") as outfile:
+        for snippet in snippets:
+            snippet = snippet.strip()
+            if snippet:
+                outfile.write(json.dumps({"text": snippet}) + "\n")
+
+
+def main():
+    """Generate data_finetune/dataset.jsonl from the raw Concrete ML examples file."""
+    # Resolve paths relative to this script so that it can be run from any directory
+    script_dir = Path(__file__).resolve().parent
+    data_dir = script_dir.parent / "data_finetune"
+    code_file_path = data_dir / "raw_cml_1.7.0_examples.txt"
+    output_file_path = data_dir / "dataset.jsonl"
+
+    tokenizer = init_tokenizer()
+    code_snippets = process_code_file(code_file_path, tokenizer, max_tokens=128)
+
+    save_to_jsonl(code_snippets, output_file_path)
+    print(f"Dataset saved to {output_file_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/use_case_examples/lora_finetuning/utils_lora.py b/use_case_examples/lora_finetuning/utils_lora.py
index 1cad80804..0ffd40d4c 100644
--- a/use_case_examples/lora_finetuning/utils_lora.py
+++ b/use_case_examples/lora_finetuning/utils_lora.py
@@ -6,6 +6,31 @@
 import numpy as np
 import torch
 import torch.backends.cudnn as cudnn
+from transformers.generation.stopping_criteria import (
+    StoppingCriteria,
+    StoppingCriteriaList,
+)
+
+
+class NewlineStopping(StoppingCriteria):
+    """Stopping criterion that ends generation when a newline token is produced."""
+
+    def __init__(self, tokenizer):
+        super().__init__()
+        self.tokenizer = tokenizer
+        # Collect the token ids of the newline sequences. Special tokens are disabled,
+        # else a tokenizer that prepends a BOS token (e.g. Llama) would give the BOS id.
+        # The last id is used in case the sequence is split into several tokens.
+        self.newline_tokens = set()
+        for newline in ("\n", "\r", "\r\n"):
+            token_ids = self.tokenizer.encode(newline, add_special_tokens=False)
+            if token_ids:
+                self.newline_tokens.add(token_ids[-1])
+
+    def __call__(self, input_ids, scores, **kwargs):
+        # Only the last token of the first sequence in the batch is checked
+        last_token = input_ids[0][-1].item()
+        return last_token in self.newline_tokens
 
 
 def generate_and_print(prompt, model, tokenizer, seed=None, max_new_tokens=30):
@@ -54,8 +79,11 @@ def generate_and_print(prompt, model, tokenizer, seed=None, max_new_tokens=30):
     if generated_text.startswith(prompt):
         generated_text = generated_text[len(prompt) :].strip()
 
-    # Print the user prompt and the generated text separated by a newline
-    print(f"{prompt}\n{generated_text}")
+    # Only keep text up to the first newline
+    generated_text = generated_text.split("\n")[0]
+
+    # Print the prompt and generated text on the same line
+    print(f"{prompt} {generated_text}")
 
 
 def print_weights_and_size(model, print_detail=False):

From 439bbfb52478c6e89c6c3a95d97b86125db6a2af Mon Sep 17 00:00:00 2001
From: jfrery
Date: Tue, 10 Dec 2024 18:56:23 +0100
Subject: [PATCH 02/11] chore: fix pcc

---
 src/concrete/ml/torch/lora.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/concrete/ml/torch/lora.py b/src/concrete/ml/torch/lora.py
index f516816ec..b7e72021d 100644
--- a/src/concrete/ml/torch/lora.py
+++ b/src/concrete/ml/torch/lora.py
@@ -317,7 +317,7 @@ def train(
                         for item in batch
                     )
                 else:
-                    # If it's a single non-tensor item, wrap it in a tuple
+                    # If it is a single non-tensor item, wrap it in a tuple
                     batch = (batch,)
 
                 # Forward pass through the hybrid model

From f3a4a70e05b88f17b050d0443517d697de6b0ac6 Mon Sep 17 00:00:00 2001
From: jfrery
Date: Tue, 10 Dec 2024 20:20:29 +0100
Subject: [PATCH 03/11] chore: fix api update in codeblock

---
 docs/deep-learning/lora_training.md | 9 ---------
 1
file changed, 9 deletions(-) diff --git a/docs/deep-learning/lora_training.md b/docs/deep-learning/lora_training.md index 7166cdea2..781f11af9 100644 --- a/docs/deep-learning/lora_training.md +++ b/docs/deep-learning/lora_training.md @@ -88,15 +88,6 @@ parameters: ```python lora_training = LoraTraining(peft_model) - - -# Update training parameters, including loss function -lora_training.update_training_parameters( - optimizer=optim.Adam(filter(lambda p: p.requires_grad, peft_model.parameters()), lr=0.01), - loss_fn=nn.CrossEntropyLoss(), - training_args={"gradient_accumulation_steps": 1}, -) - ``` ### 3. Compile a hybrid FHE model for the LORA adapted PyTorch model From 14c7be1e5a6ac94e043f5e8fc36759f739b76bc6 Mon Sep 17 00:00:00 2001 From: jfrery Date: Tue, 10 Dec 2024 22:40:19 +0100 Subject: [PATCH 04/11] chore: update codeblock + doc --- docs/deep-learning/lora_training.md | 97 +++++++++++++++-------------- src/concrete/ml/torch/lora.py | 15 ++++- tests/torch/test_lora.py | 31 +++++++-- 3 files changed, 89 insertions(+), 54 deletions(-) diff --git a/docs/deep-learning/lora_training.md b/docs/deep-learning/lora_training.md index 781f11af9..fa48c8f83 100644 --- a/docs/deep-learning/lora_training.md +++ b/docs/deep-learning/lora_training.md @@ -7,22 +7,20 @@ Small models can be fine-tuned using a single-client/single-server setup. For la ## Overview {% hint style="info" %} -Refer to [this notebook](../advanced_examples/LoraMLP.ipynb) to see the tutorial about applying FHE LORA fine-tuning to a small neural network. +Refer to [this notebook](../advanced_examples/LoraMLP.ipynb) to see the tutorial about applying FHE LoRA fine-tuning to a small neural network. {% endhint %} -Concrete ML supports LORA, a parameter efficient fine-tuning (PEFT) approach, in the [hybrid model](../guides/hybrid-models.md) paradigm. LORA adds adapters, which contain a low number of fine-tunable weights, to the linear layers in an original model. 
+Concrete ML supports LoRA, a parameter-efficient fine-tuning (PEFT) approach, in the [hybrid model](../guides/hybrid-models.md) paradigm. LoRA adds adapter layers, which contain a small number of trainable parameters, to the linear layers of a base model. -In this setup, Concrete ML outsources the forward and backward passes of the model's original logic to one or more remote servers. Meanwhile, the forward and backward passes over the LORA weights, the loss computation and the weight updates are performed by the client side. As the number of LORA weights is low, this does not significantly increase the computational load for the model training client machine. For large LLMs, over 99% of the model's weights can be outsourced. +In this setup, Concrete ML outsources the computationally intensive parts of forward and backward passes for large models to one or more remote servers. The training client machine only handles the LoRA-adapter forward/backward passes, loss computation, and adapter weight updates. Since the LoRA adapters are small, this additional computation on the client side is minimal. For large LLMs, over 99% of the model's weights can remain outsourced. The main benefit of hybrid-model LORA training is outsourcing the computation of linear layers, which are typically large in LLMs. These layers require substantial hardware for inference and gradient computation. By securely outsourcing this work, Concrete ML removes the memory bottleneck that previously limited such operations. ## Usage -Concrete ML integrates with the [`peft` package](https://huggingface.co/docs/peft/index), -which adds LORA layer adapters to a model's linear layers. Here are the steps to convert -a model to hybrid FHE LORA training. +Concrete ML integrates with the [`peft` package](https://huggingface.co/docs/peft/index) to add LoRA adapters to a model's linear layers. Below are the steps to convert a model into a hybrid FHE LoRA training setup. -### 1. 
Apply the `peft` LORA layers +### 1. Apply the `peft` LoRA layers The `LoraConfig` class from the `peft` package contains the various LORA parameters. You can specify which layers have LORA adapters through the `target_modules` argument. For a detailed reference of the various configuration options, refer to the @@ -31,9 +29,10 @@ documentation. ```python import torch +import torch.nn.functional as F from torch import nn, optim from peft import LoraConfig, get_peft_model -from concrete.ml.torch.lora import LoraTraining, get_remote_names +from concrete.ml.torch.lora import LoraTrainer from concrete.ml.torch.hybrid_model import HybridFHEModel from sklearn.datasets import make_circles from torch.utils.data import DataLoader, TensorDataset @@ -54,81 +53,85 @@ class SimpleMLP(nn.Module): out = self.fc2(out) return out +# Create an initial model +model = SimpleMLP() + +# Apply LoRA configuration lora_config = LoraConfig( - r=1, lora_alpha=1, lora_dropout=0.01, target_modules=["fc1", "fc2"], bias="none" + r=1, + lora_alpha=1, + lora_dropout=0.01, + target_modules=["fc1", "fc2"], + bias="none" ) -model = SimpleMLP() -# The initial training loop of the model should be -# added at this point on an initial data-set +peft_model = get_peft_model(model, lora_config) -# A second data-set, task2 is generated +# Generate a second dataset for demonstration purposes X_task2, y_task2 = make_circles(n_samples=32, noise=0.2, factor=0.5) train_loader_task2 = DataLoader( TensorDataset(torch.Tensor(X_task2), torch.LongTensor(y_task2)), batch_size=32, shuffle=True ) - -# Apply LoRA to the model -peft_model = get_peft_model(model, lora_config) ``` ### 2. Convert the LORA model to use custom Concrete ML layers -Concrete ML requires converting the `peft` model to add -FHE compatible layers. In this step, you can configure several fine-tuning -parameters: +Next, we need to integrate the LoRA-adapted `peft_model` into the Concrete ML hybrid FHE training framework. 
This is done using the `LoraTrainer` class, which handles the logic of encrypting outsourced computations, running the forward and backward passes, and updating the LoRA adapter weights. + +You can configure: -- The number of gradient accumulation steps: LORA commonly accumulate gradients over several gradient descent steps before updating weights. -- The optimizer parameters -- The loss function +- The loss function. +- The optimizer and its parameters. +- Gradient accumulation steps (if needed). ```python -lora_training = LoraTraining(peft_model) +# Define a simple loss function +def simple_loss(outputs, targets): + return F.cross_entropy(outputs, targets) + +# Create an Adam optimizer +optimizer = optim.Adam(peft_model.parameters(), lr=1e-3) + +# Initialize trainer with the loss and optimizer +lora_trainer = LoraTrainer( + peft_model, + optimizer=optimizer, + loss_fn=simple_loss, +) ``` ### 3. Compile a hybrid FHE model for the LORA adapted PyTorch model -Compile the hybrid FHE model to convert the selected outsourced layers to use FHE, while the rest will run on the client side. Note that the exchange of encrypted activations and gradients may require significant bandwidth. +Before training in FHE, we need to compile the model. Compilation calibrates and converts the outsourced linear layers to their FHE equivalents. The compile method uses representative data for this step. 
```python -# Find layers that can be outsourced -remote_names = get_remote_names(lora_training) - -# Build the hybrid FHE model -hybrid_model = HybridFHEModel(lora_training, module_names=remote_names) - # Build a representative data-set for compilation inputset = ( torch.Tensor(X_task2[:16]), torch.LongTensor(y_task2[:16]), ) -# Calibrate and compile the model -hybrid_model.model.toggle_calibrate(enable=True) -hybrid_model.compile_model(inputset, n_bits=8) -hybrid_model.model.toggle_calibrate(enable=False) +# Calibrate and compile the model with 8-bit quantization +lora_trainer.compile(inputset, n_bits=8) ``` +At this point, the trainer has a hybrid FHE model ready for encrypted execution of the outsourced layers. The LoRA layers remain on the client side in the clear. + ### 4. Train the model on private data -Finally, the hybrid model can be trained, similar to training a PyTorch model. The client handles training data batches generation and iteration. +You can now train the hybrid FHE model with your private data. The train method will run forward and backward passes, updating only the LoRA adapter weights locally while securely outsourcing the main layers’ computations. ```python -# Assume train_loader is a torch.DataLoader - -hybrid_model.model.inference_model.train() -hybrid_model.model.toggle_run_optimizer(enable=True) - -for x_batch, y_batch in train_loader_task2: - loss, _ = hybrid_model((x_batch, y_batch), fhe="execute") +# Train in FHE mode +lora_trainer.train(train_loader_task2, fhe="execute") ``` ## Additional options @@ -136,12 +139,12 @@ for x_batch, y_batch in train_loader_task2: ### Inference Once fine-tuned, the LORA hybrid FHE model can perform inference only, through the -`model.inference_model` attribute of the hybrid FHE model. +`peft_model` attribute of the hybrid FHE model. 
```python -hybrid_model.model.inference_model(x) +peft_model(x) ``` ### Toggle LORA layers @@ -151,9 +154,9 @@ To compare to the original model, you can disable the LORA weights to use the or ```python -hybrid_model.model.inference_model.disable_adapter_layers() -hybrid_model.model.inference_model(x) +peft_model.disable_adapter_layers() +peft_model(x) # Re-enable the LORA weights -hybrid_model.model.inference_model.enable_adapter_layers() +peft_model.enable_adapter_layers() ``` diff --git a/src/concrete/ml/torch/lora.py b/src/concrete/ml/torch/lora.py index b7e72021d..c6ce2b700 100644 --- a/src/concrete/ml/torch/lora.py +++ b/src/concrete/ml/torch/lora.py @@ -65,6 +65,17 @@ class LoraTraining(torch.nn.Module): def __init__(self, model, n_layers_to_skip_for_backprop=1, loss_fn=None): super().__init__() + # Check if model accepts labels when no loss_fn is provided + if loss_fn is None: + from inspect import signature + + forward_sig = signature(model.forward) + if "labels" not in forward_sig.parameters: + raise ValueError( + "When no loss_fn is provided, the model's forward method" + "must accept a 'labels' parameter" + ) + # Assert that the model contains LoRA layers self.assert_has_lora_layers(model) @@ -239,7 +250,7 @@ class LoraTrainer: def __init__( self, model, - optimizer=None, + optimizer, loss_fn=None, lr_scheduler=None, training_args=None, @@ -251,7 +262,7 @@ def __init__( self.gradient_accumulation_steps = self.training_args.get("gradient_accumulation_steps", 1) self.max_grad_norm = self.training_args.get("max_grad_norm", None) - # Create the LoRA training module + # Create the LoraTraining module self.lora_training_module = LoraTraining( model, n_layers_to_skip_for_backprop=n_layers_to_skip_for_backprop, loss_fn=loss_fn ) diff --git a/tests/torch/test_lora.py b/tests/torch/test_lora.py index d9bee88e5..4f4da5f23 100644 --- a/tests/torch/test_lora.py +++ b/tests/torch/test_lora.py @@ -29,9 +29,8 @@ def __init__(self): self.linear1 = nn.Linear(10, 20) 
self.linear2 = nn.Linear(20, 10) - def forward(self, x, **kwargs): + def forward(self, x, labels=None): """Forward pass.""" - labels = kwargs.get("labels", None) logits = self.linear2(torch.relu(self.linear1(x))) if labels is not None: loss = nn.functional.mse_loss(logits, labels) @@ -180,9 +179,8 @@ def __init__(self): self.linear1 = nn.Linear(10, 20) self.linear2 = nn.Linear(20, 10) - def forward(self, x, **kwargs): + def forward(self, x, labels=None): """Forward pass.""" - labels = kwargs.get("labels", None) logits = self.linear2(torch.relu(self.linear1(x))) class OutputObject: @@ -295,7 +293,8 @@ def test_lora_trainer_train_with_lr_scheduler(): def test_lora_trainer_save_and_clear_private_info(): """Test LoraTrainer save_and_clear_private_info.""" model = DummyLoRAModel() - lora_trainer = LoraTrainer(model) + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + lora_trainer = LoraTrainer(model, optimizer=optimizer, loss_fn=nn.MSELoss()) lora_trainer.hybrid_model.save_and_clear_private_info = MagicMock() lora_trainer.save_and_clear_private_info("path/to/model") lora_trainer.hybrid_model.save_and_clear_private_info.assert_called_once_with("path/to/model") @@ -578,3 +577,25 @@ def forward(self, x): remote_names = get_remote_names(model) assert "lora_linear" not in remote_names assert "linear" in remote_names + + +def test_lora_training_init_validates_model_signature(): + """Test LoraTraining initialization validates model's forward signature.""" + + class ModelWithoutLabels(nn.Module): + """Model without labels parameter in forward.""" + + def __init__(self): + super().__init__() + self.lora_a = nn.Parameter(torch.randn(10, 10)) + self.linear = nn.Linear(10, 10) + + def forward(self, x): # No labels parameter + """Forward pass without labels parameter.""" + return {"logits": self.linear(x)} + + model = ModelWithoutLabels() + + with pytest.raises(ValueError) as exc_info: + LoraTraining(model, loss_fn=None) # No loss_fn provided + assert "must accept a 
'labels' parameter" in str(exc_info.value) From c85868c28a25e6243d2187a5af3c4dfed02a49f6 Mon Sep 17 00:00:00 2001 From: jfrery Date: Wed, 11 Dec 2024 08:56:04 +0100 Subject: [PATCH 05/11] chore: fix forbidden words --- docs/deep-learning/lora_training.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deep-learning/lora_training.md b/docs/deep-learning/lora_training.md index fa48c8f83..1ee27b3c0 100644 --- a/docs/deep-learning/lora_training.md +++ b/docs/deep-learning/lora_training.md @@ -67,7 +67,7 @@ lora_config = LoraConfig( peft_model = get_peft_model(model, lora_config) -# Generate a second dataset for demonstration purposes +# Generate a second data-set for demonstration purposes X_task2, y_task2 = make_circles(n_samples=32, noise=0.2, factor=0.5) train_loader_task2 = DataLoader( TensorDataset(torch.Tensor(X_task2), torch.LongTensor(y_task2)), From 333c46dd4a24a7ca1872a5ad44e3b6e367c5ab0b Mon Sep 17 00:00:00 2001 From: jfrery Date: Mon, 16 Dec 2024 15:20:50 +0100 Subject: [PATCH 06/11] chore: review --- docs/advanced_examples/LoraMLP.ipynb | 23 +++++-------------- docs/deep-learning/lora_training.md | 18 +++++++-------- script/doc_utils/check_forbidden_words.py | 2 ++ src/concrete/ml/torch/lora.py | 12 +++++++--- .../lora_finetuning/GPT2FineTuneHybrid.ipynb | 2 +- .../lora_finetuning/LLamaFineTuning.ipynb | 2 +- 6 files changed, 28 insertions(+), 31 deletions(-) diff --git a/docs/advanced_examples/LoraMLP.ipynb b/docs/advanced_examples/LoraMLP.ipynb index 7a7015614..529603215 100644 --- a/docs/advanced_examples/LoraMLP.ipynb +++ b/docs/advanced_examples/LoraMLP.ipynb @@ -8,27 +8,16 @@ "\n", "This notebook demonstrates encrypted fine-tuning of a small MLP model with LoRA. A model trained on an initial dataset is adapted to a second dataset using LoRA fine-tuning.\n", "\n", - "The fine-tuning dataset and the LoRA weights that are trained are protected using encryption. 
Thus, the training can be outsourced to a remote server without leaking any sensitive data.\n", + "The fine-tuning dataset and the trained LoRA weights are protected using encryption. Thus, training can be securely outsourced to a remote server without compromising any sensitive data.\n", "\n", - "The hybrid model approach is applied to fine-tuning: only the linear layers of the original model are outsourced to the server. The forward and backward passes on these original weights are performed with encrypted activations and gradients. The LoRA weights are kept by the client, and the client performs the forward and backward passes on the LoRA weights." + "The hybrid model approach is applied to fine-tuning: only the linear layers of the original model are outsourced to the server. The forward and backward passes on these layers are performed using encrypted activations and gradients. Meanwhile, the LoRA weights are kept by the client, which locally performs the forward and backward passes on the LoRA weights." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import shutil\n", "from pathlib import Path\n", @@ -46,7 +35,7 @@ "# Set random seed for reproducibility\n", "SEED = 42\n", "np.random.seed(SEED)\n", - "torch.manual_seed(SEED)" + "torch.manual_seed(SEED);" ] }, { @@ -55,7 +44,7 @@ "source": [ "## Data preparation\n", "\n", - "Two datasets are generated: one for the original training, and a second one on which LORA fine-tuning is performed." + "Two datasets are generated: one for the original training, and a second one on which LoRA fine-tuning is performed."
] }, { @@ -75,7 +64,7 @@ } ], "source": [ - "# Task 1: Two interleaving half circles (make_moons)\n", + "# Task 1: Two interleaving half circles\n", "X_task1, y_task1 = make_moons(n_samples=500, noise=0.1)\n", "# Task 2: Two concentric circles\n", "X_task2, y_task2 = make_circles(n_samples=500, noise=0.2, factor=0.5)\n", diff --git a/docs/deep-learning/lora_training.md b/docs/deep-learning/lora_training.md index 1ee27b3c0..4c4ec053f 100644 --- a/docs/deep-learning/lora_training.md +++ b/docs/deep-learning/lora_training.md @@ -1,6 +1,6 @@ # Encrypted fine-tuning -This document explains how to fine-tune neural-network models and large language-models(LLMs) on private data. +This document explains how to fine-tune neural-network models and large language-models (LLMs) on private data. Small models can be fine-tuned using a single-client/single-server setup. For larger models (such as GPT-2 and above), consider using distributed computation across multiple worker nodes to perform training on encrypted data for optimal latency. @@ -14,7 +14,7 @@ Concrete ML supports LoRA, a parameter-efficient fine-tuning (PEFT) approach, in In this setup, Concrete ML outsources the computationally intensive parts of forward and backward passes for large models to one or more remote servers. The training client machine only handles the LoRA-adapter forward/backward passes, loss computation, and adapter weight updates. Since the LoRA adapters are small, this additional computation on the client side is minimal. For large LLMs, over 99% of the model's weights can remain outsourced. -The main benefit of hybrid-model LORA training is outsourcing the computation of linear layers, which are typically large in LLMs. These layers require substantial hardware for inference and gradient computation. By securely outsourcing this work, Concrete ML removes the memory bottleneck that previously limited such operations. 
+The main benefit of hybrid-model LoRA training is outsourcing the computation of linear layers, which are typically large in LLMs. These layers require substantial hardware for inference and gradient computation. By securely outsourcing this work, Concrete ML removes the memory bottleneck that previously limited such operations. ## Usage @@ -22,7 +22,7 @@ Concrete ML integrates with the [`peft` package](https://huggingface.co/docs/pef ### 1. Apply the `peft` LoRA layers -The `LoraConfig` class from the `peft` package contains the various LORA parameters. You can specify which layers have LORA adapters through the `target_modules` argument. +The `LoraConfig` class from the `peft` package contains the various LoRA parameters. You can specify which layers have LoRA adapters through the `target_modules` argument. For a detailed reference of the various configuration options, refer to the [`LoraConfig`](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraConfig) documentation. @@ -76,7 +76,7 @@ train_loader_task2 = DataLoader( ) ``` -### 2. Convert the LORA model to use custom Concrete ML layers +### 2. Convert the LoRA model to use custom Concrete ML layers Next, we need to integrate the LoRA-adapted `peft_model` into the Concrete ML hybrid FHE training framework. This is done using the `LoraTrainer` class, which handles the logic of encrypting outsourced computations, running the forward and backward passes, and updating the LoRA adapter weights. @@ -104,7 +104,7 @@ lora_trainer = LoraTrainer( ) ``` -### 3. Compile a hybrid FHE model for the LORA adapted PyTorch model +### 3. Compile a hybrid FHE model for the LoRA adapted PyTorch model Before training in FHE, we need to compile the model. Compilation calibrates and converts the outsourced linear layers to their FHE equivalents. The compile method uses representative data for this step. 
@@ -138,7 +138,7 @@ lora_trainer.train(train_loader_task2, fhe="execute") ### Inference -Once fine-tuned, the LORA hybrid FHE model can perform inference only, through the +Once fine-tuned, the LoRA hybrid FHE model can perform inference only, through the `peft_model` attribute of the hybrid FHE model. @@ -147,9 +147,9 @@ Once fine-tuned, the LORA hybrid FHE model can perform inference only, through t peft_model(x) ``` -### Toggle LORA layers +### Toggle LoRA layers -To compare to the original model, you can disable the LORA weights to use the original model for inference. +To compare to the original model, you can disable the LoRA weights to use the original model for inference. @@ -157,6 +157,6 @@ To compare to the original model, you can disable the LORA weights to use the or peft_model.disable_adapter_layers() peft_model(x) -# Re-enable the LORA weights +# Re-enable the LoRA weights peft_model.enable_adapter_layers() ``` diff --git a/script/doc_utils/check_forbidden_words.py b/script/doc_utils/check_forbidden_words.py index 11e7cc5b6..a9a9c16a7 100644 --- a/script/doc_utils/check_forbidden_words.py +++ b/script/doc_utils/check_forbidden_words.py @@ -190,6 +190,8 @@ def process_file(file_str: str, do_open_problematic_files=False): ("eg", [], []), # use e.g., ("eg,", [], []), # use e.g., ("eg., ", [], []), # use e.g., + ("Lora", [], []), # use LoRA + ("LORA", [], []), # use LoRA ] # For later # "We" or "Our", or more generally, passive form diff --git a/src/concrete/ml/torch/lora.py b/src/concrete/ml/torch/lora.py index c6ce2b700..30eec4ef0 100644 --- a/src/concrete/ml/torch/lora.py +++ b/src/concrete/ml/torch/lora.py @@ -87,10 +87,16 @@ def __init__(self, model, n_layers_to_skip_for_backprop=1, loss_fn=None): self.loss_scaling_factor = 1.0 def set_loss_scaling_factor(self, loss_scaling_factor: float): - """Set the loss scaling factor for gradient accumulation. + """Set a scaling factor for the loss to account for gradient accumulation. 
+ + This ensures that gradients are correctly averaged over multiple + mini-batches when performing gradient accumulation, preventing them + from being scaled up by the number of accumulation steps. Args: - loss_scaling_factor (float): The factor to scale the loss by. + loss_scaling_factor (float): The number of gradient accumulation steps. + The loss will be divided by this factor + before backpropagation. """ self.loss_scaling_factor = loss_scaling_factor @@ -130,7 +136,7 @@ def replace_layers_with_custom(model: nn.Module, n_layers_to_skip_for_backprop: model (nn.Module): The model to replace layers in. n_layers_to_skip_for_backprop (int): Number of initial linear layers to keep as standard layers. Since the first layer doesn't need backpropagation (no previous layer to - update), we typically skip 1 layer. Defaults to 1. + update), we typically skip 1 layer. """ def _replace(module: nn.Module): diff --git a/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb b/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb index 208e5e79b..d102766fa 100644 --- a/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb +++ b/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb @@ -877,7 +877,7 @@ "id": "65d448c8", "metadata": {}, "source": [ - "Note that our goal is to showcase the use of FHE for fine-tuning a model. The a dataset used contains 68 examples for a total of 2386 tokens. This a very small dataset which does not allow the model to learn a lot of information and output very interesting results." + "Note that our goal is to showcase the use of FHE for encrypted fine-tuning. The dataset consists of 68 examples and a total of 2,386 tokens, which is relatively small. Despite its limited size, which offers little support for the model's learning process, it still manages to produce interesting results." 
] }, { diff --git a/use_case_examples/lora_finetuning/LLamaFineTuning.ipynb b/use_case_examples/lora_finetuning/LLamaFineTuning.ipynb index b6575886f..ab29c525f 100644 --- a/use_case_examples/lora_finetuning/LLamaFineTuning.ipynb +++ b/use_case_examples/lora_finetuning/LLamaFineTuning.ipynb @@ -6,7 +6,7 @@ "source": [ "# Fine-Tuning GPT-2 with LoRA and FHE using `LoraTrainer`\n", "\n", - "This notebook demonstrates how to fine-tune a GPT-2 model using LoRA (Low-Rank Adaptation) with Fully Homomorphic Encryption (FHE). We leverage the `LoraTrainer` API from the `concrete.ml.torch.lora` library to simplify the process.\n" + "This notebook demonstrates how to fine-tune a Llama-3.2-1B model using LoRA (Low-Rank Adaptation) with Fully Homomorphic Encryption (FHE). We leverage the `LoraTrainer` API from the `concrete.ml.torch.lora` library to simplify the process.\n" ] }, { From 4922992b8c29c1019076844206dbe5b5aa946eca Mon Sep 17 00:00:00 2001 From: jfrery Date: Tue, 17 Dec 2024 08:52:48 +0100 Subject: [PATCH 07/11] chore: Concrete-ML -> Concrete ML --- .github/workflows/ci_timing.yaml | 2 +- .github/workflows/release.yaml | 4 +-- .../DecisionTreeRegressor.ipynb | 18 +++++----- docs/advanced_examples/LinearSVR.ipynb | 4 +-- .../RegressorComparison.ipynb | 8 ++--- docs/advanced_examples/SVMClassifier.ipynb | 36 +++++++++---------- docs/advanced_examples/aggregated_code.txt | 14 ++++---- docs/conventions.md | 2 +- ...oncrete.ml.common.serialization.encoder.md | 2 +- .../ml/common/serialization/encoder.py | 2 +- src/concrete/ml/sklearn/glm.py | 8 ++--- src/concrete/ml/sklearn/neighbors.py | 2 +- src/concrete/ml/sklearn/qnn.py | 8 ++--- src/concrete/ml/sklearn/rf.py | 8 ++--- src/concrete/ml/sklearn/svm.py | 8 ++--- src/concrete/ml/sklearn/tree.py | 8 ++--- src/concrete/ml/sklearn/xgb.py | 8 ++--- .../lora_finetuning/GPT2FineTuneHybrid.ipynb | 4 +-- 18 files changed, 73 insertions(+), 73 deletions(-) diff --git a/.github/workflows/ci_timing.yaml 
b/.github/workflows/ci_timing.yaml index 0e20cbf70..a48a6dad7 100644 --- a/.github/workflows/ci_timing.yaml +++ b/.github/workflows/ci_timing.yaml @@ -1,4 +1,4 @@ -# This workflow uses GitHub CLI to get timings of last 50 runs of Concrete-ML main CI +# This workflow uses GitHub CLI to get timings of last 50 runs of Concrete ML main CI # and send it to slack and add it as an artifact on the workflow name: CML build time on: diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 90c3b7c07..c96c543ea 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -288,7 +288,7 @@ jobs: tags: true # This action creates docker and pypi images directly on the AWS EC2 instance - # The 'PRIVATE_RELEASE_IMAGE_BASE' variable is kept here in case Concrete-ML starts to publish + # The 'PRIVATE_RELEASE_IMAGE_BASE' variable is kept here in case Concrete ML starts to publish # private nightly releases one day. Currently, release candidates and actual releases are all # done through the 'PUBLIC_RELEASE_IMAGE_BASE' image. 
The private image is also used to list all # tags easily @@ -471,7 +471,7 @@ jobs: echo "" >> "${SECRETS_FILE}" echo "SECRETS_FILE=${SECRETS_FILE}" >> "$GITHUB_ENV" - - name: Build Docker Concrete-ML Image + - name: Build Docker Concrete ML Image if: ${{ success() && !cancelled() }} uses: docker/build-push-action@48aba3b46d1b1fec4febb7c5d0c644b249a11355 with: diff --git a/docs/advanced_examples/DecisionTreeRegressor.ipynb b/docs/advanced_examples/DecisionTreeRegressor.ipynb index 82aa6dd3c..29b1371e1 100644 --- a/docs/advanced_examples/DecisionTreeRegressor.ipynb +++ b/docs/advanced_examples/DecisionTreeRegressor.ipynb @@ -6,9 +6,9 @@ "id": "5755bc04", "metadata": {}, "source": [ - "# Decision Tree Regression Using Concrete-ML\n", + "# Decision Tree Regression Using Concrete ML\n", "\n", - "In this tutorial, we show how to create, train and evaluate a decision tree regression model using Concrete-ML library.\n", + "In this tutorial, we show how to create, train and evaluate a decision tree regression model using Concrete ML library.\n", "\n" ] }, @@ -18,16 +18,16 @@ "id": "2c256087-c16a-4249-9c90-3f4863938385", "metadata": {}, "source": [ - "### Introducing Concrete-ML\n", + "### Introducing Concrete ML\n", "\n", - "> Concrete-ML is an open-source, privacy-preserving, machine learning inference framework based on fully homomorphic encryption (FHE).\n", + "> Concrete ML is an open-source, privacy-preserving, machine learning inference framework based on fully homomorphic encryption (FHE).\n", "> It enables data scientists without any prior knowledge of cryptography to automatically turn machine learning models into their FHE equivalent,using familiar APIs from Scikit-learn and PyTorch.\n", "> — [Zama documentation](../README.md)\n", "\n", "This tutorial does not require a deep understanding of the technology behind concrete-ML.\n", "Nonetheless, newcomers might be interested in reading introductory sections of the official documentation such as:\n", "\n", - "- [What 
is Concrete-ML](../README.md)\n", + "- [What is Concrete ML](../README.md)\n", "- [Key Concepts](../getting-started/concepts.md)\n", "\n", "In the tutorial, we will be using the following terminology:\n", @@ -233,10 +233,10 @@ "source": [ "## Training A Decision Tree\n", "\n", - "ConcreteDecisionTreeRegressor is the Concrete-ML equivalent of scikit-learn's DecisionTreeRegressor.\n", + "ConcreteDecisionTreeRegressor is the Concrete ML equivalent of scikit-learn's DecisionTreeRegressor.\n", "It supports the same parameters and a similar interface, with the extra capability of predicting directly on ciphertext without the need to decipher it, thus preservacy privacy.\n", "\n", - "Currently, Concrete-ML models must be trained on plaintext. To see how it works, we train a DecisionTreeRegressor with default parameters and estimate its accuracy on test data. Note here that predictions are done on plaintext too, but soon, we will predict on ciphertext." + "Currently, Concrete ML models must be trained on plaintext. To see how it works, we train a DecisionTreeRegressor with default parameters and estimate its accuracy on test data. Note here that predictions are done on plaintext too, but soon, we will predict on ciphertext." ] }, { @@ -479,7 +479,7 @@ "source": [ "## Predicting on Ciphertext\n", "If the predictions are similar although slightly less accurate, the real advantage of ConcreteML is privacy.\n", - "We now show how we can perform prediction on ciphertext with Concrete-ML, so that the model does not need to decipher the data at all to compute its estimate." + "We now show how we can perform prediction on ciphertext with Concrete ML, so that the model does not need to decipher the data at all to compute its estimate." ] }, { @@ -798,7 +798,7 @@ "Once the model is carefully trained and quantized, it is ready to be deployed and used in production. 
Here are some useful links on the subject:\n", " \n", " - [Inference in the Cloud](../getting-started/cloud.md) summarize the steps for cloud deployment\n", - " - [Production Deployment](../guides/client_server.md) offers a high-level view of how to deploy a Concrete-ML model in a client/server setting.\n", + " - [Production Deployment](../guides/client_server.md) offers a high-level view of how to deploy a Concrete ML model in a client/server setting.\n", " - [Client Server in Concrete ML](./ClientServer.ipynb) provides a more hands-on approach as another tutorial." ] } diff --git a/docs/advanced_examples/LinearSVR.ipynb b/docs/advanced_examples/LinearSVR.ipynb index 7be591052..00b91e8f3 100644 --- a/docs/advanced_examples/LinearSVR.ipynb +++ b/docs/advanced_examples/LinearSVR.ipynb @@ -88,7 +88,7 @@ "\n", "\n", "def get_concrete_plot_config(mse_score=None):\n", - " label = \"Concrete-ML\"\n", + " label = \"Concrete ML\"\n", " if mse_score is not None:\n", " label += f\", {'$MSE$'}={mse_score:.4f}\"\n", " return {\"c\": \"orange\", \"linewidth\": 2.5, \"label\": label}" @@ -646,7 +646,7 @@ "y_pred_sklearn = sklearn_rgs.predict(X_test)\n", "print(f\"Execution time: {(time.time() - time_begin) / len(X_test):.4f} seconds per sample\")\n", "\n", - "# Now predict using clear quantized Concrete-ML model on testing set\n", + "# Now predict using clear quantized Concrete ML model on testing set\n", "time_begin = time.time()\n", "y_preds_quantized = concrete_rgs.predict(X_test)\n", "print(f\"Execution time: {(time.time() - time_begin) / len(X_test):.4f} seconds per sample\")" diff --git a/docs/advanced_examples/RegressorComparison.ipynb b/docs/advanced_examples/RegressorComparison.ipynb index b003b0d87..0d5cef665 100644 --- a/docs/advanced_examples/RegressorComparison.ipynb +++ b/docs/advanced_examples/RegressorComparison.ipynb @@ -210,7 +210,7 @@ " # Instantiate the model\n", " model = regressor()\n", "\n", - " # Train the model and retrieve both the Concrete-ML model and 
its equivalent one from\n", + " # Train the model and retrieve both the Concrete ML model and its equivalent one from\n", " # scikit-learn\n", " # If the model is a NeuralNetClassifier, instantiate a scikit-learn MLPClassifier\n", " # separately in order to be able to be able to compare the results with a float model\n", @@ -249,7 +249,7 @@ " time_end = time.time()\n", " print(f\"Key generation time: {time_end - time_begin:.2f} seconds\")\n", "\n", - " # Compute the predictions in FHE using the Concrete-ML model\n", + " # Compute the predictions in FHE using the Concrete ML model\n", " time_begin = time.time()\n", " concrete_y_pred = concrete_model.predict(X_poly_test[:1], fhe=\"execute\")\n", " time_end = time.time()\n", @@ -276,7 +276,7 @@ " bitwidth = circuit.graph.maximum_integer_bit_width()\n", "\n", " # Plot the predictions\n", - " ax.plot(X_test, concrete_y_pred, c=\"blue\", linewidth=2.5, label=\"Concrete-ML\")\n", + " ax.plot(X_test, concrete_y_pred, c=\"blue\", linewidth=2.5, label=\"Concrete ML\")\n", "\n", " # Plot the predictions\n", " ax.plot(X_test, sklearn_y_pred, c=\"red\", linewidth=2.5, label=\"scikit-learn\")\n", @@ -284,7 +284,7 @@ " ax.text(\n", " 0.5,\n", " 0.80,\n", - " f\"Concrete-ML R2: {concrete_score:.2f}\\n scikit-learn R2: {sklearn_score:.2f}\\n\",\n", + " f\"Concrete ML R2: {concrete_score:.2f}\\n scikit-learn R2: {sklearn_score:.2f}\\n\",\n", " transform=ax.transAxes,\n", " fontsize=12,\n", " va=\"top\",\n", diff --git a/docs/advanced_examples/SVMClassifier.ipynb b/docs/advanced_examples/SVMClassifier.ipynb index b68037489..c6bbfc02b 100644 --- a/docs/advanced_examples/SVMClassifier.ipynb +++ b/docs/advanced_examples/SVMClassifier.ipynb @@ -6,12 +6,12 @@ "id": "d07c3896", "metadata": {}, "source": [ - "# Support Vector Machine (SVM) classification using Concrete-ML\n", + "# Support Vector Machine (SVM) classification using Concrete ML\n", "\n", - " In this tutorial, we show how to create, train, and evaluate a Support Vector Machine 
(SVM) model using Concrete-ML library for a classification task.\n", + " In this tutorial, we show how to create, train, and evaluate a Support Vector Machine (SVM) model using Concrete ML library for a classification task.\n", "\n", "It is cut in 2 parts:\n", - "1. a quick setup of a LinearSVC model with Concrete-ML\n", + "1. a quick setup of a LinearSVC model with Concrete ML\n", "2. a more in-depth approach taking a closer look to the concrete-ml specifics\n" ] }, @@ -30,15 +30,15 @@ "id": "d3654d52", "metadata": {}, "source": [ - "### Concrete-ML and useful links\n", + "### Concrete ML and useful links\n", "\n", - "> Concrete-ML is an open-source, privacy-preserving, machine learning inference framework based on fully homomorphic encryption (FHE). It enables data scientists without any prior knowledge of cryptography to automatically turn machine learning models into their FHE equivalent, using familiar APIs from Scikit-learn and PyTorch.\n", + "> Concrete ML is an open-source, privacy-preserving, machine learning inference framework based on fully homomorphic encryption (FHE). It enables data scientists without any prior knowledge of cryptography to automatically turn machine learning models into their FHE equivalent, using familiar APIs from Scikit-learn and PyTorch.\n", "> \n", "> — [Zama documentation](../README.md)\n", "\n", - "This tutorial does not require any knowledge of Concrete-ML. Newcomers might nonetheless be interested in reading some of the introductory sections of the official documentation, such as:\n", + "This tutorial does not require any knowledge of Concrete ML. 
Newcomers might nonetheless be interested in reading some of the introductory sections of the official documentation, such as:\n", "\n", - "- [What is Concrete-ML](../README.md)\n", + "- [What is Concrete ML](../README.md)\n", "- [Key Concepts](../getting-started/concepts.md)\n", "\n", "### Support Vector Machine\n", @@ -46,7 +46,7 @@ "SVM is a machine learning algorithm for classification and regression. LinearSVC is an efficient implementation of SVM\n", "that works best when the data is linearly separable. In this tutorial, we use the [pulsar star dataset](https://www.kaggle.com/datasets/colearninglounge/predicting-pulsar-starintermediate) to determine whether a neutron star can be classified as a pulsar star.\n", "\n", - "Concrete-ML exposes a LinearSVC class which implements the\n", + "Concrete ML exposes a LinearSVC class which implements the\n", "[scikit-learn LinearSVC](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html) interface, so you should feel right at home.\n", "\n", "### Setup code\n", @@ -342,9 +342,9 @@ "id": "12e827d0", "metadata": {}, "source": [ - "## Part 1: Train a simple model with Concrete-ML\n", + "## Part 1: Train a simple model with Concrete ML\n", "\n", - "The following code quickly scaffolds a Concrete-ML LinearSVC code, which should sound familiar.\n" + "The following code quickly scaffolds a Concrete ML LinearSVC code, which should sound familiar.\n" ] }, { @@ -403,7 +403,7 @@ } ], "source": [ - "# Perform the same steps with the Concrete-ML LinearSVC implementation\n", + "# Perform the same steps with the Concrete ML LinearSVC implementation\n", "svm_concrete = ConcreteLinearSVC(max_iter=100, n_bits=8)\n", "svm_concrete.fit(X_train, y_train)\n", "# plot the boundary\n", @@ -468,15 +468,15 @@ "\n", "#### Simplicity of execution\n", "\n", - "For a high-level use-case, Concrete-ML offers a very similar interface to scikit-learn. 
The main difference is *a model needs to be compiled to allow execution in FHE*.\n", + "For a high-level use-case, Concrete ML offers a very similar interface to scikit-learn. The main difference is *a model needs to be compiled to allow execution in FHE*.\n", "\n", "#### Model Accuracy\n", "\n", - "Concrete-ML prediction accuracy can be slightly worse than a regular scikit-learn implementation. This is because of [quantization](../explanations/quantization.md): number precision needs to be fixed-size for the model to be evaluated in FHE. This can be alleviated down to where the accuracy difference is none or negligible (which is the case here with a 8 bit size).\n", + "Concrete ML prediction accuracy can be slightly worse than a regular scikit-learn implementation. This is because of [quantization](../explanations/quantization.md): number precision needs to be fixed-size for the model to be evaluated in FHE. This can be alleviated down to where the accuracy difference is none or negligible (which is the case here with a 8 bit size).\n", "\n", "#### Execution time\n", "\n", - "The execution speed can be slower in Concrete-ML, especially during compilation and FHE inference phases, because enabling FHE operations uses more resources than regular inference on plain data. However, the speed can be improved by decreasing the precision of the data and model's weights thanks to the n_bits parameter. But, depending on the project, there is a trade-off between a slower but more accurate model and a faster but less accurate model." + "The execution speed can be slower in Concrete ML, especially during compilation and FHE inference phases, because enabling FHE operations uses more resources than regular inference on plain data. However, the speed can be improved by decreasing the precision of the data and model's weights thanks to the n_bits parameter. 
But, depending on the project, there is a trade-off between a slower but more accurate model and a faster but less accurate model." ] }, { @@ -536,7 +536,7 @@ "\n", "### Step b: quantize the model\n", "\n", - "So far most of Concrete-ML specificities have conveniently been avoided for the sake of simplicity. The first Concrete-ML specific step of developping a model is to quantize it, which soberly means to turn the model into an integer equivalent.\n", + "So far most of Concrete ML specificities have conveniently been avoided for the sake of simplicity. The first Concrete ML specific step of developping a model is to quantize it, which soberly means to turn the model into an integer equivalent.\n", "\n", "Although it is strongly encouraged to read the [Zama introduction to quantization](../explanations/quantization.md), the key takeaway is **a model needs to be reduced to a *discrete*, smaller set in order for the encryption to happen**. Otherwise the data becomes too large to be manipulated in FHE. \n", "\n", @@ -764,7 +764,7 @@ "- the model itself\n", "- the hardware executing the model\n", "\n", - "Setting up a model in Concrete-ML requires some additional work compared to standard models. For instance, users must select the quantization bit-width for both the model's weight and input data, which can be complex and time-consuming while using real FHE inference. However, Concrete-ML provides an FHE simulation mode that allows users to identify optimal hyper-parameters with the best trade-off between latency and performance.\n", + "Setting up a model in Concrete ML requires some additional work compared to standard models. For instance, users must select the quantization bit-width for both the model's weight and input data, which can be complex and time-consuming while using real FHE inference. 
However, Concrete ML provides an FHE simulation mode that allows users to identify optimal hyper-parameters with the best trade-off between latency and performance.\n", "\n", "> Testing FHE models on very large data-sets can take a long time. Furthermore, not all models are compatible with FHE constraints out-of-the-box. Simulation using the FHE simulation allows you to execute a model that was quantized, to measure the accuracy it would have in FHE, but also to determine the modifications required to make it FHE compatible.\n", ">\n", @@ -849,13 +849,13 @@ "source": [ "## Conclusion\n", "\n", - "Setting up FHE with Concrete-ML on a LinearSVC model is very simple, in the regard that Concrete-ML provides an implementation of the [scikit-learn LinearSVC interface](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html). As a matter of fact, a working FHE model can be setup with just a few lines of code.\n", + "Setting up FHE with Concrete ML on a LinearSVC model is very simple, in the regard that Concrete ML provides an implementation of the [scikit-learn LinearSVC interface](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html). As a matter of fact, a working FHE model can be setup with just a few lines of code.\n", "\n", "Setting up a model with FHE benefits nonetheless from some additional work. For LinearSVC models, the main point is to select a relevant bit-size for [quantizing](../explanations/quantization.md) the model. Some additional tools can smooth up the development workflow, such as alleviating the [compilation](../explanations/compilation.md) time by making use of the [FHE simulation](../explanations/compilation.md#fhe-simulation) \n", "\n", "Once the model is carefully trained and quantized, it is ready to be deployed and used in production. 
Here are some useful links that cover this subject:\n", "- [Inference in the Cloud](../getting-started/cloud.md) summarize the steps for cloud deployment\n", - "- [Production Deployment](../guides/client_server.md) offers a high-level view of how to deploy a Concrete-ML model in a client/server setting.\n", + "- [Production Deployment](../guides/client_server.md) offers a high-level view of how to deploy a Concrete ML model in a client/server setting.\n", "- [Client Server in Concrete ML](ClientServer.ipynb) provides a more hands-on approach as another tutorial." ] } diff --git a/docs/advanced_examples/aggregated_code.txt b/docs/advanced_examples/aggregated_code.txt index b142c7788..c3501acad 100644 --- a/docs/advanced_examples/aggregated_code.txt +++ b/docs/advanced_examples/aggregated_code.txt @@ -1428,7 +1428,7 @@ svm_sklearn.fit(X_train, y_train) # plot the boundary plot_decision_boundary(svm_sklearn, X_test, y_test) -# Perform the same steps with the Concrete-ML LinearSVC implementation +# Perform the same steps with the Concrete ML LinearSVC implementation svm_concrete = ConcreteLinearSVC(max_iter=100, n_bits=8) svm_concrete.fit(X_train, y_train) # plot the boundary @@ -1544,7 +1544,7 @@ def get_sklearn_plot_config(mse_score=None): def get_concrete_plot_config(mse_score=None): - label = "Concrete-ML" + label = "Concrete ML" if mse_score is not None: label += f", {'$MSE$'}={mse_score:.4f}" return {"c": "orange", "linewidth": 2.5, "label": label} @@ -1671,7 +1671,7 @@ time_begin = time.time() y_pred_sklearn = sklearn_rgs.predict(X_test) print(f"Execution time: {(time.time() - time_begin) / len(X_test):.4f} seconds per sample") -# Now predict using clear quantized Concrete-ML model on testing set +# Now predict using clear quantized Concrete ML model on testing set time_begin = time.time() y_preds_quantized = concrete_rgs.predict(X_test) print(f"Execution time: {(time.time() - time_begin) / len(X_test):.4f} seconds per sample") @@ -4567,7 +4567,7 @@ def 
make_regressor_comparison(title, regressors, **kwargs): # Instantiate the model model = regressor() - # Train the model and retrieve both the Concrete-ML model and its equivalent one from + # Train the model and retrieve both the Concrete ML model and its equivalent one from # scikit-learn # If the model is a NeuralNetClassifier, instantiate a scikit-learn MLPClassifier # separately in order to be able to be able to compare the results with a float model @@ -4606,7 +4606,7 @@ def make_regressor_comparison(title, regressors, **kwargs): time_end = time.time() print(f"Key generation time: {time_end - time_begin:.2f} seconds") - # Compute the predictions in FHE using the Concrete-ML model + # Compute the predictions in FHE using the Concrete ML model time_begin = time.time() concrete_y_pred = concrete_model.predict(X_poly_test[:1], fhe="execute") time_end = time.time() @@ -4633,7 +4633,7 @@ def make_regressor_comparison(title, regressors, **kwargs): bitwidth = circuit.graph.maximum_integer_bit_width() # Plot the predictions - ax.plot(X_test, concrete_y_pred, c="blue", linewidth=2.5, label="Concrete-ML") + ax.plot(X_test, concrete_y_pred, c="blue", linewidth=2.5, label="Concrete ML") # Plot the predictions ax.plot(X_test, sklearn_y_pred, c="red", linewidth=2.5, label="scikit-learn") @@ -4641,7 +4641,7 @@ def make_regressor_comparison(title, regressors, **kwargs): ax.text( 0.5, 0.80, - f"Concrete-ML R2: {concrete_score:.2f}\n scikit-learn R2: {sklearn_score:.2f}\n", + f"Concrete ML R2: {concrete_score:.2f}\n scikit-learn R2: {sklearn_score:.2f}\n", transform=ax.transAxes, fontsize=12, va="top", diff --git a/docs/conventions.md b/docs/conventions.md index b230b8813..72b8fc801 100644 --- a/docs/conventions.md +++ b/docs/conventions.md @@ -23,7 +23,7 @@ Let's use following conventions for the docs. If a new convention needs to be de 1. google is a verb ("you can google" but not "you can Google") : but try to avoid this 1. 
Programs: - Jupyter - - Concrete ML (no Concrete-ML) + - Concrete ML (no Concrete-ML) - pytest except when title where it is capitalized - Python - torch (for the code) and PyTorch (for the product) diff --git a/docs/references/api/concrete.ml.common.serialization.encoder.md b/docs/references/api/concrete.ml.common.serialization.encoder.md index 031e6f15d..763fd11fe 100644 --- a/docs/references/api/concrete.ml.common.serialization.encoder.md +++ b/docs/references/api/concrete.ml.common.serialization.encoder.md @@ -45,7 +45,7 @@ Non-native types are serialized manually and dumped in a custom dict format that The name should be unique for each type, as it is used in the ConcreteDecoder class to detect the initial type and apply the proper load method to the serialized object. The serialized value is the value that was serialized manually in a native type. Additional arguments such as a numpy array's dtype are also properly serialized. If an object has an unexpected type or is not serializable, an error is thrown. -The ConcreteEncoder is only meant to encode Concrete-ML's built-in models and therefore only supports the necessary types. For example, torch.Tensor objects are not serializable using this encoder as built-in models only use numpy arrays. However, the list of supported types might expand in future releases if new models are added and need new types. +The ConcreteEncoder is only meant to encode Concrete ML's built-in models and therefore only supports the necessary types. For example, torch.Tensor objects are not serializable using this encoder as built-in models only use numpy arrays. However, the list of supported types might expand in future releases if new models are added and need new types.
______________________________________________________________________ diff --git a/src/concrete/ml/common/serialization/encoder.py b/src/concrete/ml/common/serialization/encoder.py index d18b14227..1e6dcbaf4 100644 --- a/src/concrete/ml/common/serialization/encoder.py +++ b/src/concrete/ml/common/serialization/encoder.py @@ -68,7 +68,7 @@ class ConcreteEncoder(JSONEncoder): as a numpy array's dtype are also properly serialized. If an object has an unexpected type or is not serializable, an error is thrown. - The ConcreteEncoder is only meant to encode Concrete-ML's built-in models and therefore only + The ConcreteEncoder is only meant to encode Concrete ML's built-in models and therefore only supports the necessary types. For example, torch.Tensor objects are not serializable using this encoder as built-in models only use numpy arrays. However, the list of supported types might expand in future releases if new models are added and need new types. diff --git a/src/concrete/ml/sklearn/glm.py b/src/concrete/ml/sklearn/glm.py index ecdbcf56a..4c7d0b6c8 100644 --- a/src/concrete/ml/sklearn/glm.py +++ b/src/concrete/ml/sklearn/glm.py @@ -83,7 +83,7 @@ def dump_dict(self) -> Dict: metadata: Dict[str, Any] = {} - # Concrete-ML + # Concrete ML metadata["n_bits"] = self.n_bits metadata["sklearn_model"] = self.sklearn_model metadata["_is_fitted"] = self._is_fitted @@ -113,7 +113,7 @@ def load_dict(cls, metadata: Dict): # Instantiate the model obj = cls(n_bits=metadata["n_bits"]) - # Concrete-ML + # Concrete ML obj.n_bits = metadata["n_bits"] obj.sklearn_model = metadata["sklearn_model"] obj.onnx_model_ = metadata["onnx_model_"] @@ -327,7 +327,7 @@ def dump_dict(self) -> Dict: metadata: Dict[str, Any] = {} - # Concrete-ML + # Concrete ML metadata["n_bits"] = self.n_bits metadata["sklearn_model"] = self.sklearn_model metadata["_is_fitted"] = self._is_fitted @@ -358,7 +358,7 @@ def load_dict(cls, metadata: Dict): # Instantiate the model obj = cls(n_bits=metadata["n_bits"]) - # 
Concrete-ML + # Concrete ML obj.sklearn_model = metadata["sklearn_model"] obj.onnx_model_ = metadata["onnx_model_"] obj._is_fitted = metadata["_is_fitted"] diff --git a/src/concrete/ml/sklearn/neighbors.py b/src/concrete/ml/sklearn/neighbors.py index 727529419..9737d624f 100644 --- a/src/concrete/ml/sklearn/neighbors.py +++ b/src/concrete/ml/sklearn/neighbors.py @@ -97,7 +97,7 @@ def load_dict(cls, metadata: Dict): # Instantiate the model obj = cls(n_bits=metadata["n_bits"]) - # Concrete-ML + # Concrete ML obj.sklearn_model = metadata["sklearn_model"] obj._is_fitted = metadata["_is_fitted"] obj._is_compiled = metadata["_is_compiled"] diff --git a/src/concrete/ml/sklearn/qnn.py b/src/concrete/ml/sklearn/qnn.py index 2981a66de..95c4914ad 100644 --- a/src/concrete/ml/sklearn/qnn.py +++ b/src/concrete/ml/sklearn/qnn.py @@ -228,7 +228,7 @@ def dump_dict(self) -> Dict[str, Any]: metadata["optimizer"] = optimizer.getvalue().hex() metadata["criterion"] = criterion.getvalue().hex() - # Concrete-ML + # Concrete ML metadata["_is_fitted"] = self._is_fitted metadata["_is_compiled"] = self._is_compiled metadata["input_quantizers"] = self.input_quantizers @@ -314,7 +314,7 @@ def load_dict(cls, metadata: Dict): module__n_layers=metadata["module__n_layers"], ) - # Concrete-ML + # Concrete ML obj._is_fitted = metadata["_is_fitted"] obj._is_compiled = metadata["_is_compiled"] obj.input_quantizers = metadata["input_quantizers"] @@ -540,7 +540,7 @@ def dump_dict(self) -> Dict[str, Any]: metadata["optimizer"] = optimizer.getvalue().hex() metadata["criterion"] = criterion.getvalue().hex() - # Concrete-ML + # Concrete ML metadata["_is_fitted"] = self._is_fitted metadata["_is_compiled"] = self._is_compiled metadata["input_quantizers"] = self.input_quantizers @@ -628,7 +628,7 @@ def load_dict(cls, metadata: Dict): classes=metadata["classes_"], ) - # Concrete-ML + # Concrete ML obj._is_fitted = metadata["_is_fitted"] obj._is_compiled = metadata["_is_compiled"] obj.input_quantizers = 
metadata["input_quantizers"] diff --git a/src/concrete/ml/sklearn/rf.py b/src/concrete/ml/sklearn/rf.py index a3181d12a..c0673b36b 100644 --- a/src/concrete/ml/sklearn/rf.py +++ b/src/concrete/ml/sklearn/rf.py @@ -77,7 +77,7 @@ def post_processing(self, y_preds: numpy.ndarray) -> numpy.ndarray: def dump_dict(self) -> Dict[str, Any]: metadata: Dict[str, Any] = {} - # Concrete-ML + # Concrete ML metadata["n_bits"] = self.n_bits metadata["sklearn_model"] = self.sklearn_model metadata["_is_fitted"] = self._is_fitted @@ -117,7 +117,7 @@ def load_dict(cls, metadata: Dict): # Instantiate the model obj = RandomForestClassifier(n_bits=metadata["n_bits"]) - # Concrete-ML + # Concrete ML obj.sklearn_model = metadata["sklearn_model"] obj._is_fitted = metadata["_is_fitted"] obj._is_compiled = metadata["_is_compiled"] @@ -219,7 +219,7 @@ def __init__( def dump_dict(self) -> Dict[str, Any]: metadata: Dict[str, Any] = {} - # Concrete-ML + # Concrete ML metadata["n_bits"] = self.n_bits metadata["sklearn_model"] = self.sklearn_model metadata["_is_fitted"] = self._is_fitted @@ -259,7 +259,7 @@ def load_dict(cls, metadata: Dict): # Instantiate the model obj = RandomForestRegressor(n_bits=metadata["n_bits"]) - # Concrete-ML + # Concrete ML obj.sklearn_model = metadata["sklearn_model"] obj._is_fitted = metadata["_is_fitted"] obj._is_compiled = metadata["_is_compiled"] diff --git a/src/concrete/ml/sklearn/svm.py b/src/concrete/ml/sklearn/svm.py index 509500b3f..093d7e141 100644 --- a/src/concrete/ml/sklearn/svm.py +++ b/src/concrete/ml/sklearn/svm.py @@ -61,7 +61,7 @@ def dump_dict(self) -> Dict[str, Any]: metadata: Dict[str, Any] = {} - # Concrete-ML + # Concrete ML metadata["n_bits"] = self.n_bits metadata["sklearn_model"] = self.sklearn_model metadata["_is_fitted"] = self._is_fitted @@ -94,7 +94,7 @@ def load_dict(cls, metadata: Dict): # Instantiate the model obj = LinearSVR() - # Concrete-ML + # Concrete ML obj.n_bits = metadata["n_bits"] obj.sklearn_model = metadata["sklearn_model"] 
obj._is_fitted = metadata["_is_fitted"] @@ -180,7 +180,7 @@ def dump_dict(self) -> Dict[str, Any]: metadata: Dict[str, Any] = {} - # Concrete-ML + # Concrete ML metadata["n_bits"] = self.n_bits metadata["sklearn_model"] = self.sklearn_model metadata["_is_fitted"] = self._is_fitted @@ -215,7 +215,7 @@ def load_dict(cls, metadata: Dict): # Instantiate the model obj = LinearSVC() - # Concrete-ML + # Concrete ML obj.n_bits = metadata["n_bits"] obj.sklearn_model = metadata["sklearn_model"] obj._is_fitted = metadata["_is_fitted"] diff --git a/src/concrete/ml/sklearn/tree.py b/src/concrete/ml/sklearn/tree.py index fba10ca3f..048bd6046 100644 --- a/src/concrete/ml/sklearn/tree.py +++ b/src/concrete/ml/sklearn/tree.py @@ -77,7 +77,7 @@ def post_processing(self, y_preds: numpy.ndarray) -> numpy.ndarray: def dump_dict(self) -> Dict[str, Any]: metadata: Dict[str, Any] = {} - # Concrete-ML + # Concrete ML metadata["n_bits"] = self.n_bits metadata["sklearn_model"] = self.sklearn_model metadata["_is_fitted"] = self._is_fitted @@ -112,7 +112,7 @@ def load_dict(cls, metadata: Dict): # Instantiate the model obj = cls(n_bits=metadata["n_bits"]) - # Concrete-ML + # Concrete ML obj.sklearn_model = metadata["sklearn_model"] obj._is_fitted = metadata["_is_fitted"] obj._is_compiled = metadata["_is_compiled"] @@ -208,7 +208,7 @@ def __getattr__(self, attr: str): def dump_dict(self) -> Dict[str, Any]: metadata: Dict[str, Any] = {} - # Concrete-ML + # Concrete ML metadata["n_bits"] = self.n_bits metadata["sklearn_model"] = self.sklearn_model metadata["_is_fitted"] = self._is_fitted @@ -242,7 +242,7 @@ def load_dict(cls, metadata: Dict): # Instantiate the model obj = cls(n_bits=metadata["n_bits"]) - # Concrete-ML + # Concrete ML obj.sklearn_model = metadata["sklearn_model"] obj._is_fitted = metadata["_is_fitted"] obj._fhe_ensembling = metadata["_fhe_ensembling"] diff --git a/src/concrete/ml/sklearn/xgb.py b/src/concrete/ml/sklearn/xgb.py index e0687da78..366a3ae58 100644 --- 
a/src/concrete/ml/sklearn/xgb.py +++ b/src/concrete/ml/sklearn/xgb.py @@ -137,7 +137,7 @@ def __init__( def dump_dict(self) -> Dict[str, Any]: metadata: Dict[str, Any] = {} - # Concrete-ML + # Concrete ML metadata["n_bits"] = self.n_bits metadata["sklearn_model"] = self.sklearn_model metadata["_is_fitted"] = self._is_fitted @@ -208,7 +208,7 @@ def load_dict(cls, metadata: Dict): # Instantiate the model obj = XGBClassifier(n_bits=metadata["n_bits"]) - # Concrete-ML + # Concrete ML obj.sklearn_model = metadata["sklearn_model"] obj._is_fitted = metadata["_is_fitted"] obj._is_compiled = metadata["_is_compiled"] @@ -417,7 +417,7 @@ def post_processing(self, y_preds: numpy.ndarray) -> numpy.ndarray: def dump_dict(self) -> Dict[str, Any]: metadata: Dict[str, Any] = {} - # Concrete-ML + # Concrete ML metadata["n_bits"] = self.n_bits metadata["sklearn_model"] = self.sklearn_model metadata["_is_fitted"] = self._is_fitted @@ -487,7 +487,7 @@ def load_dict(cls, metadata: Dict): # Instantiate the model obj = XGBRegressor(n_bits=metadata["n_bits"]) - # Concrete-ML + # Concrete ML obj.sklearn_model = metadata["sklearn_model"] obj._is_fitted = metadata["_is_fitted"] obj._is_compiled = metadata["_is_compiled"] diff --git a/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb b/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb index d102766fa..c229016a0 100644 --- a/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb +++ b/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb @@ -5,9 +5,9 @@ "id": "dfccd8e6", "metadata": {}, "source": [ - "# Fine-Tuning GPT-2 on Encrypted Data with LoRA and Concrete-ML\n", + "# Fine-Tuning GPT-2 on Encrypted Data with LoRA and Concrete ML\n", "\n", - "In this notebook, we perform fine-tuning of a GPT-2 model using LoRA and Concrete-ML." + "In this notebook, we perform fine-tuning of a GPT-2 model using LoRA and Concrete ML." 
] }, { From 825e4b66fb1bb2bfad4ff8e343f60acc31e1ad7e Mon Sep 17 00:00:00 2001 From: jfrery Date: Tue, 17 Dec 2024 09:05:08 +0100 Subject: [PATCH 08/11] chore: update readme with llama --- use_case_examples/lora_finetuning/README.md | 44 ++++++++++++++++----- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/use_case_examples/lora_finetuning/README.md b/use_case_examples/lora_finetuning/README.md index cf16d2176..36ae88b3e 100644 --- a/use_case_examples/lora_finetuning/README.md +++ b/use_case_examples/lora_finetuning/README.md @@ -1,16 +1,20 @@ -# Privacy Preserving GPT2 LoRA +# Privacy Preserving Language Models LoRA Fine-tuning -This project demonstrates how to fine-tune GPT-2 using Low-Rank Adaptation (LoRA) weights with Fully Homomorphic Encryption (FHE). The goal is to train a specialized model in a privacy-preserving manner, with minimal memory requirements. +This use case demonstrates how to fine-tune language models (GPT-2 and LLaMA) using Low-Rank Adaptation (LoRA) weights with Fully Homomorphic Encryption (FHE). The goal is to train specialized models in a privacy-preserving manner, with minimal memory requirements. ## Overview -Fine-tuning large language models typically requires access to sensitive data, which can raise privacy concerns. By leveraging FHE, we can perform computations on encrypted data, ensuring that the data remains private throughout the training process. In this approach, the LoRA weights are only known to the user who owns the data and the memory hungry foundation model remains on the server. +Fine-tuning large language models typically requires access to sensitive data, which can raise privacy concerns. By leveraging FHE, we can perform computations on encrypted foundation model weights, ensuring that the data remain private throughout the training process. The LoRA weights are kept in clear on the client side. + ## Key Features -- **LoRA Fine-Tuning**: Fine-tune GPT-2 by adapting low-rank weights. 
-- **Hybrid Model**: Combine traditional and encrypted computations for optimal performance. -- **Low Memory Requirements**: Minimal client-side memory needed for LoRA weights. +- **LoRA Fine-Tuning**: Fine-tune language models by adapting low-rank weights +- **Hybrid Model**: Combine encrypted foundation model weights with clear LoRA weights for optimal performance +- **Low Memory Requirements**: Minimal client-side memory needed for LoRA weights +- **Multiple Approaches**: + - Custom training implementation for GPT-2 + - Simplified API-based approach for LLaMA using the `LoraTrainer` ## Setup @@ -26,9 +30,25 @@ pip install -r requirements.txt ## Usage -### Prepare the Dataset +### Available Notebooks + +The repository includes two example notebooks: + +1. **GPT2FineTuneHybrid.ipynb**: + - Uses a custom training implementation + - Fine-tunes GPT-2 on a small Q&A data-set about FHE + - Shows low-level control over the training process + +2. **LLamaFineTuning.ipynb**: + - Uses Concrete ML's `LoraTrainer` API for simplified implementation + - Fine-tunes LLaMA on Concrete ML code examples + - Shows how to use the high-level API for encrypted fine-tuning -Replace the data-set in the `data_finetune` directory to the one you want to use for fine-tuning. +### Prepare the data-set + +Each notebook includes its own data-set: +- GPT-2 uses a small Q&A data-set about FHE in `data_finetune/what_is_fhe.txt` +- LLaMA uses Concrete ML code examples in `data_finetune/dataset.jsonl` ### Run the Fine-Tuning Script @@ -47,14 +67,18 @@ In a deployment or production scenario, the model can be fine-tuned as follows: ## Results -The fine-tuned model can generate specialized text based on the provided data-set while ensuring data privacy through FHE.
+### GPT-2 Results After fine-tuning, the model's weights are distributed between the client and server as follows: - Total weights removed from the server: 68.24% - LoRA weights kept on the client: 147,456 (approximately 0.12% of the original model's weights) -Note that the embedding are not considered for now but contain a significant amount of weights (around 30%) for GPT2. They will be considered in a future version of Concrete ML. +Note that the embeddings are not considered for now but contain a significant amount of weights (around 30%) for GPT2. They will be considered in a future version of Concrete ML. + +### LLaMA Results + +TBD ## Conclusion From 8014ec5556c6ec777ae1edd05ef3badfc90a9cf9 Mon Sep 17 00:00:00 2001 From: jfrery Date: Tue, 17 Dec 2024 10:23:03 +0100 Subject: [PATCH 09/11] chore: re-word LoraMLP notebook --- docs/advanced_examples/LoraMLP.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/advanced_examples/LoraMLP.ipynb b/docs/advanced_examples/LoraMLP.ipynb index 529603215..8a5dfd169 100644 --- a/docs/advanced_examples/LoraMLP.ipynb +++ b/docs/advanced_examples/LoraMLP.ipynb @@ -10,7 +10,7 @@ "\n", "The fine-tuning dataset and the trained LoRA weights are protected using encryption. Thus, training can be securely outsourced to a remote server without compromising any sensitive data.\n", "\n", - "The hybrid model approach is applied to fine-tuning: only the linear layers of the original model are outsourced to the server. The forward and backward passes on these layers are performed using encrypted activations and gradients. Meanwhile, the LoRA weights are kept by the client, which performs locally the forward and backward passes on the LoRA weights." + "The hybrid approach is applied to fine-tuning: only the linear layers of the original model are outsourced to the server. The forward and backward passes on these layers are performed using encrypted activations and gradients. 
Meanwhile, the LoRA weights are kept by the client, which performs locally the forward and backward passes on the LoRA weights." ] }, { @@ -249,7 +249,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup FHE fine-tuning with LoraTraining and HybridFHEModel" + "## Setup FHE fine-tuning with LoraTrainer" ] }, { @@ -431,7 +431,7 @@ "\n", "lora_trainer.save_and_clear_private_info(path)\n", "\n", - "# At this point, the hybrid_model only contains the trainable parameters of the LoRA layers.\n", + "# At this point, the client's model only contains the trainable parameters of the LoRA layers.\n", "peft_model.print_trainable_parameters()" ] }, @@ -446,7 +446,7 @@ "**Key Takeaways:**\n", " \n", "- **Efficiency with LoRA:** While this example utilizes an MLP model with a relatively high proportion of LoRA weights due to its simplicity, the approach scales effectively to larger models like large language models (LLMs). In such cases, LoRA typically accounts for **less than one percent** of the total model parameters, ensuring minimal memory and computational overhead on the client side.\n", - "- **Scalability and Practicality:** The hybrid model approach demonstrated here is particularly beneficial for scenarios where client devices have limited resources. Memory heavy computations are offloaded to a secure server and the client handles only the lightweight LoRA adjustments locally." + "- **Scalability and Practicality:** The hybrid approach demonstrated here is particularly beneficial for scenarios where client devices have limited resources. Memory heavy computations are offloaded to a secure server and the client handles only the lightweight LoRA adjustments locally." 
] } ], From abfb76a49991ed87f5fffe38fe9b5544ace640cf Mon Sep 17 00:00:00 2001 From: jfrery Date: Thu, 19 Dec 2024 10:31:35 +0100 Subject: [PATCH 10/11] chore: fix gpt2 custom training + review - fix wrong unpacking of inputs in LoraTraining + add check - add optimizer step in gpt2 - typo in llama notebook - update version in requirements --- src/concrete/ml/torch/lora.py | 38 ++++++++---- tests/torch/test_lora.py | 58 ++++++++++++++++--- .../lora_finetuning/GPT2FineTuneHybrid.ipynb | 36 +++++------- .../lora_finetuning/LLamaFineTuning.ipynb | 2 +- use_case_examples/lora_finetuning/README.md | 12 ++-- .../lora_finetuning/requirements.txt | 10 ++-- 6 files changed, 106 insertions(+), 50 deletions(-) diff --git a/src/concrete/ml/torch/lora.py b/src/concrete/ml/torch/lora.py index 30eec4ef0..d80cdab0a 100644 --- a/src/concrete/ml/torch/lora.py +++ b/src/concrete/ml/torch/lora.py @@ -195,17 +195,29 @@ def forward(self, inputs: Tuple[Tensor, ...]) -> Tuple[Tensor, Union[Tensor, Non ValueError: If the model does not return a loss and no loss function is provided. """ assert ( - len(inputs) >= 2 + len(inputs) >= 2 and len(inputs) <= 3 ), "Expected at least two inputs in the tuple: inputs (x) and targets (y)" - # FIXME: - # Remove when hybrid model supports multiple inputs modules - # Unpack model inputs and labels - *model_inputs, y = inputs + # Unpack depending on how many inputs we have + if len(inputs) == 2: + input_ids, labels = inputs + attention_mask = None + else: + input_ids, labels, attention_mask = inputs + + # Validate attention mask + assert torch.all( + torch.logical_or(attention_mask == 0, attention_mask == 1) + ), "Invalid attention mask provided. Attention mask should only contain 0s and 1s." 
if self.loss_fn is None: # Pass inputs and labels to the model - outputs = self.inference_model(*model_inputs, labels=y) + if attention_mask is not None: + outputs = self.inference_model( + input_ids, labels=labels, attention_mask=attention_mask + ) + else: + outputs = self.inference_model(input_ids, labels=labels) # Check if outputs is a dict and retrieve the loss if isinstance(outputs, dict): @@ -219,10 +231,16 @@ def forward(self, inputs: Tuple[Tensor, ...]) -> Tuple[Tensor, Union[Tensor, Non ) else: # Forward pass without labels; compute loss manually - outputs = self.inference_model(*model_inputs) - if isinstance(outputs, dict) and "logits" in outputs: - outputs = outputs["logits"] - loss = self.loss_fn(outputs, y) + if attention_mask is not None: + logits = self.inference_model(input_ids, attention_mask=attention_mask) + else: + logits = self.inference_model(input_ids) + + # If logits is a dict with 'logits' key, extract it + if isinstance(logits, dict) and "logits" in logits: + logits = logits["logits"] + + loss = self.loss_fn(logits, labels) # Scale the loss for gradient accumulation scaled_loss = loss / self.loss_scaling_factor diff --git a/tests/torch/test_lora.py b/tests/torch/test_lora.py index 4f4da5f23..dfdff971b 100644 --- a/tests/torch/test_lora.py +++ b/tests/torch/test_lora.py @@ -431,31 +431,73 @@ def test_forward_backward_module(): assert grad_input.shape == x.shape +def test_lora_training_forward_with_loss_fn_and_attention_mask(): + """Test LoraTraining forward using a custom loss_fn and attention_mask.""" + + class ModelWithAttention(nn.Module): + """Model that supports attention_mask for testing.""" + + def __init__(self): + super().__init__() + self.lora_a = nn.Parameter(torch.randn(10, 10)) + self.linear = nn.Linear(10, 10) + + def forward(self, x, attention_mask=None): + """Forward pass.""" + if attention_mask is not None: + return {"logits": self.linear(x + attention_mask)} + return {"logits": self.linear(x)} + + # Define a simple loss 
function + def simple_loss_fn(logits, labels): + return nn.MSELoss()(logits, labels) + + model = ModelWithAttention() + + # Instantiate LoraTraining with a custom loss_fn + lora_training = LoraTraining(model, loss_fn=simple_loss_fn) + + x = torch.randn(5, 10) + y = torch.randn(5, 10) + attention_mask = torch.randn(5, 10) + + # Call forward with (input_ids, labels, attention_mask) + loss, _ = lora_training((x, y, attention_mask)) + assert isinstance(loss, torch.Tensor) + + def test_lora_training_forward_with_additional_inputs(): """Test LoraTraining forward with additional inputs.""" - class ModelWithAdditionalInputs(nn.Module): - """Model with additional inputs for testing.""" + class ModelWithAttention(nn.Module): + """Model with attention input for testing.""" def __init__(self): super().__init__() self.lora_a = nn.Parameter(torch.randn(10, 10)) self.linear = nn.Linear(10, 10) - def forward(self, x, extra_input, labels=None): - """Forward pass with additional inputs.""" - logits = self.linear(x + extra_input) + def forward(self, x, attention_mask=None, labels=None): + """Forward pass with an attention mask.""" + # Just treat the attention_mask as an extra input + # and add it to x before passing through linear. 
+ if attention_mask is not None: + logits = self.linear(x + attention_mask) + else: + logits = self.linear(x) + if labels is not None: loss = nn.functional.mse_loss(logits, labels) return {"loss": loss} return {"logits": logits} - model = ModelWithAdditionalInputs() + model = ModelWithAttention() lora_training = LoraTraining(model) x = torch.randn(5, 10) y = torch.randn(5, 10) - extra_input = torch.randn(5, 10) - loss, _ = lora_training((x, extra_input, y)) + attention_mask = torch.randn(5, 10) + + loss, _ = lora_training((x, y, attention_mask)) assert isinstance(loss, torch.Tensor) diff --git a/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb b/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb index c229016a0..51a99b21b 100644 --- a/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb +++ b/use_case_examples/lora_finetuning/GPT2FineTuneHybrid.ipynb @@ -309,7 +309,10 @@ "num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1)\n", "max_steps = math.ceil(training_args.num_train_epochs * num_update_steps_per_epoch)\n", "\n", - "trainer.create_optimizer_and_scheduler(num_training_steps=max_steps)" + "trainer.create_optimizer_and_scheduler(num_training_steps=max_steps)\n", + "\n", + "lr_scheduler = trainer.lr_scheduler\n", + "optimizer = trainer.optimizer" ] }, { @@ -381,27 +384,11 @@ "\n", " # Training loop\n", " peft_model.train()\n", - " lora_training.run_optimizer = True\n", " total_epochs = int(training_args.num_train_epochs)\n", " epoch_pbar = tqdm(total=total_epochs, desc=\"Training Progress\", position=0)\n", "\n", - " # Initialize optimizer and scheduler here instead\n", - " optimizer = torch.optim.AdamW(\n", - " hybrid_model.model.parameters(),\n", - " lr=training_args.learning_rate,\n", - " weight_decay=training_args.weight_decay,\n", - " )\n", - "\n", - " num_training_steps = total_epochs * len(train_dataloader)\n", - " lr_scheduler = torch.optim.lr_scheduler.LinearLR(\n", - " optimizer,\n", - " start_factor=1.0,\n", - " 
end_factor=0.0,\n", - " total_iters=num_training_steps,\n", - " )\n", - "\n", " total_batched_samples = 0\n", - " epoch_losses = [] # List to store the loss for each epoch\n", + " epoch_losses = []\n", "\n", " # Generate text before the first epoch\n", " print(\"Generating text before the first epoch:\\n\")\n", @@ -415,17 +402,24 @@ " grad_norms = []\n", "\n", " for _, batch in enumerate(train_dataloader):\n", - "\n", " total_batched_samples += 1\n", - "\n", " batch = {k: v.to(device) for k, v in batch.items()}\n", "\n", + " # Zero the gradients\n", + " optimizer.zero_grad()\n", + "\n", + " # Forward pass\n", " loss, grad_norm = hybrid_model(\n", " (batch[\"input_ids\"], batch[\"labels\"], batch[\"attention_mask\"]), fhe=fhe\n", " )\n", "\n", - " total_loss += loss.item()\n", + " # Optimizer step\n", + " optimizer.step()\n", "\n", + " # Learning rate scheduler step\n", + " lr_scheduler.step()\n", + "\n", + " total_loss += loss.item()\n", " if grad_norm is not None:\n", " grad_norms.append(grad_norm)\n", "\n", diff --git a/use_case_examples/lora_finetuning/LLamaFineTuning.ipynb b/use_case_examples/lora_finetuning/LLamaFineTuning.ipynb index ab29c525f..7ee8a6810 100644 --- a/use_case_examples/lora_finetuning/LLamaFineTuning.ipynb +++ b/use_case_examples/lora_finetuning/LLamaFineTuning.ipynb @@ -326,7 +326,7 @@ "outputs": [], "source": [ "# Save the fine-tuned model\n", - "save_path = Path(\"deployment/gpt2_lora_finetuned\")\n", + "save_path = Path(\"deployment/llama_lora_finetuned\")\n", "if save_path.is_dir() and any(save_path.iterdir()):\n", " shutil.rmtree(save_path)\n", "lora_trainer.save_and_clear_private_info(save_path)\n", diff --git a/use_case_examples/lora_finetuning/README.md b/use_case_examples/lora_finetuning/README.md index 36ae88b3e..a1513298f 100644 --- a/use_case_examples/lora_finetuning/README.md +++ b/use_case_examples/lora_finetuning/README.md @@ -6,13 +6,12 @@ This use case demonstrates how to fine-tune language models (GPT-2 and LLaMA) us 
Fine-tuning large language models typically requires access to sensitive data, which can raise privacy concerns. By leveraging FHE, we can perform computations on encrypted foundation model weights, ensuring that the data remain private throughout the training process. The LoRA weights are kept in clear on the client side. - ## Key Features - **LoRA Fine-Tuning**: Fine-tune language models by adapting low-rank weights - **Hybrid Model**: Combine encrypted foundation model weights with clear LoRA weights for optimal performance - **Low Memory Requirements**: Minimal client-side memory needed for LoRA weights -- **Multiple Approaches**: +- **Multiple Approaches**: - Custom training implementation for GPT-2 - Simplified API-based approach for LLaMA using the `LoraTrainer` @@ -34,12 +33,14 @@ pip install -r requirements.txt The repository includes two example notebooks: -1. **GPT2FineTuneHybrid.ipynb**: +1. **GPT2FineTuneHybrid.ipynb**: + - Uses a custom training implementation - Fine-tunes GPT-2 on a small Q&A data-set about FHE - Shows low-level control over the training process -2. **LLamaFineTuning.ipynb**: +1. 
**LLamaFineTuning.ipynb**: + - Uses Concrete ML's `LoraTrainer` API for simplified implementation - Fine-tunes LLaMA on Concrete ML code examples - Shows how to use the high-level API for encrypted fine-tuning @@ -47,6 +48,7 @@ The repository includes two example notebooks: ### Prepare the data-set Each notebook includes its own data-set: + - GPT-2 uses a small Q&A data-set about FHE in `data_finetune/what_is_fhe.txt` - LLaMA uses Concrete ML code examples in `data_finetune/data-set.jsonl` @@ -67,8 +69,8 @@ In a deployment or production scenario, the model can be fine-tuned as follows: ## Results - ### GPT-2 Results + After fine-tuning, the model's weights are distributed between the client and server as follows: - Total weights removed from the server: 68.24% diff --git a/use_case_examples/lora_finetuning/requirements.txt b/use_case_examples/lora_finetuning/requirements.txt index e99a87ffe..da6495fef 100644 --- a/use_case_examples/lora_finetuning/requirements.txt +++ b/use_case_examples/lora_finetuning/requirements.txt @@ -1,9 +1,9 @@ -e ../../. 
-transformers==4.41.2 -peft==0.11.1 +transformers==4.46.3 +peft==0.12.0 Jinja2==3.1.4 matplotlib==3.7.5 -datasets==3.0.1 +datasets==3.1.0 accelerate==1.2.0 -jupyter==1.0.0 -tqdm==4.66.5 \ No newline at end of file +jupyter==1.1.1 +tqdm==4.67.1 \ No newline at end of file From 960c99bfa339671bf4b274e7ee346c8550851cbe Mon Sep 17 00:00:00 2001 From: jfrery Date: Thu, 19 Dec 2024 12:06:41 +0100 Subject: [PATCH 11/11] chore: fix test for attention assertion --- tests/torch/test_lora.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/torch/test_lora.py b/tests/torch/test_lora.py index dfdff971b..03a38d929 100644 --- a/tests/torch/test_lora.py +++ b/tests/torch/test_lora.py @@ -459,7 +459,7 @@ def simple_loss_fn(logits, labels): x = torch.randn(5, 10) y = torch.randn(5, 10) - attention_mask = torch.randn(5, 10) + attention_mask = torch.randint(0, 2, (5, 10)) # Call forward with (input_ids, labels, attention_mask) loss, _ = lora_training((x, y, attention_mask)) @@ -495,7 +495,7 @@ def forward(self, x, attention_mask=None, labels=None): lora_training = LoraTraining(model) x = torch.randn(5, 10) y = torch.randn(5, 10) - attention_mask = torch.randn(5, 10) + attention_mask = torch.randint(0, 2, (5, 10)) loss, _ = lora_training((x, y, attention_mask)) assert isinstance(loss, torch.Tensor)